Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -541,6 +541,7 @@
     ScalarToTreeEntry.clear();
     MustGather.clear();
     ExternalUses.clear();
+    RemovedOperations.clear();
     NumOpsWantToKeepOrder.clear();
     NumOpsWantToKeepOriginalOrder = 0;
     for (auto &Iter : BlocksSchedules) {
@@ -600,6 +601,9 @@
   /// vectorizable. We do not vectorize such trees.
   bool isTreeTinyAndNotFullyVectorizable();

+  /// Reduce the cost of the tree to make it partially vectorizable, if possible.
+  bool reduceTreeCost(int Delta);
+
   OptimizationRemarkEmitter *getORE() { return ORE; }

 private:
@@ -700,6 +704,9 @@
     /// The TreeEntry index containing the user of this entry. We can actually
     /// have multiple users so the data structure is not truly a tree.
     SmallVector<int, 1> UserTreeIndices;
+
+    /// Cost of the tree entry.
+    int Cost;
   };

   /// Create a new VectorizableTree entry.
@@ -742,6 +749,9 @@
   /// Maps a specific scalar to its tree entry.
   SmallDenseMap<Value *, int> ScalarToTreeEntry;

+  /// Tree entries that should not be vectorized due to throttling.
+  SmallVector<int, 4> RemovedOperations;
+
   /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;

@@ -1170,6 +1180,9 @@
   /// Attaches the BlockScheduling structures to basic blocks.
   MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

+  /// Remove operations from the list of operations proposed to be scheduled.
+  void removeFromScheduling(BlockScheduling *BS, ArrayRef<int> Operations);
+
   /// Performs the "real" scheduling. Done before vectorization is actually
   /// performed in a basic block.
   void scheduleBlock(BlockScheduling *BS);
@@ -2413,6 +2426,70 @@
   }
 }

+bool BoUpSLP::reduceTreeCost(int Delta) {
+  SmallVector<int, 4> Tree;
+  bool Reduced = false;
+  int CostSum = 0;
+
+  // Walk the tree backwards, from the last entry towards the top. In that
+  // order the gathering nodes, if any, come right before the vectorizable
+  // operation that uses them, so the real cost of every operation can be
+  // found by summing the cost of its gathering node(s) with the cost of
+  // the operation node that follows them.
+  for (unsigned I = VectorizableTree.size(); I--;) {
+    TreeEntry *Entry = &VectorizableTree[I];
+    if (!Entry->NeedToGather) {
+      CostSum += Entry->Cost;
+      Tree.push_back(CostSum);
+      CostSum = 0;
+    } else {
+      CostSum += Entry->Cost;
+    }
+  }
+
+  // Estimate where to stop vectorizing.
+  CostSum = 0;
+  unsigned StopAt = 0;
+  for (unsigned I = 0, E = Tree.size(); I < E; I++) {
+    CostSum += Tree[I];
+    if (CostSum >= Delta) {
+      StopAt = I;
+      break;
+    }
+  }
+
+  // Cancel the unprofitable elements.
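+  // Everything from StopAt onwards is switched back to gathering: the entry's
+  // scalars are returned to MustGather, their external uses are dropped, and
+  // the entry index is recorded in RemovedOperations so that the bundle can
+  // later be removed from scheduling as well.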
+  if (StopAt > 0 && StopAt < (Tree.size() - 1)) {
+    LLVM_DEBUG(dbgs() << "SLP: Reduced the tree cost by " << Delta
+                      << " to make it partially vectorizable.\n");
+    Reduced = true;
+    for (unsigned I = 0, E = VectorizableTree.size(); I < E; I++) {
+      TreeEntry *Entry = &VectorizableTree[I];
+      if (!Entry->NeedToGather) {
+        if (I >= StopAt) {
+          Entry->NeedToGather = true;
+          RemovedOperations.push_back(I);
+          for (Value *V : Entry->Scalars) {
+            LLVM_DEBUG(dbgs() << "SLP: Remove scalar " << *V
+                              << " from the proposed vectorization.\n");
+            ScalarToTreeEntry.erase(V);
+            MustGather.insert(V);
+            ExternalUses.erase(std::remove_if(ExternalUses.begin(),
+                                              ExternalUses.end(),
+                                              [&](ExternalUser &EU) {
+                                                return EU.Scalar == V;
+                                              }),
+                               ExternalUses.end());
+          }
+        }
+      }
+    }
+  }
+
+  return Reduced;
+}
+
 bool BoUpSLP::isFullyVectorizableTinyTree() {
   LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                     << VectorizableTree.size() << " is fully vectorizable .\n");
@@ -2468,7 +2545,8 @@
   SmallPtrSet<Instruction *, 4> LiveValues;
   Instruction *PrevInst = nullptr;

-  for (const auto &N : VectorizableTree) {
+  for (auto &N : VectorizableTree) {
+    int EntryCost = 0;
     Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]);
     if (!Inst)
       continue;
@@ -2508,15 +2586,19 @@
            !isa<DbgInfoIntrinsic>(&*PrevInstIt)) &&
           &*PrevInstIt != PrevInst) {
         SmallVector<Type *, 4> V;
+        int ElementCost;
         for (auto *II : LiveValues)
           V.push_back(VectorType::get(II->getType(), BundleWidth));
-        Cost += TTI->getCostOfKeepingLiveOverCall(V);
+        ElementCost = TTI->getCostOfKeepingLiveOverCall(V);
+        Cost += ElementCost;
+        EntryCost += ElementCost;
       }
       ++PrevInstIt;
     }

     PrevInst = Inst;
+    N.Cost += EntryCost;
   }

   return Cost;
@@ -2552,6 +2634,7 @@
       continue;

     int C = getEntryCost(&TE);
+    TE.Cost = C;
     LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                       << " for bundle that starts with " << *TE.Scalars[0]
                       << ".\n");
@@ -3574,7 +3657,12 @@
 BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
   // All blocks must be scheduled before any instructions are inserted.
   for (auto &BSIter : BlocksSchedules) {
-    scheduleBlock(BSIter.second.get());
+    BlockScheduling *BS = BSIter.second.get();
+    // Remove the ScheduleData of all nodes for which we have changed the
+    // vectorization decision.
+    if (!RemovedOperations.empty())
+      removeFromScheduling(BS, RemovedOperations);
+    scheduleBlock(BS);
   }

   Builder.SetInsertPoint(&F->getEntryBlock().front());
@@ -3702,15 +3790,8 @@
     Type *Ty = Scalar->getType();
     if (!Ty->isVoidTy()) {
-#ifndef NDEBUG
-      for (User *U : Scalar->users()) {
-        LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
-
-        // It is legal to replace users in the ignorelist by undef.
-        assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
-               "Replacing out-of-tree value with undef");
-      }
-#endif
+      // The tree might not have been fully vectorized, so scalars may still
+      // have legitimate out-of-tree users; do not validate every user here.
       Value *Undef = UndefValue::get(Ty);
       Scalar->replaceAllUsesWith(Undef);
     }
@@ -4186,6 +4267,33 @@
   ReadyInsts.clear();
 }

+void BoUpSLP::removeFromScheduling(BlockScheduling *BS, ArrayRef<int> Operations) {
+  bool Removed = false;
+  for (int I : Operations) {
+    TreeEntry *Entry = &VectorizableTree[I];
+    ScheduleData *SD = BS->getScheduleData(Entry->Scalars[0]);
+    if (SD && SD->isPartOfBundle()) {
+      if (!Removed) {
+        Removed = true;
+        BS->resetSchedule();
+      }
+      BS->cancelScheduling(Entry->Scalars, SD->OpValue);
+    }
+  }
+  if (Removed) {
+    BS->resetSchedule();
+    BS->initialFillReadyList(BS->ReadyInsts);
+    for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
+         I = I->getNextNode()) {
+      if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end())
+        continue;
+      BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
+        SD->clearDependencies();
+      });
+    }
+  }
+}
+
 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
   if (!BS->ScheduleStart)
     return;
@@ -4682,7 +4790,16 @@
   const unsigned ChainLen = Chain.size();
   LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
                     << "\n");
-  const unsigned Sz = R.getVectorElementSize(Chain[0]);
+  Value *FirstStore = nullptr;
+  for (Value *V : Chain) {
+    assert(isa<StoreInst>(V) && "Expected only StoreInst here!");
+    if (StoreInst *SI = cast<StoreInst>(V))
+      if (SI->getValueOperand())
+        FirstStore = V;
+  }
+  if (!FirstStore)
+    return false;
+  const unsigned Sz = R.getVectorElementSize(FirstStore);
   const unsigned VF = VecRegSize / Sz;

   if (!isPowerOf2_32(Sz) || VF < 2)
@@ -4696,13 +4813,17 @@
   for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) {
     // Check that a previous iteration of this loop did not delete the Value.
-    if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
-      continue;
-
     LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
                       << "\n");
     ArrayRef<Value *> Operands = Chain.slice(i, VF);
+    // Skip the bundle if any of its stores has already been vectorized.
+    if (std::any_of(Operands.begin(), Operands.end(),
+                    [](Value *V) {
+                      return !cast<StoreInst>(V)->getValueOperand();
+                    }))
+      continue;
+
     R.buildTree(Operands);
     if (R.isTreeTinyAndNotFullyVectorizable())
       continue;
@@ -4729,6 +4850,16 @@
       // Move to the next bundle.
       i += VF - 1;
       Changed = true;
+    } else {
+      // Try to reduce the tree cost to make it partially vectorizable.
+      int Delta = Cost - SLPCostThreshold + 1;
+      if (R.getTreeSize() > 1) {
+        Changed = R.reduceTreeCost(Delta);
+        if (Changed) {
+          R.vectorizeTree();
+          i += VF - 1;
+        }
+      }
     }
   }

@@ -4802,7 +4933,6 @@
        Size /= 2) {
     if (vectorizeStoreChain(Operands, R, Size)) {
       // Mark the vectorized stores so that we don't vectorize them again.
-      VectorizedStores.insert(Operands.begin(), Operands.end());
       Changed = true;
       break;
     }
Index: test/Transforms/SLPVectorizer/AArch64/transpose.ll
===================================================================
--- test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -39,7 +39,6 @@
 ; CHECK-LABEL: @store_chain_v2i64(
 ; CHECK-NEXT:    [[A_1:%.*]] = getelementptr i64, i64* [[A:%.*]], i64 1
 ; CHECK-NEXT:    [[B_1:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 1
-; CHECK-NEXT:    [[C_1:%.*]] = getelementptr i64, i64* [[C:%.*]], i64 1
 ; CHECK-NEXT:    [[V0_0:%.*]] = load i64, i64* [[A]], align 8
 ; CHECK-NEXT:    [[V0_1:%.*]] = load i64, i64* [[A_1]], align 8
 ; CHECK-NEXT:    [[V1_0:%.*]] = load i64, i64* [[B]], align 8
@@ -50,8 +49,10 @@
 ; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]]
 ; CHECK-NEXT:    [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]]
 ; CHECK-NEXT:    [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]]
-; CHECK-NEXT:    store i64 [[TMP2_0]], i64* [[C]], align 8
-; CHECK-NEXT:    store i64 [[TMP2_1]], i64* [[C_1]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2_0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[TMP2_1]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP2]], <2 x i64>* [[TMP3]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %a.0 = getelementptr i64, i64* %a, i64 0
Index: test/Transforms/SLPVectorizer/NVPTX/v2f16.ll
===================================================================
--- test/Transforms/SLPVectorizer/NVPTX/v2f16.ll
+++ test/Transforms/SLPVectorizer/NVPTX/v2f16.ll
@@ -33,16 +33,17 @@
 ; NOVECTOR-NEXT:    [[TMP11:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP6]]
 ; NOVECTOR-NEXT:    [[TMP12:%.*]] = load half, half* [[TMP11]], align 8
 ; NOVECTOR-NEXT:    [[TMP13:%.*]] = fmul fast half [[TMP12]], 0xH5380
-; NOVECTOR-NEXT:    [[TMP14:%.*]] = fadd fast half [[TMP13]], 0xH57F0
 ; NOVECTOR-NEXT:    [[TMP15:%.*]] = bitcast i8* [[ARG:%.*]] to half*
 ; NOVECTOR-NEXT:    [[TMP16:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP6]]
-; NOVECTOR-NEXT:    store half [[TMP14]], half* [[TMP16]], align 8
 ; NOVECTOR-NEXT:    [[TMP17:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP7]]
 ; NOVECTOR-NEXT:    [[TMP18:%.*]] = load half, half* [[TMP17]], align 2
 ; NOVECTOR-NEXT:    [[TMP19:%.*]] = fmul fast half [[TMP18]], 0xH5380
-; NOVECTOR-NEXT:    [[TMP20:%.*]] = fadd fast half [[TMP19]], 0xH57F0
+; NOVECTOR-NEXT:    [[TMP1:%.*]] = insertelement <2 x half> undef, half [[TMP13]], i32 0
+; NOVECTOR-NEXT:    [[TMP2:%.*]] = insertelement <2 x half> [[TMP1]], half [[TMP19]], i32 1
+; NOVECTOR-NEXT:    [[TMP3:%.*]] = fadd fast <2 x half> <half 0xH57F0, half 0xH57F0>, [[TMP2]]
 ; NOVECTOR-NEXT:    [[TMP21:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP7]]
-; NOVECTOR-NEXT:    store half [[TMP20]], half* [[TMP21]], align 2
+; NOVECTOR-NEXT:    [[TMP4:%.*]] = bitcast half* [[TMP16]] to <2 x half>*
+; NOVECTOR-NEXT:    store <2 x half> [[TMP3]], <2 x half>* [[TMP4]], align 8
 ; NOVECTOR-NEXT:    ret void
 ;
   %tmp = shl nuw nsw i32 %arg2, 6
Index: test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
+++ test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
@@ -27,92 +27,148 @@
 define void @add_v8i64() {
 ; SSE-LABEL: @add_v8i64(
-; SSE-NEXT:    [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
-; SSE-NEXT:
[[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A0]], i64 [[B0]]) -; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A1]], i64 [[B1]]) -; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A2]], i64 [[B2]]) -; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A3]], i64 [[B3]]) -; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A4]], i64 [[B4]]) -; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A5]], i64 [[B5]]) -; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A6]], i64 [[B6]]) -; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A7]], i64 [[B7]]) -; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x 
i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> undef, i64 [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP11]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> undef, i64 [[TMP13]], i32 0 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; SSE-NEXT: [[TMP17:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP12]], <2 x i64> [[TMP16]]) +; SSE-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> undef, i64 [[TMP18]], i32 0 +; SSE-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[TMP20]], i32 1 +; SSE-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <2 x i64> undef, i64 [[TMP22]], i32 0 +; SSE-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <2 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; SSE-NEXT: [[TMP26:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP21]], <2 x i64> [[TMP25]]) +; SSE-NEXT: [[TMP27:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <2 x i64> undef, i64 [[TMP27]], i32 0 +; SSE-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP30:%.*]] = insertelement <2 x i64> [[TMP28]], i64 [[TMP29]], i32 1 +; SSE-NEXT: [[TMP31:%.*]] = extractelement <2 x i64> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <2 x i64> undef, i64 [[TMP31]], i32 0 +; SSE-NEXT: [[TMP33:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1 +; SSE-NEXT: [[TMP34:%.*]] = insertelement <2 x i64> [[TMP32]], i64 [[TMP33]], i32 1 +; SSE-NEXT: [[TMP35:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP30]], <2 x i64> [[TMP34]]) +; SSE-NEXT: [[TMP36:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP37:%.*]] = insertelement <2 x i64> undef, i64 [[TMP36]], i32 0 +; SSE-NEXT: [[TMP38:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP39:%.*]] = insertelement <2 x i64> [[TMP37]], i64 [[TMP38]], i32 1 +; SSE-NEXT: [[TMP40:%.*]] = extractelement <2 x i64> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP41:%.*]] = insertelement <2 x i64> undef, i64 [[TMP40]], i32 0 +; SSE-NEXT: [[TMP42:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1 +; SSE-NEXT: 
[[TMP43:%.*]] = insertelement <2 x i64> [[TMP41]], i64 [[TMP42]], i32 1 +; SSE-NEXT: [[TMP44:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP39]], <2 x i64> [[TMP43]]) +; SSE-NEXT: store <2 x i64> [[TMP17]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP26]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP35]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP44]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v8i64( -; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A0]], i64 [[B0]]) -; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A1]], i64 [[B1]]) -; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A2]], i64 [[B2]]) -; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A3]], i64 [[B3]]) -; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A4]], i64 [[B4]]) -; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A5]], i64 [[B5]]) -; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A6]], i64 [[B6]]) -; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A7]], i64 [[B7]]) -; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds 
([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; SLM-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> undef, i64 [[TMP9]], i32 0 +; SLM-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; SLM-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP11]], i32 1 +; SLM-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; SLM-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> undef, i64 [[TMP13]], i32 0 +; SLM-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; SLM-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; SLM-NEXT: [[TMP17:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP12]], <2 x i64> [[TMP16]]) +; SLM-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 +; SLM-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> undef, i64 [[TMP18]], i32 0 +; SLM-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; SLM-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[TMP20]], i32 1 +; SLM-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; SLM-NEXT: [[TMP23:%.*]] = insertelement <2 x i64> undef, i64 [[TMP22]], i32 0 +; SLM-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; SLM-NEXT: [[TMP25:%.*]] = insertelement <2 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; SLM-NEXT: [[TMP26:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP21]], <2 x i64> [[TMP25]]) +; SLM-NEXT: [[TMP27:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[TMP28:%.*]] = insertelement <2 x i64> undef, i64 [[TMP27]], i32 0 +; SLM-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[TMP30:%.*]] = insertelement <2 x i64> [[TMP28]], i64 [[TMP29]], i32 1 +; SLM-NEXT: [[TMP31:%.*]] = extractelement 
<2 x i64> [[TMP7]], i32 0 +; SLM-NEXT: [[TMP32:%.*]] = insertelement <2 x i64> undef, i64 [[TMP31]], i32 0 +; SLM-NEXT: [[TMP33:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1 +; SLM-NEXT: [[TMP34:%.*]] = insertelement <2 x i64> [[TMP32]], i64 [[TMP33]], i32 1 +; SLM-NEXT: [[TMP35:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP30]], <2 x i64> [[TMP34]]) +; SLM-NEXT: [[TMP36:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; SLM-NEXT: [[TMP37:%.*]] = insertelement <2 x i64> undef, i64 [[TMP36]], i32 0 +; SLM-NEXT: [[TMP38:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; SLM-NEXT: [[TMP39:%.*]] = insertelement <2 x i64> [[TMP37]], i64 [[TMP38]], i32 1 +; SLM-NEXT: [[TMP40:%.*]] = extractelement <2 x i64> [[TMP8]], i32 0 +; SLM-NEXT: [[TMP41:%.*]] = insertelement <2 x i64> undef, i64 [[TMP40]], i32 0 +; SLM-NEXT: [[TMP42:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1 +; SLM-NEXT: [[TMP43:%.*]] = insertelement <2 x i64> [[TMP41]], i64 [[TMP42]], i32 1 +; SLM-NEXT: [[TMP44:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP39]], <2 x i64> [[TMP43]]) +; SLM-NEXT: store <2 x i64> [[TMP17]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP26]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP35]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP44]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; ; AVX1-LABEL: @add_v8i64( -; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) -; AVX1-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) -; AVX1-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) -; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]]) -; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; 
AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0 +; AVX1-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> undef, i64 [[TMP5]], i32 0 +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP1]], i32 1 +; AVX1-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[TMP7]], i32 1 +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP1]], i32 2 +; AVX1-NEXT: [[TMP10:%.*]] = insertelement <4 x i64> [[TMP8]], i64 [[TMP9]], i32 2 +; AVX1-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP1]], i32 3 +; AVX1-NEXT: [[TMP12:%.*]] = insertelement <4 x i64> [[TMP10]], i64 [[TMP11]], i32 3 +; AVX1-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> undef, i64 [[TMP13]], i32 0 +; AVX1-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; AVX1-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX1-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP17]], i32 2 +; AVX1-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX1-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP19]], i32 3 +; AVX1-NEXT: [[TMP21:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP12]], <4 x i64> [[TMP20]]) +; AVX1-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 +; AVX1-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> undef, i64 [[TMP22]], i32 0 +; AVX1-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; AVX1-NEXT: [[TMP25:%.*]] = insertelement <4 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; AVX1-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; AVX1-NEXT: [[TMP27:%.*]] = insertelement <4 x i64> [[TMP25]], i64 [[TMP26]], i32 2 +; AVX1-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; AVX1-NEXT: [[TMP29:%.*]] = insertelement <4 x i64> [[TMP27]], i64 [[TMP28]], i32 3 +; AVX1-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP31:%.*]] = insertelement <4 x i64> undef, i64 [[TMP30]], i32 0 +; AVX1-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 +; AVX1-NEXT: [[TMP33:%.*]] = insertelement <4 x i64> [[TMP31]], i64 [[TMP32]], i32 1 +; AVX1-NEXT: [[TMP34:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 +; AVX1-NEXT: [[TMP35:%.*]] = insertelement <4 x i64> [[TMP33]], i64 [[TMP34]], i32 2 +; AVX1-NEXT: [[TMP36:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 +; AVX1-NEXT: [[TMP37:%.*]] = insertelement <4 x i64> [[TMP35]], i64 [[TMP36]], i32 3 +; AVX1-NEXT: [[TMP38:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP29]], <4 x i64> [[TMP37]]) +; AVX1-NEXT: 
store <4 x i64> [[TMP21]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX1-NEXT: store <4 x i64> [[TMP38]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @add_v8i64( Index: test/Transforms/SLPVectorizer/X86/arith-add-usat.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/arith-add-usat.ll +++ test/Transforms/SLPVectorizer/X86/arith-add-usat.ll @@ -27,38 +27,54 @@ define void @add_v8i64() { ; SSE-LABEL: @add_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A0]], i64 [[B0]]) -; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A1]], i64 [[B1]]) -; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A2]], i64 [[B2]]) -; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A3]], i64 [[B3]]) -; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A4]], i64 [[B4]]) -; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A5]], i64 [[B5]]) -; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A6]], i64 [[B6]]) -; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A7]], i64 [[B7]]) -; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 
[[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> undef, i64 [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP11]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> undef, i64 [[TMP13]], i32 0 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; SSE-NEXT: [[TMP17:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP12]], <2 x i64> [[TMP16]]) +; SSE-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> undef, i64 [[TMP18]], i32 0 +; SSE-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[TMP20]], i32 1 +; SSE-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <2 x i64> undef, i64 [[TMP22]], i32 0 +; SSE-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <2 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; SSE-NEXT: [[TMP26:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP21]], <2 x i64> [[TMP25]]) +; SSE-NEXT: [[TMP27:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <2 x i64> undef, i64 [[TMP27]], i32 0 +; SSE-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP30:%.*]] = insertelement <2 x i64> [[TMP28]], i64 [[TMP29]], i32 1 +; SSE-NEXT: [[TMP31:%.*]] = extractelement <2 x i64> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <2 x i64> undef, i64 [[TMP31]], i32 0 +; SSE-NEXT: [[TMP33:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1 +; 
SSE-NEXT: [[TMP34:%.*]] = insertelement <2 x i64> [[TMP32]], i64 [[TMP33]], i32 1 +; SSE-NEXT: [[TMP35:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP30]], <2 x i64> [[TMP34]]) +; SSE-NEXT: [[TMP36:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP37:%.*]] = insertelement <2 x i64> undef, i64 [[TMP36]], i32 0 +; SSE-NEXT: [[TMP38:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP39:%.*]] = insertelement <2 x i64> [[TMP37]], i64 [[TMP38]], i32 1 +; SSE-NEXT: [[TMP40:%.*]] = extractelement <2 x i64> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP41:%.*]] = insertelement <2 x i64> undef, i64 [[TMP40]], i32 0 +; SSE-NEXT: [[TMP42:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1 +; SSE-NEXT: [[TMP43:%.*]] = insertelement <2 x i64> [[TMP41]], i64 [[TMP42]], i32 1 +; SSE-NEXT: [[TMP44:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP39]], <2 x i64> [[TMP43]]) +; SSE-NEXT: store <2 x i64> [[TMP17]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP26]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP35]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP44]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v8i64( Index: test/Transforms/SLPVectorizer/X86/arith-mul.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/arith-mul.ll +++ test/Transforms/SLPVectorizer/X86/arith-mul.ll @@ -22,108 +22,148 @@ define void @mul_v8i64() { ; SSE-LABEL: @mul_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), 
align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[R0:%.*]] = mul i64 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = mul i64 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = mul i64 [[A2]], [[B2]] -; SSE-NEXT: [[R3:%.*]] = mul i64 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = mul i64 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = mul i64 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = mul i64 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = mul i64 [[A7]], [[B7]] -; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> undef, i64 [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP11]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> undef, i64 [[TMP13]], i32 0 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; SSE-NEXT: [[TMP17:%.*]] = mul <2 x i64> [[TMP12]], [[TMP16]] +; SSE-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> undef, i64 [[TMP18]], i32 0 +; SSE-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[TMP20]], i32 1 +; 
SSE-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <2 x i64> undef, i64 [[TMP22]], i32 0 +; SSE-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <2 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; SSE-NEXT: [[TMP26:%.*]] = mul <2 x i64> [[TMP21]], [[TMP25]] +; SSE-NEXT: [[TMP27:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <2 x i64> undef, i64 [[TMP27]], i32 0 +; SSE-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP30:%.*]] = insertelement <2 x i64> [[TMP28]], i64 [[TMP29]], i32 1 +; SSE-NEXT: [[TMP31:%.*]] = extractelement <2 x i64> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <2 x i64> undef, i64 [[TMP31]], i32 0 +; SSE-NEXT: [[TMP33:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1 +; SSE-NEXT: [[TMP34:%.*]] = insertelement <2 x i64> [[TMP32]], i64 [[TMP33]], i32 1 +; SSE-NEXT: [[TMP35:%.*]] = mul <2 x i64> [[TMP30]], [[TMP34]] +; SSE-NEXT: [[TMP36:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP37:%.*]] = insertelement <2 x i64> undef, i64 [[TMP36]], i32 0 +; SSE-NEXT: [[TMP38:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP39:%.*]] = insertelement <2 x i64> [[TMP37]], i64 [[TMP38]], i32 1 +; SSE-NEXT: [[TMP40:%.*]] = extractelement <2 x i64> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP41:%.*]] = insertelement <2 x i64> undef, i64 [[TMP40]], i32 0 +; SSE-NEXT: [[TMP42:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1 +; SSE-NEXT: [[TMP43:%.*]] = insertelement <2 x i64> [[TMP41]], i64 [[TMP42]], i32 1 +; SSE-NEXT: [[TMP44:%.*]] = mul <2 x i64> [[TMP39]], [[TMP43]] +; SSE-NEXT: store <2 x i64> [[TMP17]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP26]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP35]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP44]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; SLM-LABEL: @mul_v8i64( -; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr 
inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SLM-NEXT: [[R0:%.*]] = mul i64 [[A0]], [[B0]] -; SLM-NEXT: [[R1:%.*]] = mul i64 [[A1]], [[B1]] -; SLM-NEXT: [[R2:%.*]] = mul i64 [[A2]], [[B2]] -; SLM-NEXT: [[R3:%.*]] = mul i64 [[A3]], [[B3]] -; SLM-NEXT: [[R4:%.*]] = mul i64 [[A4]], [[B4]] -; SLM-NEXT: [[R5:%.*]] = mul i64 [[A5]], [[B5]] -; SLM-NEXT: [[R6:%.*]] = mul i64 [[A6]], [[B6]] -; SLM-NEXT: [[R7:%.*]] = mul i64 [[A7]], [[B7]] -; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; SLM-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> undef, i64 [[TMP9]], i32 0 +; SLM-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; SLM-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP11]], i32 1 +; SLM-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; SLM-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> undef, i64 [[TMP13]], i32 0 +; SLM-NEXT: 
[[TMP15:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; SLM-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; SLM-NEXT: [[TMP17:%.*]] = mul <2 x i64> [[TMP12]], [[TMP16]] +; SLM-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 +; SLM-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> undef, i64 [[TMP18]], i32 0 +; SLM-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; SLM-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[TMP20]], i32 1 +; SLM-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; SLM-NEXT: [[TMP23:%.*]] = insertelement <2 x i64> undef, i64 [[TMP22]], i32 0 +; SLM-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; SLM-NEXT: [[TMP25:%.*]] = insertelement <2 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; SLM-NEXT: [[TMP26:%.*]] = mul <2 x i64> [[TMP21]], [[TMP25]] +; SLM-NEXT: [[TMP27:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[TMP28:%.*]] = insertelement <2 x i64> undef, i64 [[TMP27]], i32 0 +; SLM-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[TMP30:%.*]] = insertelement <2 x i64> [[TMP28]], i64 [[TMP29]], i32 1 +; SLM-NEXT: [[TMP31:%.*]] = extractelement <2 x i64> [[TMP7]], i32 0 +; SLM-NEXT: [[TMP32:%.*]] = insertelement <2 x i64> undef, i64 [[TMP31]], i32 0 +; SLM-NEXT: [[TMP33:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1 +; SLM-NEXT: [[TMP34:%.*]] = insertelement <2 x i64> [[TMP32]], i64 [[TMP33]], i32 1 +; SLM-NEXT: [[TMP35:%.*]] = mul <2 x i64> [[TMP30]], [[TMP34]] +; SLM-NEXT: [[TMP36:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; SLM-NEXT: [[TMP37:%.*]] = insertelement <2 x i64> undef, i64 [[TMP36]], i32 0 +; SLM-NEXT: [[TMP38:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; SLM-NEXT: [[TMP39:%.*]] = insertelement <2 x i64> [[TMP37]], i64 [[TMP38]], i32 1 +; SLM-NEXT: [[TMP40:%.*]] = extractelement <2 x i64> [[TMP8]], i32 0 +; SLM-NEXT: [[TMP41:%.*]] = insertelement <2 x i64> undef, i64 [[TMP40]], i32 0 +; SLM-NEXT: [[TMP42:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1 +; SLM-NEXT: [[TMP43:%.*]] = insertelement <2 x i64> [[TMP41]], i64 [[TMP42]], i32 1 +; SLM-NEXT: [[TMP44:%.*]] = mul <2 x i64> [[TMP39]], [[TMP43]] +; SLM-NEXT: store <2 x i64> [[TMP17]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP26]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP35]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP44]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; ; AVX1-LABEL: @mul_v8i64( -; AVX1-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; AVX1-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; AVX1-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; AVX1-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; AVX1-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; AVX1-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; 
AVX1-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; AVX1-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; AVX1-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; AVX1-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; AVX1-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; AVX1-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; AVX1-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; AVX1-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; AVX1-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; AVX1-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; AVX1-NEXT: [[R0:%.*]] = mul i64 [[A0]], [[B0]] -; AVX1-NEXT: [[R1:%.*]] = mul i64 [[A1]], [[B1]] -; AVX1-NEXT: [[R2:%.*]] = mul i64 [[A2]], [[B2]] -; AVX1-NEXT: [[R3:%.*]] = mul i64 [[A3]], [[B3]] -; AVX1-NEXT: [[R4:%.*]] = mul i64 [[A4]], [[B4]] -; AVX1-NEXT: [[R5:%.*]] = mul i64 [[A5]], [[B5]] -; AVX1-NEXT: [[R6:%.*]] = mul i64 [[A6]], [[B6]] -; AVX1-NEXT: [[R7:%.*]] = mul i64 [[A7]], [[B7]] -; AVX1-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; AVX1-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; AVX1-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; AVX1-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; AVX1-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; AVX1-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; AVX1-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; AVX1-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0 +; AVX1-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> undef, i64 [[TMP5]], i32 0 +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP1]], i32 1 +; AVX1-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[TMP7]], i32 1 +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP1]], i32 2 +; AVX1-NEXT: [[TMP10:%.*]] = insertelement <4 x i64> [[TMP8]], i64 [[TMP9]], i32 2 +; AVX1-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP1]], i32 3 +; AVX1-NEXT: [[TMP12:%.*]] = 
insertelement <4 x i64> [[TMP10]], i64 [[TMP11]], i32 3 +; AVX1-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> undef, i64 [[TMP13]], i32 0 +; AVX1-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; AVX1-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX1-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP17]], i32 2 +; AVX1-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX1-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP19]], i32 3 +; AVX1-NEXT: [[TMP21:%.*]] = mul <4 x i64> [[TMP12]], [[TMP20]] +; AVX1-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 +; AVX1-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> undef, i64 [[TMP22]], i32 0 +; AVX1-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; AVX1-NEXT: [[TMP25:%.*]] = insertelement <4 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; AVX1-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; AVX1-NEXT: [[TMP27:%.*]] = insertelement <4 x i64> [[TMP25]], i64 [[TMP26]], i32 2 +; AVX1-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; AVX1-NEXT: [[TMP29:%.*]] = insertelement <4 x i64> [[TMP27]], i64 [[TMP28]], i32 3 +; AVX1-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP31:%.*]] = insertelement <4 x i64> undef, i64 [[TMP30]], i32 0 +; AVX1-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 +; AVX1-NEXT: [[TMP33:%.*]] = insertelement <4 x i64> [[TMP31]], i64 [[TMP32]], i32 1 +; AVX1-NEXT: [[TMP34:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 +; AVX1-NEXT: [[TMP35:%.*]] = insertelement <4 x i64> [[TMP33]], i64 [[TMP34]], i32 2 +; AVX1-NEXT: [[TMP36:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 +; AVX1-NEXT: [[TMP37:%.*]] = insertelement <4 x i64> [[TMP35]], i64 [[TMP36]], i32 3 +; AVX1-NEXT: [[TMP38:%.*]] = mul <4 x i64> [[TMP29]], [[TMP37]] +; AVX1-NEXT: store <4 x i64> [[TMP21]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX1-NEXT: store <4 x i64> [[TMP38]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @mul_v8i64( Index: test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll +++ test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll @@ -27,92 +27,148 @@ define void @sub_v8i64() { ; SSE-LABEL: @sub_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds 
([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A0]], i64 [[B0]]) -; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A1]], i64 [[B1]]) -; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A2]], i64 [[B2]]) -; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A3]], i64 [[B3]]) -; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A4]], i64 [[B4]]) -; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A5]], i64 [[B5]]) -; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A6]], i64 [[B6]]) -; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A7]], i64 [[B7]]) -; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* 
bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> undef, i64 [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP11]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> undef, i64 [[TMP13]], i32 0 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; SSE-NEXT: [[TMP17:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP12]], <2 x i64> [[TMP16]]) +; SSE-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> undef, i64 [[TMP18]], i32 0 +; SSE-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[TMP20]], i32 1 +; SSE-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <2 x i64> undef, i64 [[TMP22]], i32 0 +; SSE-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <2 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; SSE-NEXT: [[TMP26:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP21]], <2 x i64> [[TMP25]]) +; SSE-NEXT: [[TMP27:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <2 x i64> undef, i64 [[TMP27]], i32 0 +; SSE-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP30:%.*]] = insertelement <2 x i64> [[TMP28]], i64 [[TMP29]], i32 1 +; SSE-NEXT: [[TMP31:%.*]] = extractelement <2 x i64> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <2 x i64> undef, i64 [[TMP31]], i32 0 +; SSE-NEXT: [[TMP33:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1 +; SSE-NEXT: [[TMP34:%.*]] = insertelement <2 x i64> [[TMP32]], i64 [[TMP33]], i32 1 +; SSE-NEXT: [[TMP35:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP30]], <2 x i64> [[TMP34]]) +; SSE-NEXT: [[TMP36:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP37:%.*]] = insertelement <2 x i64> undef, i64 [[TMP36]], i32 0 +; SSE-NEXT: [[TMP38:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP39:%.*]] = insertelement <2 x i64> [[TMP37]], i64 [[TMP38]], i32 1 +; SSE-NEXT: [[TMP40:%.*]] = extractelement <2 x i64> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP41:%.*]] = insertelement <2 x i64> undef, i64 [[TMP40]], i32 0 +; SSE-NEXT: [[TMP42:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1 +; SSE-NEXT: [[TMP43:%.*]] = insertelement <2 x i64> [[TMP41]], i64 [[TMP42]], i32 1 +; SSE-NEXT: [[TMP44:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP39]], <2 x i64> [[TMP43]]) +; SSE-NEXT: store <2 x i64> [[TMP17]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP26]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP35]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP44]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: 
ret void ; ; SLM-LABEL: @sub_v8i64( -; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A0]], i64 [[B0]]) -; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A1]], i64 [[B1]]) -; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A2]], i64 [[B2]]) -; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A3]], i64 [[B3]]) -; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A4]], i64 [[B4]]) -; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A5]], i64 [[B5]]) -; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A6]], i64 [[B6]]) -; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A7]], i64 [[B7]]) -; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 
x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; SLM-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> undef, i64 [[TMP9]], i32 0 +; SLM-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; SLM-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP11]], i32 1 +; SLM-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; SLM-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> undef, i64 [[TMP13]], i32 0 +; SLM-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; SLM-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; SLM-NEXT: [[TMP17:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP12]], <2 x i64> [[TMP16]]) +; SLM-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 +; SLM-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> undef, i64 [[TMP18]], i32 0 +; SLM-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; SLM-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[TMP20]], i32 1 +; SLM-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; SLM-NEXT: [[TMP23:%.*]] = insertelement <2 x i64> undef, i64 [[TMP22]], i32 0 +; SLM-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; SLM-NEXT: [[TMP25:%.*]] = insertelement <2 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; SLM-NEXT: [[TMP26:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP21]], <2 x i64> [[TMP25]]) +; SLM-NEXT: [[TMP27:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[TMP28:%.*]] = insertelement <2 x i64> undef, i64 [[TMP27]], i32 0 +; SLM-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[TMP30:%.*]] = insertelement <2 x i64> [[TMP28]], i64 [[TMP29]], i32 1 +; SLM-NEXT: [[TMP31:%.*]] = extractelement <2 x i64> [[TMP7]], i32 0 +; SLM-NEXT: [[TMP32:%.*]] = insertelement <2 x i64> undef, i64 [[TMP31]], i32 0 +; SLM-NEXT: [[TMP33:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1 +; SLM-NEXT: [[TMP34:%.*]] = insertelement <2 x i64> [[TMP32]], i64 [[TMP33]], i32 1 +; SLM-NEXT: [[TMP35:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP30]], <2 x i64> [[TMP34]]) +; SLM-NEXT: [[TMP36:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; SLM-NEXT: [[TMP37:%.*]] = insertelement <2 x i64> undef, i64 [[TMP36]], i32 0 +; SLM-NEXT: [[TMP38:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; SLM-NEXT: [[TMP39:%.*]] = insertelement <2 x i64> [[TMP37]], i64 [[TMP38]], i32 1 +; SLM-NEXT: [[TMP40:%.*]] = extractelement <2 x i64> [[TMP8]], i32 0 +; SLM-NEXT: 
[[TMP41:%.*]] = insertelement <2 x i64> undef, i64 [[TMP40]], i32 0 +; SLM-NEXT: [[TMP42:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1 +; SLM-NEXT: [[TMP43:%.*]] = insertelement <2 x i64> [[TMP41]], i64 [[TMP42]], i32 1 +; SLM-NEXT: [[TMP44:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP39]], <2 x i64> [[TMP43]]) +; SLM-NEXT: store <2 x i64> [[TMP17]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP26]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP35]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP44]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; ; AVX1-LABEL: @sub_v8i64( -; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) -; AVX1-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) -; AVX1-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) -; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]]) -; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* 
getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0 +; AVX1-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> undef, i64 [[TMP5]], i32 0 +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP1]], i32 1 +; AVX1-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[TMP7]], i32 1 +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP1]], i32 2 +; AVX1-NEXT: [[TMP10:%.*]] = insertelement <4 x i64> [[TMP8]], i64 [[TMP9]], i32 2 +; AVX1-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP1]], i32 3 +; AVX1-NEXT: [[TMP12:%.*]] = insertelement <4 x i64> [[TMP10]], i64 [[TMP11]], i32 3 +; AVX1-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> undef, i64 [[TMP13]], i32 0 +; AVX1-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; AVX1-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX1-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP17]], i32 2 +; AVX1-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX1-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP19]], i32 3 +; AVX1-NEXT: [[TMP21:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP12]], <4 x i64> [[TMP20]]) +; AVX1-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 +; AVX1-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> undef, i64 [[TMP22]], i32 0 +; AVX1-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; AVX1-NEXT: [[TMP25:%.*]] = insertelement <4 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; AVX1-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; AVX1-NEXT: [[TMP27:%.*]] = insertelement <4 x i64> [[TMP25]], i64 [[TMP26]], i32 2 +; AVX1-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; AVX1-NEXT: [[TMP29:%.*]] = insertelement <4 x i64> [[TMP27]], i64 [[TMP28]], i32 3 +; AVX1-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP31:%.*]] = insertelement <4 x i64> undef, i64 [[TMP30]], i32 0 +; AVX1-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 +; AVX1-NEXT: [[TMP33:%.*]] = insertelement <4 x i64> [[TMP31]], i64 [[TMP32]], i32 1 +; AVX1-NEXT: [[TMP34:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 +; AVX1-NEXT: [[TMP35:%.*]] = insertelement <4 x i64> [[TMP33]], i64 [[TMP34]], i32 2 +; AVX1-NEXT: [[TMP36:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 +; AVX1-NEXT: [[TMP37:%.*]] = insertelement <4 x i64> [[TMP35]], i64 [[TMP36]], i32 3 +; AVX1-NEXT: [[TMP38:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP29]], <4 x i64> [[TMP37]]) +; AVX1-NEXT: store <4 x i64> [[TMP21]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX1-NEXT: store <4 x i64> [[TMP38]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @sub_v8i64( Index: test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll +++ test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll @@ -27,38 +27,54 @@ define void @sub_v8i64() { ; SSE-LABEL: @sub_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = 
load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A0]], i64 [[B0]]) -; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A1]], i64 [[B1]]) -; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A2]], i64 [[B2]]) -; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A3]], i64 [[B3]]) -; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A4]], i64 [[B4]]) -; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A5]], i64 [[B5]]) -; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A6]], i64 [[B6]]) -; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A7]], i64 [[B7]]) -; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast 
(i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> undef, i64 [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP11]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> undef, i64 [[TMP13]], i32 0 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; SSE-NEXT: [[TMP17:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP12]], <2 x i64> [[TMP16]]) +; SSE-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> undef, i64 [[TMP18]], i32 0 +; SSE-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[TMP20]], i32 1 +; SSE-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <2 x i64> undef, i64 [[TMP22]], i32 0 +; SSE-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <2 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; SSE-NEXT: [[TMP26:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP21]], <2 x i64> [[TMP25]]) +; SSE-NEXT: [[TMP27:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <2 x i64> undef, i64 [[TMP27]], i32 0 +; SSE-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP30:%.*]] = insertelement <2 x i64> [[TMP28]], i64 [[TMP29]], i32 1 +; SSE-NEXT: [[TMP31:%.*]] = extractelement <2 x i64> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <2 x i64> undef, i64 [[TMP31]], i32 0 +; SSE-NEXT: [[TMP33:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1 +; SSE-NEXT: [[TMP34:%.*]] = insertelement <2 x i64> [[TMP32]], i64 [[TMP33]], i32 1 +; SSE-NEXT: [[TMP35:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP30]], <2 x i64> [[TMP34]]) +; SSE-NEXT: [[TMP36:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP37:%.*]] = insertelement <2 x i64> undef, i64 [[TMP36]], i32 0 +; SSE-NEXT: [[TMP38:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP39:%.*]] = insertelement <2 x i64> [[TMP37]], i64 [[TMP38]], i32 1 +; SSE-NEXT: [[TMP40:%.*]] = extractelement <2 x i64> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP41:%.*]] = insertelement <2 x i64> undef, i64 [[TMP40]], i32 0 +; SSE-NEXT: [[TMP42:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1 +; SSE-NEXT: [[TMP43:%.*]] = 
insertelement <2 x i64> [[TMP41]], i64 [[TMP42]], i32 1 +; SSE-NEXT: [[TMP44:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP39]], <2 x i64> [[TMP43]]) +; SSE-NEXT: store <2 x i64> [[TMP17]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP26]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP35]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP44]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; SLM-LABEL: @sub_v8i64( Index: test/Transforms/SLPVectorizer/X86/bswap.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/bswap.ll +++ test/Transforms/SLPVectorizer/X86/bswap.ll @@ -22,8 +22,9 @@ ; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8 ; SSE-NEXT: [[BSWAP0:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD0]]) ; SSE-NEXT: [[BSWAP1:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD1]]) -; SSE-NEXT: store i64 [[BSWAP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[BSWAP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[BSWAP0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[BSWAP1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; AVX-LABEL: @bswap_2i64( @@ -51,10 +52,12 @@ ; SSE-NEXT: [[BSWAP1:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD1]]) ; SSE-NEXT: [[BSWAP2:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD2]]) ; SSE-NEXT: [[BSWAP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD3]]) -; SSE-NEXT: store i64 [[BSWAP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4 -; SSE-NEXT: store i64 [[BSWAP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4 -; SSE-NEXT: store i64 [[BSWAP2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4 -; SSE-NEXT: store i64 [[BSWAP3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[BSWAP0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[BSWAP1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[BSWAP2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[BSWAP3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4 ; SSE-NEXT: ret void ; ; AVX-LABEL: @bswap_4i64( Index: test/Transforms/SLPVectorizer/X86/cast.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/cast.ll +++ test/Transforms/SLPVectorizer/X86/cast.ll @@ -81,15 +81,20 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[B:%.*]] to <2 x i16>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* [[TMP0]], align 1 ; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] 
to <2 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* [[TMP3]], align 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 2 -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 2 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[ARRAYIDX5]] to <2 x i16>* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i16>, <2 x i16>* [[TMP4]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = sext <2 x i16> [[TMP5]] to <2 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[ARRAYIDX7]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[CONV6:%.*]] = sext i16 [[TMP3]] to i64 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 3 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, i16* [[ARRAYIDX8]], align 1 +; CHECK-NEXT: [[CONV9:%.*]] = sext i16 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> undef, i64 [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i64> [[TMP8]], i64 [[CONV6]], i32 2 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i64> [[TMP9]], i64 [[CONV9]], i32 3 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64* [[A:%.*]] to <4 x i64>* +; CHECK-NEXT: store <4 x i64> [[TMP10]], <4 x i64>* [[TMP11]], align 4 ; CHECK-NEXT: ret i64 undef ; entry: Index: test/Transforms/SLPVectorizer/X86/ctlz.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/ctlz.ll +++ test/Transforms/SLPVectorizer/X86/ctlz.ll @@ -30,8 +30,9 @@ ; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8 ; CHECK-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 false) ; CHECK-NEXT: [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 false) -; CHECK-NEXT: store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8 -; CHECK-NEXT: store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CTLZ0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CTLZ1]], i32 1 +; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 ; CHECK-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 @@ -44,20 +45,38 @@ } define void @ctlz_4i64() #0 { -; CHECK-LABEL: @ctlz_4i64( -; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 -; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 -; CHECK-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 -; CHECK-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 -; CHECK-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 false) -; CHECK-NEXT: [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 false) -; CHECK-NEXT: [[CTLZ2:%.*]] = call i64 
@llvm.ctlz.i64(i64 [[LD2]], i1 false) -; CHECK-NEXT: [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 false) -; CHECK-NEXT: store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4 -; CHECK-NEXT: store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4 -; CHECK-NEXT: store i64 [[CTLZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4 -; CHECK-NEXT: store i64 [[CTLZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4 -; CHECK-NEXT: ret void +; SSE-LABEL: @ctlz_4i64( +; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 +; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 +; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 +; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 +; SSE-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 false) +; SSE-NEXT: [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 false) +; SSE-NEXT: [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 false) +; SSE-NEXT: [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 false) +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CTLZ0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CTLZ1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[CTLZ2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[CTLZ3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4 +; SSE-NEXT: ret void +; +; AVX-LABEL: @ctlz_4i64( +; AVX-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 +; AVX-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 +; AVX-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 +; AVX-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 +; AVX-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 false) +; AVX-NEXT: [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 false) +; AVX-NEXT: [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 false) +; AVX-NEXT: [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 false) +; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[CTLZ0]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[CTLZ1]], i32 1 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[CTLZ2]], i32 2 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[CTLZ3]], i32 3 +; AVX-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4 +; AVX-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 @@ -84,10 +103,11 @@ ; CHECK-NEXT: [[CTLZ1:%.*]] = call i32 
@llvm.ctlz.i32(i32 [[LD1]], i1 false) ; CHECK-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) ; CHECK-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) -; CHECK-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; CHECK-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; CHECK-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; CHECK-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CTLZ0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CTLZ1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CTLZ2]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CTLZ3]], i32 3 +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; CHECK-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 @@ -123,14 +143,16 @@ ; SSE-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false) ; SSE-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false) ; SSE-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false) -; SSE-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; SSE-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; SSE-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; SSE-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; SSE-NEXT: store i32 [[CTLZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; SSE-NEXT: store i32 [[CTLZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; SSE-NEXT: store i32 [[CTLZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; SSE-NEXT: store i32 [[CTLZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CTLZ0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CTLZ1]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CTLZ2]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CTLZ3]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[CTLZ4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[CTLZ5]], i32 1 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTLZ6]], i32 2 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTLZ7]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE-NEXT: ret void ; ; AVX1-LABEL: @ctlz_8i32( @@ -150,14 +172,15 @@ ; AVX1-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false) ; AVX1-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false) ; AVX1-NEXT: 
[[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false) -; AVX1-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; AVX1-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; AVX1-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; AVX1-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; AVX1-NEXT: store i32 [[CTLZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; AVX1-NEXT: store i32 [[CTLZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; AVX1-NEXT: store i32 [[CTLZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; AVX1-NEXT: store i32 [[CTLZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; AVX1-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[CTLZ0]], i32 0 +; AVX1-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CTLZ1]], i32 1 +; AVX1-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CTLZ2]], i32 2 +; AVX1-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CTLZ3]], i32 3 +; AVX1-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CTLZ4]], i32 4 +; AVX1-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CTLZ5]], i32 5 +; AVX1-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CTLZ6]], i32 6 +; AVX1-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CTLZ7]], i32 7 +; AVX1-NEXT: store <8 x i32> [[TMP8]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @ctlz_8i32( @@ -471,8 +494,9 @@ ; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8 ; CHECK-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true) ; CHECK-NEXT: [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 true) -; CHECK-NEXT: store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8 -; CHECK-NEXT: store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CTLZ0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CTLZ1]], i32 1 +; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 ; CHECK-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 @@ -485,20 +509,38 @@ } define void @ctlz_undef_4i64() #0 { -; CHECK-LABEL: @ctlz_undef_4i64( -; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 -; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 -; CHECK-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 -; CHECK-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 -; CHECK-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true) -; CHECK-NEXT: [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 true) -; CHECK-NEXT: [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 true) -; 
CHECK-NEXT: [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 true) -; CHECK-NEXT: store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4 -; CHECK-NEXT: store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4 -; CHECK-NEXT: store i64 [[CTLZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4 -; CHECK-NEXT: store i64 [[CTLZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4 -; CHECK-NEXT: ret void +; SSE-LABEL: @ctlz_undef_4i64( +; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 +; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 +; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 +; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 +; SSE-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true) +; SSE-NEXT: [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 true) +; SSE-NEXT: [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 true) +; SSE-NEXT: [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 true) +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CTLZ0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CTLZ1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[CTLZ2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[CTLZ3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4 +; SSE-NEXT: ret void +; +; AVX-LABEL: @ctlz_undef_4i64( +; AVX-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 +; AVX-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 +; AVX-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 +; AVX-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 +; AVX-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true) +; AVX-NEXT: [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 true) +; AVX-NEXT: [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 true) +; AVX-NEXT: [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 true) +; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[CTLZ0]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[CTLZ1]], i32 1 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[CTLZ2]], i32 2 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[CTLZ3]], i32 3 +; AVX-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4 +; AVX-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 @@ -525,10 +567,11 @@ ; CHECK-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 true) ; 
CHECK-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 true) ; CHECK-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 true) -; CHECK-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; CHECK-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; CHECK-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; CHECK-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CTLZ0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CTLZ1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CTLZ2]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CTLZ3]], i32 3 +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; CHECK-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 @@ -564,14 +607,16 @@ ; SSE-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 true) ; SSE-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 true) ; SSE-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 true) -; SSE-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; SSE-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; SSE-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; SSE-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; SSE-NEXT: store i32 [[CTLZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; SSE-NEXT: store i32 [[CTLZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; SSE-NEXT: store i32 [[CTLZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; SSE-NEXT: store i32 [[CTLZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CTLZ0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CTLZ1]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CTLZ2]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CTLZ3]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[CTLZ4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[CTLZ5]], i32 1 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTLZ6]], i32 2 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTLZ7]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE-NEXT: ret void ; ; AVX1-LABEL: @ctlz_undef_8i32( @@ -591,14 +636,15 @@ ; AVX1-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 true) ; AVX1-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 true) ; AVX1-NEXT: [[CTLZ7:%.*]] = call i32 
@llvm.ctlz.i32(i32 [[LD7]], i1 true) -; AVX1-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; AVX1-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; AVX1-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; AVX1-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; AVX1-NEXT: store i32 [[CTLZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; AVX1-NEXT: store i32 [[CTLZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; AVX1-NEXT: store i32 [[CTLZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; AVX1-NEXT: store i32 [[CTLZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; AVX1-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[CTLZ0]], i32 0 +; AVX1-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CTLZ1]], i32 1 +; AVX1-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CTLZ2]], i32 2 +; AVX1-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CTLZ3]], i32 3 +; AVX1-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CTLZ4]], i32 4 +; AVX1-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CTLZ5]], i32 5 +; AVX1-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CTLZ6]], i32 6 +; AVX1-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CTLZ7]], i32 7 +; AVX1-NEXT: store <8 x i32> [[TMP8]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @ctlz_undef_8i32( Index: test/Transforms/SLPVectorizer/X86/ctpop.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/ctpop.ll +++ test/Transforms/SLPVectorizer/X86/ctpop.ll @@ -26,8 +26,9 @@ ; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8 ; CHECK-NEXT: [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]]) ; CHECK-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]]) -; CHECK-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8 -; CHECK-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CTPOP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CTPOP1]], i32 1 +; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 ; CHECK-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 @@ -49,10 +50,12 @@ ; SSE-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]]) ; SSE-NEXT: [[CTPOP2:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD2]]) ; SSE-NEXT: [[CTPOP3:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD3]]) -; SSE-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4 -; SSE-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4 -; SSE-NEXT: store i64 [[CTPOP2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4 -; SSE-NEXT: store i64 [[CTPOP3]], i64* getelementptr 
inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CTPOP0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CTPOP1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[CTPOP2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[CTPOP3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4 ; SSE-NEXT: ret void ; ; AVX1-LABEL: @ctpop_4i64( @@ -64,10 +67,11 @@ ; AVX1-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]]) ; AVX1-NEXT: [[CTPOP2:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD2]]) ; AVX1-NEXT: [[CTPOP3:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD3]]) -; AVX1-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4 -; AVX1-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4 -; AVX1-NEXT: store i64 [[CTPOP2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4 -; AVX1-NEXT: store i64 [[CTPOP3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4 +; AVX1-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[CTPOP0]], i32 0 +; AVX1-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[CTPOP1]], i32 1 +; AVX1-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[CTPOP2]], i32 2 +; AVX1-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[CTPOP3]], i32 3 +; AVX1-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @ctpop_4i64( @@ -107,10 +111,11 @@ ; SSE42-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]]) ; SSE42-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]]) ; SSE42-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]]) -; SSE42-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; SSE42-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; SSE42-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; SSE42-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; SSE42-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CTPOP0]], i32 0 +; SSE42-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CTPOP1]], i32 1 +; SSE42-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CTPOP2]], i32 2 +; SSE42-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CTPOP3]], i32 3 +; SSE42-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; SSE42-NEXT: ret void ; ; AVX-LABEL: @ctpop_4i32( @@ -122,10 +127,11 @@ ; AVX-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]]) ; AVX-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]]) ; AVX-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]]) -; AVX-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; AVX-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; 
AVX-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; AVX-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CTPOP0]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CTPOP1]], i32 1 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CTPOP2]], i32 2 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CTPOP3]], i32 3 +; AVX-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; AVX-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 @@ -170,14 +176,16 @@ ; SSE42-NEXT: [[CTPOP5:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD5]]) ; SSE42-NEXT: [[CTPOP6:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD6]]) ; SSE42-NEXT: [[CTPOP7:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD7]]) -; SSE42-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; SSE42-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; SSE42-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; SSE42-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; SSE42-NEXT: store i32 [[CTPOP4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; SSE42-NEXT: store i32 [[CTPOP5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; SSE42-NEXT: store i32 [[CTPOP6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; SSE42-NEXT: store i32 [[CTPOP7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; SSE42-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CTPOP0]], i32 0 +; SSE42-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CTPOP1]], i32 1 +; SSE42-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CTPOP2]], i32 2 +; SSE42-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CTPOP3]], i32 3 +; SSE42-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[CTPOP4]], i32 0 +; SSE42-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[CTPOP5]], i32 1 +; SSE42-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTPOP6]], i32 2 +; SSE42-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTPOP7]], i32 3 +; SSE42-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE42-NEXT: ret void ; ; AVX1-LABEL: @ctpop_8i32( @@ -197,14 +205,15 @@ ; AVX1-NEXT: [[CTPOP5:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD5]]) ; AVX1-NEXT: [[CTPOP6:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD6]]) ; AVX1-NEXT: [[CTPOP7:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD7]]) -; AVX1-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; AVX1-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; AVX1-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; 
AVX1-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; AVX1-NEXT: store i32 [[CTPOP4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; AVX1-NEXT: store i32 [[CTPOP5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; AVX1-NEXT: store i32 [[CTPOP6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; AVX1-NEXT: store i32 [[CTPOP7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; AVX1-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[CTPOP0]], i32 0 +; AVX1-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CTPOP1]], i32 1 +; AVX1-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CTPOP2]], i32 2 +; AVX1-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CTPOP3]], i32 3 +; AVX1-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CTPOP4]], i32 4 +; AVX1-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CTPOP5]], i32 5 +; AVX1-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CTPOP6]], i32 6 +; AVX1-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CTPOP7]], i32 7 +; AVX1-NEXT: store <8 x i32> [[TMP8]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @ctpop_8i32( Index: test/Transforms/SLPVectorizer/X86/cttz.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/cttz.ll +++ test/Transforms/SLPVectorizer/X86/cttz.ll @@ -30,8 +30,9 @@ ; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8 ; CHECK-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 false) ; CHECK-NEXT: [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 false) -; CHECK-NEXT: store i64 [[CTTZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8 -; CHECK-NEXT: store i64 [[CTTZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CTTZ0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CTTZ1]], i32 1 +; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 ; CHECK-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 @@ -44,20 +45,38 @@ } define void @cttz_4i64() #0 { -; CHECK-LABEL: @cttz_4i64( -; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 -; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 -; CHECK-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 -; CHECK-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 -; CHECK-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 false) -; CHECK-NEXT: [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 false) -; CHECK-NEXT: [[CTTZ2:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD2]], i1 false) -; CHECK-NEXT: [[CTTZ3:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD3]], i1 false) -; CHECK-NEXT: store i64 [[CTTZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4 -; 
CHECK-NEXT: store i64 [[CTTZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4 -; CHECK-NEXT: store i64 [[CTTZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4 -; CHECK-NEXT: store i64 [[CTTZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4 -; CHECK-NEXT: ret void +; SSE-LABEL: @cttz_4i64( +; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 +; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 +; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 +; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 +; SSE-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 false) +; SSE-NEXT: [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 false) +; SSE-NEXT: [[CTTZ2:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD2]], i1 false) +; SSE-NEXT: [[CTTZ3:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD3]], i1 false) +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CTTZ0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CTTZ1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[CTTZ2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[CTTZ3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4 +; SSE-NEXT: ret void +; +; AVX-LABEL: @cttz_4i64( +; AVX-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 +; AVX-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 +; AVX-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 +; AVX-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 +; AVX-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 false) +; AVX-NEXT: [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 false) +; AVX-NEXT: [[CTTZ2:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD2]], i1 false) +; AVX-NEXT: [[CTTZ3:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD3]], i1 false) +; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[CTTZ0]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[CTTZ1]], i32 1 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[CTTZ2]], i32 2 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[CTTZ3]], i32 3 +; AVX-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4 +; AVX-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 @@ -84,10 +103,11 @@ ; CHECK-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 false) ; CHECK-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 false) ; CHECK-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 false) -; CHECK-NEXT: store i32 [[CTTZ0]], i32* getelementptr 
inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; CHECK-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; CHECK-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; CHECK-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CTTZ0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CTTZ1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CTTZ2]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CTTZ3]], i32 3 +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; CHECK-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 @@ -123,14 +143,16 @@ ; SSE-NEXT: [[CTTZ5:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD5]], i1 false) ; SSE-NEXT: [[CTTZ6:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD6]], i1 false) ; SSE-NEXT: [[CTTZ7:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD7]], i1 false) -; SSE-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; SSE-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; SSE-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; SSE-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; SSE-NEXT: store i32 [[CTTZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; SSE-NEXT: store i32 [[CTTZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; SSE-NEXT: store i32 [[CTTZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; SSE-NEXT: store i32 [[CTTZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CTTZ0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CTTZ1]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CTTZ2]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CTTZ3]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[CTTZ4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[CTTZ5]], i32 1 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTTZ6]], i32 2 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTTZ7]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE-NEXT: ret void ; ; AVX1-LABEL: @cttz_8i32( @@ -150,14 +172,15 @@ ; AVX1-NEXT: [[CTTZ5:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD5]], i1 false) ; AVX1-NEXT: [[CTTZ6:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD6]], i1 false) ; AVX1-NEXT: [[CTTZ7:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD7]], i1 false) -; AVX1-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; AVX1-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 
x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; AVX1-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; AVX1-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; AVX1-NEXT: store i32 [[CTTZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; AVX1-NEXT: store i32 [[CTTZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; AVX1-NEXT: store i32 [[CTTZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; AVX1-NEXT: store i32 [[CTTZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; AVX1-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[CTTZ0]], i32 0 +; AVX1-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CTTZ1]], i32 1 +; AVX1-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CTTZ2]], i32 2 +; AVX1-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CTTZ3]], i32 3 +; AVX1-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CTTZ4]], i32 4 +; AVX1-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CTTZ5]], i32 5 +; AVX1-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CTTZ6]], i32 6 +; AVX1-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CTTZ7]], i32 7 +; AVX1-NEXT: store <8 x i32> [[TMP8]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @cttz_8i32( @@ -471,8 +494,9 @@ ; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8 ; CHECK-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 true) ; CHECK-NEXT: [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 true) -; CHECK-NEXT: store i64 [[CTTZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8 -; CHECK-NEXT: store i64 [[CTTZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CTTZ0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CTTZ1]], i32 1 +; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 ; CHECK-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 @@ -485,20 +509,38 @@ } define void @cttz_undef_4i64() #0 { -; CHECK-LABEL: @cttz_undef_4i64( -; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 -; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 -; CHECK-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 -; CHECK-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 -; CHECK-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 true) -; CHECK-NEXT: [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 true) -; CHECK-NEXT: [[CTTZ2:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD2]], i1 true) -; CHECK-NEXT: [[CTTZ3:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD3]], i1 true) -; CHECK-NEXT: store i64 [[CTTZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4 -; CHECK-NEXT: store i64 [[CTTZ1]], i64* getelementptr 
inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4 -; CHECK-NEXT: store i64 [[CTTZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4 -; CHECK-NEXT: store i64 [[CTTZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4 -; CHECK-NEXT: ret void +; SSE-LABEL: @cttz_undef_4i64( +; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 +; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 +; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 +; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 +; SSE-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 true) +; SSE-NEXT: [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 true) +; SSE-NEXT: [[CTTZ2:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD2]], i1 true) +; SSE-NEXT: [[CTTZ3:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD3]], i1 true) +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CTTZ0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CTTZ1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[CTTZ2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[CTTZ3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4 +; SSE-NEXT: ret void +; +; AVX-LABEL: @cttz_undef_4i64( +; AVX-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 +; AVX-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 +; AVX-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 +; AVX-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 +; AVX-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 true) +; AVX-NEXT: [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 true) +; AVX-NEXT: [[CTTZ2:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD2]], i1 true) +; AVX-NEXT: [[CTTZ3:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD3]], i1 true) +; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[CTTZ0]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[CTTZ1]], i32 1 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[CTTZ2]], i32 2 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[CTTZ3]], i32 3 +; AVX-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4 +; AVX-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 @@ -525,10 +567,11 @@ ; CHECK-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 true) ; CHECK-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 true) ; CHECK-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 true) -; CHECK-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 
0), align 4 -; CHECK-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; CHECK-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; CHECK-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CTTZ0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CTTZ1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CTTZ2]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CTTZ3]], i32 3 +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; CHECK-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 @@ -564,14 +607,16 @@ ; SSE-NEXT: [[CTTZ5:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD5]], i1 true) ; SSE-NEXT: [[CTTZ6:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD6]], i1 true) ; SSE-NEXT: [[CTTZ7:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD7]], i1 true) -; SSE-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; SSE-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; SSE-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; SSE-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; SSE-NEXT: store i32 [[CTTZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; SSE-NEXT: store i32 [[CTTZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; SSE-NEXT: store i32 [[CTTZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; SSE-NEXT: store i32 [[CTTZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CTTZ0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CTTZ1]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CTTZ2]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CTTZ3]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[CTTZ4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[CTTZ5]], i32 1 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTTZ6]], i32 2 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTTZ7]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE-NEXT: ret void ; ; AVX1-LABEL: @cttz_undef_8i32( @@ -591,14 +636,15 @@ ; AVX1-NEXT: [[CTTZ5:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD5]], i1 true) ; AVX1-NEXT: [[CTTZ6:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD6]], i1 true) ; AVX1-NEXT: [[CTTZ7:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD7]], i1 true) -; AVX1-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; AVX1-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 
-; AVX1-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; AVX1-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; AVX1-NEXT: store i32 [[CTTZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; AVX1-NEXT: store i32 [[CTTZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; AVX1-NEXT: store i32 [[CTTZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; AVX1-NEXT: store i32 [[CTTZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; AVX1-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[CTTZ0]], i32 0 +; AVX1-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CTTZ1]], i32 1 +; AVX1-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CTTZ2]], i32 2 +; AVX1-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CTTZ3]], i32 3 +; AVX1-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CTTZ4]], i32 4 +; AVX1-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CTTZ5]], i32 5 +; AVX1-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CTTZ6]], i32 6 +; AVX1-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CTTZ7]], i32 7 +; AVX1-NEXT: store <8 x i32> [[TMP8]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @cttz_undef_8i32( Index: test/Transforms/SLPVectorizer/X86/fma.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/fma.ll +++ test/Transforms/SLPVectorizer/X86/fma.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=NO-FMA ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=NO-FMA ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=FMA --check-prefix=FMA256 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=FMA --check-prefix=FMA256 @@ -26,16 +25,19 @@ define void @fma_2f64() #0 { ; NO-FMA-LABEL: @fma_2f64( -; NO-FMA-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[B0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[B1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[C0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[C1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[FMA0:%.*]] = call double @llvm.fma.f64(double [[A0]], double [[B0]], double [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call double @llvm.fma.f64(double [[A1]], double [[B1]], double [[C1]]) -; NO-FMA-NEXT: store double [[FMA0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; 
NO-FMA-NEXT: store double [[FMA1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; NO-FMA-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[TMP4]], i32 0 +; NO-FMA-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; NO-FMA-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP5]], double [[TMP6]], i32 1 +; NO-FMA-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; NO-FMA-NEXT: [[TMP9:%.*]] = insertelement <2 x double> undef, double [[TMP8]], i32 0 +; NO-FMA-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; NO-FMA-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP9]], double [[TMP10]], i32 1 +; NO-FMA-NEXT: [[TMP12:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP7]], <2 x double> [[TMP11]]) +; NO-FMA-NEXT: store <2 x double> [[TMP12]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 ; NO-FMA-NEXT: ret void ; ; FMA-LABEL: @fma_2f64( @@ -61,26 +63,27 @@ define void @fma_4f64() #0 { ; NO-FMA-LABEL: @fma_4f64( -; NO-FMA-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 8 -; NO-FMA-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 3), align 8 -; NO-FMA-NEXT: [[B0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[B1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[B2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2), align 8 -; NO-FMA-NEXT: [[B3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 3), align 8 -; NO-FMA-NEXT: [[C0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[C1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[C2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2), align 8 -; NO-FMA-NEXT: [[C3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 3), align 8 -; NO-FMA-NEXT: [[FMA0:%.*]] = call double @llvm.fma.f64(double [[A0]], double [[B0]], double [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call double @llvm.fma.f64(double [[A1]], double [[B1]], double [[C1]]) -; NO-FMA-NEXT: [[FMA2:%.*]] = call double @llvm.fma.f64(double [[A2]], double [[B2]], double [[C2]]) -; NO-FMA-NEXT: [[FMA3:%.*]] = call double @llvm.fma.f64(double [[A3]], double [[B3]], double [[C3]]) -; 
NO-FMA-NEXT: store double [[FMA0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: store double [[FMA1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: store double [[FMA2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; NO-FMA-NEXT: store double [[FMA3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 8 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 8 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcC64 to <4 x double>*), align 8 +; NO-FMA-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP2]], i32 0 +; NO-FMA-NEXT: [[TMP5:%.*]] = insertelement <4 x double> undef, double [[TMP4]], i32 0 +; NO-FMA-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[TMP2]], i32 1 +; NO-FMA-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP5]], double [[TMP6]], i32 1 +; NO-FMA-NEXT: [[TMP8:%.*]] = extractelement <4 x double> [[TMP2]], i32 2 +; NO-FMA-NEXT: [[TMP9:%.*]] = insertelement <4 x double> [[TMP7]], double [[TMP8]], i32 2 +; NO-FMA-NEXT: [[TMP10:%.*]] = extractelement <4 x double> [[TMP2]], i32 3 +; NO-FMA-NEXT: [[TMP11:%.*]] = insertelement <4 x double> [[TMP9]], double [[TMP10]], i32 3 +; NO-FMA-NEXT: [[TMP12:%.*]] = extractelement <4 x double> [[TMP3]], i32 0 +; NO-FMA-NEXT: [[TMP13:%.*]] = insertelement <4 x double> undef, double [[TMP12]], i32 0 +; NO-FMA-NEXT: [[TMP14:%.*]] = extractelement <4 x double> [[TMP3]], i32 1 +; NO-FMA-NEXT: [[TMP15:%.*]] = insertelement <4 x double> [[TMP13]], double [[TMP14]], i32 1 +; NO-FMA-NEXT: [[TMP16:%.*]] = extractelement <4 x double> [[TMP3]], i32 2 +; NO-FMA-NEXT: [[TMP17:%.*]] = insertelement <4 x double> [[TMP15]], double [[TMP16]], i32 2 +; NO-FMA-NEXT: [[TMP18:%.*]] = extractelement <4 x double> [[TMP3]], i32 3 +; NO-FMA-NEXT: [[TMP19:%.*]] = insertelement <4 x double> [[TMP17]], double [[TMP18]], i32 3 +; NO-FMA-NEXT: [[TMP20:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP11]], <4 x double> [[TMP19]]) +; NO-FMA-NEXT: store <4 x double> [[TMP20]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 ; NO-FMA-NEXT: ret void ; ; FMA-LABEL: @fma_4f64( @@ -116,46 +119,48 @@ define void @fma_8f64() #0 { ; NO-FMA-LABEL: @fma_8f64( -; NO-FMA-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6), align 4 -; 
NO-FMA-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[B0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[B1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[B2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[B3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[B4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[B5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[B6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[B7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[C0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[C1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[C2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[C3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[C4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[C5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[C6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[C7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[FMA0:%.*]] = call double @llvm.fma.f64(double [[A0]], double [[B0]], double [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call double @llvm.fma.f64(double [[A1]], double [[B1]], double [[C1]]) -; NO-FMA-NEXT: [[FMA2:%.*]] = call double @llvm.fma.f64(double [[A2]], double [[B2]], double [[C2]]) -; NO-FMA-NEXT: [[FMA3:%.*]] = call double @llvm.fma.f64(double [[A3]], double [[B3]], double [[C3]]) -; NO-FMA-NEXT: [[FMA4:%.*]] = call double @llvm.fma.f64(double [[A4]], double [[B4]], double [[C4]]) -; NO-FMA-NEXT: [[FMA5:%.*]] = call double @llvm.fma.f64(double [[A5]], double [[B5]], double [[C5]]) -; NO-FMA-NEXT: [[FMA6:%.*]] = call double @llvm.fma.f64(double [[A6]], double [[B6]], double [[C6]]) -; NO-FMA-NEXT: [[FMA7:%.*]] = call double @llvm.fma.f64(double [[A7]], double [[B7]], double [[C7]]) -; NO-FMA-NEXT: store double [[FMA0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 4 -; NO-FMA-NEXT: store double [[FMA1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 4 -; NO-FMA-NEXT: store double [[FMA2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 4 -; NO-FMA-NEXT: store double [[FMA3]], double* getelementptr inbounds ([8 
x double], [8 x double]* @dst64, i32 0, i64 3), align 4 -; NO-FMA-NEXT: store double [[FMA4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 4 -; NO-FMA-NEXT: store double [[FMA5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 4 -; NO-FMA-NEXT: store double [[FMA6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 4 -; NO-FMA-NEXT: store double [[FMA7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 4 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4 +; NO-FMA-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4 +; NO-FMA-NEXT: [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcC64 to <4 x double>*), align 4 +; NO-FMA-NEXT: [[TMP6:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <4 x double>*), align 4 +; NO-FMA-NEXT: [[TMP7:%.*]] = extractelement <4 x double> [[TMP3]], i32 0 +; NO-FMA-NEXT: [[TMP8:%.*]] = insertelement <4 x double> undef, double [[TMP7]], i32 0 +; NO-FMA-NEXT: [[TMP9:%.*]] = extractelement <4 x double> [[TMP3]], i32 1 +; NO-FMA-NEXT: [[TMP10:%.*]] = insertelement <4 x double> [[TMP8]], double [[TMP9]], i32 1 +; NO-FMA-NEXT: [[TMP11:%.*]] = extractelement <4 x double> [[TMP3]], i32 2 +; NO-FMA-NEXT: [[TMP12:%.*]] = insertelement <4 x double> [[TMP10]], double [[TMP11]], i32 2 +; NO-FMA-NEXT: [[TMP13:%.*]] = extractelement <4 x double> [[TMP3]], i32 3 +; NO-FMA-NEXT: [[TMP14:%.*]] = insertelement <4 x double> [[TMP12]], double [[TMP13]], i32 3 +; NO-FMA-NEXT: [[TMP15:%.*]] = extractelement <4 x double> [[TMP5]], i32 0 +; NO-FMA-NEXT: [[TMP16:%.*]] = insertelement <4 x double> undef, double [[TMP15]], i32 0 +; NO-FMA-NEXT: [[TMP17:%.*]] = extractelement <4 x double> [[TMP5]], i32 1 +; NO-FMA-NEXT: [[TMP18:%.*]] = insertelement <4 x double> [[TMP16]], double [[TMP17]], i32 1 +; NO-FMA-NEXT: [[TMP19:%.*]] = extractelement <4 x double> [[TMP5]], i32 2 +; NO-FMA-NEXT: [[TMP20:%.*]] = insertelement <4 x double> [[TMP18]], double [[TMP19]], i32 2 +; NO-FMA-NEXT: [[TMP21:%.*]] = extractelement <4 x double> [[TMP5]], i32 3 +; NO-FMA-NEXT: [[TMP22:%.*]] = insertelement <4 x double> [[TMP20]], double [[TMP21]], i32 3 +; NO-FMA-NEXT: [[TMP23:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP14]], <4 x double> [[TMP22]]) +; NO-FMA-NEXT: [[TMP24:%.*]] = extractelement <4 x double> [[TMP4]], i32 0 +; NO-FMA-NEXT: [[TMP25:%.*]] = insertelement <4 x double> undef, double [[TMP24]], i32 0 +; NO-FMA-NEXT: [[TMP26:%.*]] = extractelement <4 x double> [[TMP4]], i32 1 +; NO-FMA-NEXT: [[TMP27:%.*]] = insertelement <4 x double> [[TMP25]], double [[TMP26]], i32 1 +; NO-FMA-NEXT: [[TMP28:%.*]] = extractelement <4 x double> [[TMP4]], i32 2 +; NO-FMA-NEXT: [[TMP29:%.*]] = insertelement <4 x double> [[TMP27]], double [[TMP28]], i32 2 +; NO-FMA-NEXT: [[TMP30:%.*]] = extractelement <4 x double> [[TMP4]], i32 
3 +; NO-FMA-NEXT: [[TMP31:%.*]] = insertelement <4 x double> [[TMP29]], double [[TMP30]], i32 3 +; NO-FMA-NEXT: [[TMP32:%.*]] = extractelement <4 x double> [[TMP6]], i32 0 +; NO-FMA-NEXT: [[TMP33:%.*]] = insertelement <4 x double> undef, double [[TMP32]], i32 0 +; NO-FMA-NEXT: [[TMP34:%.*]] = extractelement <4 x double> [[TMP6]], i32 1 +; NO-FMA-NEXT: [[TMP35:%.*]] = insertelement <4 x double> [[TMP33]], double [[TMP34]], i32 1 +; NO-FMA-NEXT: [[TMP36:%.*]] = extractelement <4 x double> [[TMP6]], i32 2 +; NO-FMA-NEXT: [[TMP37:%.*]] = insertelement <4 x double> [[TMP35]], double [[TMP36]], i32 2 +; NO-FMA-NEXT: [[TMP38:%.*]] = extractelement <4 x double> [[TMP6]], i32 3 +; NO-FMA-NEXT: [[TMP39:%.*]] = insertelement <4 x double> [[TMP37]], double [[TMP38]], i32 3 +; NO-FMA-NEXT: [[TMP40:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP31]], <4 x double> [[TMP39]]) +; NO-FMA-NEXT: store <4 x double> [[TMP23]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; NO-FMA-NEXT: store <4 x double> [[TMP40]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4 ; NO-FMA-NEXT: ret void ; ; FMA256-LABEL: @fma_8f64( @@ -224,26 +229,27 @@ define void @fma_4f32() #0 { ; NO-FMA-LABEL: @fma_4f32( -; NO-FMA-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[B0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[B1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[B2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[B3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[C0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[C1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[C2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[C3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[FMA0:%.*]] = call float @llvm.fma.f32(float [[A0]], float [[B0]], float [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call float @llvm.fma.f32(float [[A1]], float [[B1]], float [[C1]]) -; NO-FMA-NEXT: [[FMA2:%.*]] = call float @llvm.fma.f32(float [[A2]], float [[B2]], float [[C2]]) -; NO-FMA-NEXT: [[FMA3:%.*]] = call float @llvm.fma.f32(float [[A3]], float [[B3]], float [[C3]]) -; NO-FMA-NEXT: store float [[FMA0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: store float [[FMA1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 
1), align 4 -; NO-FMA-NEXT: store float [[FMA2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: store float [[FMA3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; NO-FMA-NEXT: [[TMP5:%.*]] = insertelement <4 x float> undef, float [[TMP4]], i32 0 +; NO-FMA-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; NO-FMA-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP6]], i32 1 +; NO-FMA-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; NO-FMA-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP8]], i32 2 +; NO-FMA-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; NO-FMA-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP10]], i32 3 +; NO-FMA-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 +; NO-FMA-NEXT: [[TMP13:%.*]] = insertelement <4 x float> undef, float [[TMP12]], i32 0 +; NO-FMA-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 +; NO-FMA-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP14]], i32 1 +; NO-FMA-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 +; NO-FMA-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP16]], i32 2 +; NO-FMA-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP3]], i32 3 +; NO-FMA-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP17]], float [[TMP18]], i32 3 +; NO-FMA-NEXT: [[TMP20:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP11]], <4 x float> [[TMP19]]) +; NO-FMA-NEXT: store <4 x float> [[TMP20]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 ; NO-FMA-NEXT: ret void ; ; FMA-LABEL: @fma_4f32( @@ -279,46 +285,43 @@ define void @fma_8f32() #0 { ; NO-FMA-LABEL: @fma_8f32( -; NO-FMA-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[B0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[B1:%.*]] = load float, float* 
getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[B2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[B3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[B4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[B5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[B6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[B7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[C0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[C1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[C2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[C3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[C4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[C5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[C6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[C7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[FMA0:%.*]] = call float @llvm.fma.f32(float [[A0]], float [[B0]], float [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call float @llvm.fma.f32(float [[A1]], float [[B1]], float [[C1]]) -; NO-FMA-NEXT: [[FMA2:%.*]] = call float @llvm.fma.f32(float [[A2]], float [[B2]], float [[C2]]) -; NO-FMA-NEXT: [[FMA3:%.*]] = call float @llvm.fma.f32(float [[A3]], float [[B3]], float [[C3]]) -; NO-FMA-NEXT: [[FMA4:%.*]] = call float @llvm.fma.f32(float [[A4]], float [[B4]], float [[C4]]) -; NO-FMA-NEXT: [[FMA5:%.*]] = call float @llvm.fma.f32(float [[A5]], float [[B5]], float [[C5]]) -; NO-FMA-NEXT: [[FMA6:%.*]] = call float @llvm.fma.f32(float [[A6]], float [[B6]], float [[C6]]) -; NO-FMA-NEXT: [[FMA7:%.*]] = call float @llvm.fma.f32(float [[A7]], float [[B7]], float [[C7]]) -; NO-FMA-NEXT: store float [[FMA0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: store float [[FMA1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: store float [[FMA2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: store float [[FMA3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: store float [[FMA4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: store float [[FMA5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: store float [[FMA6]], float* getelementptr inbounds 
([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: store float [[FMA7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcC32 to <8 x float>*), align 4 +; NO-FMA-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP2]], i32 0 +; NO-FMA-NEXT: [[TMP5:%.*]] = insertelement <8 x float> undef, float [[TMP4]], i32 0 +; NO-FMA-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP2]], i32 1 +; NO-FMA-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP5]], float [[TMP6]], i32 1 +; NO-FMA-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP2]], i32 2 +; NO-FMA-NEXT: [[TMP9:%.*]] = insertelement <8 x float> [[TMP7]], float [[TMP8]], i32 2 +; NO-FMA-NEXT: [[TMP10:%.*]] = extractelement <8 x float> [[TMP2]], i32 3 +; NO-FMA-NEXT: [[TMP11:%.*]] = insertelement <8 x float> [[TMP9]], float [[TMP10]], i32 3 +; NO-FMA-NEXT: [[TMP12:%.*]] = extractelement <8 x float> [[TMP2]], i32 4 +; NO-FMA-NEXT: [[TMP13:%.*]] = insertelement <8 x float> [[TMP11]], float [[TMP12]], i32 4 +; NO-FMA-NEXT: [[TMP14:%.*]] = extractelement <8 x float> [[TMP2]], i32 5 +; NO-FMA-NEXT: [[TMP15:%.*]] = insertelement <8 x float> [[TMP13]], float [[TMP14]], i32 5 +; NO-FMA-NEXT: [[TMP16:%.*]] = extractelement <8 x float> [[TMP2]], i32 6 +; NO-FMA-NEXT: [[TMP17:%.*]] = insertelement <8 x float> [[TMP15]], float [[TMP16]], i32 6 +; NO-FMA-NEXT: [[TMP18:%.*]] = extractelement <8 x float> [[TMP2]], i32 7 +; NO-FMA-NEXT: [[TMP19:%.*]] = insertelement <8 x float> [[TMP17]], float [[TMP18]], i32 7 +; NO-FMA-NEXT: [[TMP20:%.*]] = extractelement <8 x float> [[TMP3]], i32 0 +; NO-FMA-NEXT: [[TMP21:%.*]] = insertelement <8 x float> undef, float [[TMP20]], i32 0 +; NO-FMA-NEXT: [[TMP22:%.*]] = extractelement <8 x float> [[TMP3]], i32 1 +; NO-FMA-NEXT: [[TMP23:%.*]] = insertelement <8 x float> [[TMP21]], float [[TMP22]], i32 1 +; NO-FMA-NEXT: [[TMP24:%.*]] = extractelement <8 x float> [[TMP3]], i32 2 +; NO-FMA-NEXT: [[TMP25:%.*]] = insertelement <8 x float> [[TMP23]], float [[TMP24]], i32 2 +; NO-FMA-NEXT: [[TMP26:%.*]] = extractelement <8 x float> [[TMP3]], i32 3 +; NO-FMA-NEXT: [[TMP27:%.*]] = insertelement <8 x float> [[TMP25]], float [[TMP26]], i32 3 +; NO-FMA-NEXT: [[TMP28:%.*]] = extractelement <8 x float> [[TMP3]], i32 4 +; NO-FMA-NEXT: [[TMP29:%.*]] = insertelement <8 x float> [[TMP27]], float [[TMP28]], i32 4 +; NO-FMA-NEXT: [[TMP30:%.*]] = extractelement <8 x float> [[TMP3]], i32 5 +; NO-FMA-NEXT: [[TMP31:%.*]] = insertelement <8 x float> [[TMP29]], float [[TMP30]], i32 5 +; NO-FMA-NEXT: [[TMP32:%.*]] = extractelement <8 x float> [[TMP3]], i32 6 +; NO-FMA-NEXT: [[TMP33:%.*]] = insertelement <8 x float> [[TMP31]], float [[TMP32]], i32 6 +; NO-FMA-NEXT: [[TMP34:%.*]] = extractelement <8 x float> [[TMP3]], i32 7 +; NO-FMA-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP33]], float [[TMP34]], i32 7 +; NO-FMA-NEXT: [[TMP36:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP19]], <8 x float> [[TMP35]]) +; NO-FMA-NEXT: store <8 x float> [[TMP36]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 ; NO-FMA-NEXT: ret void ; ; FMA-LABEL: @fma_8f32( @@ -374,86 +377,80 @@ define void @fma_16f32() #0 { ; 
NO-FMA-LABEL: @fma_16f32( -; NO-FMA-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[A8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8), align 4 -; NO-FMA-NEXT: [[A9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 9), align 4 -; NO-FMA-NEXT: [[A10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 10), align 4 -; NO-FMA-NEXT: [[A11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 11), align 4 -; NO-FMA-NEXT: [[A12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12), align 4 -; NO-FMA-NEXT: [[A13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 13), align 4 -; NO-FMA-NEXT: [[A14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 14), align 4 -; NO-FMA-NEXT: [[A15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 15), align 4 -; NO-FMA-NEXT: [[B0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[B1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[B2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[B3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[B4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[B5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[B6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[B7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[B8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8), align 4 -; NO-FMA-NEXT: [[B9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 9), align 4 -; NO-FMA-NEXT: [[B10:%.*]] = load float, float* getelementptr inbounds ([16 x 
float], [16 x float]* @srcB32, i32 0, i64 10), align 4 -; NO-FMA-NEXT: [[B11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 11), align 4 -; NO-FMA-NEXT: [[B12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12), align 4 -; NO-FMA-NEXT: [[B13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 13), align 4 -; NO-FMA-NEXT: [[B14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 14), align 4 -; NO-FMA-NEXT: [[B15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 15), align 4 -; NO-FMA-NEXT: [[C0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[C1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[C2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[C3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[C4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[C5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[C6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[C7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[C8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8), align 4 -; NO-FMA-NEXT: [[C9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 9), align 4 -; NO-FMA-NEXT: [[C10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 10), align 4 -; NO-FMA-NEXT: [[C11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 11), align 4 -; NO-FMA-NEXT: [[C12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 12), align 4 -; NO-FMA-NEXT: [[C13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 13), align 4 -; NO-FMA-NEXT: [[C14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 14), align 4 -; NO-FMA-NEXT: [[C15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 15), align 4 -; NO-FMA-NEXT: [[FMA0:%.*]] = call float @llvm.fma.f32(float [[A0]], float [[B0]], float [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call float @llvm.fma.f32(float [[A1]], float [[B1]], float [[C1]]) -; NO-FMA-NEXT: [[FMA2:%.*]] = call float @llvm.fma.f32(float [[A2]], float [[B2]], float [[C2]]) -; NO-FMA-NEXT: [[FMA3:%.*]] = call float @llvm.fma.f32(float [[A3]], float [[B3]], float [[C3]]) -; NO-FMA-NEXT: [[FMA4:%.*]] = call float @llvm.fma.f32(float [[A4]], float [[B4]], float [[C4]]) -; NO-FMA-NEXT: [[FMA5:%.*]] = call float @llvm.fma.f32(float [[A5]], float [[B5]], float [[C5]]) -; NO-FMA-NEXT: [[FMA6:%.*]] = call float @llvm.fma.f32(float [[A6]], float [[B6]], float [[C6]]) -; NO-FMA-NEXT: 
[[FMA7:%.*]] = call float @llvm.fma.f32(float [[A7]], float [[B7]], float [[C7]]) -; NO-FMA-NEXT: [[FMA8:%.*]] = call float @llvm.fma.f32(float [[A8]], float [[B8]], float [[C8]]) -; NO-FMA-NEXT: [[FMA9:%.*]] = call float @llvm.fma.f32(float [[A9]], float [[B9]], float [[C9]]) -; NO-FMA-NEXT: [[FMA10:%.*]] = call float @llvm.fma.f32(float [[A10]], float [[B10]], float [[C10]]) -; NO-FMA-NEXT: [[FMA11:%.*]] = call float @llvm.fma.f32(float [[A11]], float [[B11]], float [[C11]]) -; NO-FMA-NEXT: [[FMA12:%.*]] = call float @llvm.fma.f32(float [[A12]], float [[B12]], float [[C12]]) -; NO-FMA-NEXT: [[FMA13:%.*]] = call float @llvm.fma.f32(float [[A13]], float [[B13]], float [[C13]]) -; NO-FMA-NEXT: [[FMA14:%.*]] = call float @llvm.fma.f32(float [[A14]], float [[B14]], float [[C14]]) -; NO-FMA-NEXT: [[FMA15:%.*]] = call float @llvm.fma.f32(float [[A15]], float [[B15]], float [[C15]]) -; NO-FMA-NEXT: store float [[FMA0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: store float [[FMA1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: store float [[FMA2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: store float [[FMA3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: store float [[FMA4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: store float [[FMA5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: store float [[FMA6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: store float [[FMA7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: store float [[FMA8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; NO-FMA-NEXT: store float [[FMA9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; NO-FMA-NEXT: store float [[FMA10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; NO-FMA-NEXT: store float [[FMA11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; NO-FMA-NEXT: store float [[FMA12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; NO-FMA-NEXT: store float [[FMA13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; NO-FMA-NEXT: store float [[FMA14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; NO-FMA-NEXT: store float [[FMA15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4 +; NO-FMA-NEXT: [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4 +; 
NO-FMA-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcC32 to <8 x float>*), align 4 +; NO-FMA-NEXT: [[TMP6:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <8 x float>*), align 4 +; NO-FMA-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP3]], i32 0 +; NO-FMA-NEXT: [[TMP8:%.*]] = insertelement <8 x float> undef, float [[TMP7]], i32 0 +; NO-FMA-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[TMP3]], i32 1 +; NO-FMA-NEXT: [[TMP10:%.*]] = insertelement <8 x float> [[TMP8]], float [[TMP9]], i32 1 +; NO-FMA-NEXT: [[TMP11:%.*]] = extractelement <8 x float> [[TMP3]], i32 2 +; NO-FMA-NEXT: [[TMP12:%.*]] = insertelement <8 x float> [[TMP10]], float [[TMP11]], i32 2 +; NO-FMA-NEXT: [[TMP13:%.*]] = extractelement <8 x float> [[TMP3]], i32 3 +; NO-FMA-NEXT: [[TMP14:%.*]] = insertelement <8 x float> [[TMP12]], float [[TMP13]], i32 3 +; NO-FMA-NEXT: [[TMP15:%.*]] = extractelement <8 x float> [[TMP3]], i32 4 +; NO-FMA-NEXT: [[TMP16:%.*]] = insertelement <8 x float> [[TMP14]], float [[TMP15]], i32 4 +; NO-FMA-NEXT: [[TMP17:%.*]] = extractelement <8 x float> [[TMP3]], i32 5 +; NO-FMA-NEXT: [[TMP18:%.*]] = insertelement <8 x float> [[TMP16]], float [[TMP17]], i32 5 +; NO-FMA-NEXT: [[TMP19:%.*]] = extractelement <8 x float> [[TMP3]], i32 6 +; NO-FMA-NEXT: [[TMP20:%.*]] = insertelement <8 x float> [[TMP18]], float [[TMP19]], i32 6 +; NO-FMA-NEXT: [[TMP21:%.*]] = extractelement <8 x float> [[TMP3]], i32 7 +; NO-FMA-NEXT: [[TMP22:%.*]] = insertelement <8 x float> [[TMP20]], float [[TMP21]], i32 7 +; NO-FMA-NEXT: [[TMP23:%.*]] = extractelement <8 x float> [[TMP5]], i32 0 +; NO-FMA-NEXT: [[TMP24:%.*]] = insertelement <8 x float> undef, float [[TMP23]], i32 0 +; NO-FMA-NEXT: [[TMP25:%.*]] = extractelement <8 x float> [[TMP5]], i32 1 +; NO-FMA-NEXT: [[TMP26:%.*]] = insertelement <8 x float> [[TMP24]], float [[TMP25]], i32 1 +; NO-FMA-NEXT: [[TMP27:%.*]] = extractelement <8 x float> [[TMP5]], i32 2 +; NO-FMA-NEXT: [[TMP28:%.*]] = insertelement <8 x float> [[TMP26]], float [[TMP27]], i32 2 +; NO-FMA-NEXT: [[TMP29:%.*]] = extractelement <8 x float> [[TMP5]], i32 3 +; NO-FMA-NEXT: [[TMP30:%.*]] = insertelement <8 x float> [[TMP28]], float [[TMP29]], i32 3 +; NO-FMA-NEXT: [[TMP31:%.*]] = extractelement <8 x float> [[TMP5]], i32 4 +; NO-FMA-NEXT: [[TMP32:%.*]] = insertelement <8 x float> [[TMP30]], float [[TMP31]], i32 4 +; NO-FMA-NEXT: [[TMP33:%.*]] = extractelement <8 x float> [[TMP5]], i32 5 +; NO-FMA-NEXT: [[TMP34:%.*]] = insertelement <8 x float> [[TMP32]], float [[TMP33]], i32 5 +; NO-FMA-NEXT: [[TMP35:%.*]] = extractelement <8 x float> [[TMP5]], i32 6 +; NO-FMA-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP35]], i32 6 +; NO-FMA-NEXT: [[TMP37:%.*]] = extractelement <8 x float> [[TMP5]], i32 7 +; NO-FMA-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP37]], i32 7 +; NO-FMA-NEXT: [[TMP39:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP22]], <8 x float> [[TMP38]]) +; NO-FMA-NEXT: [[TMP40:%.*]] = extractelement <8 x float> [[TMP4]], i32 0 +; NO-FMA-NEXT: [[TMP41:%.*]] = insertelement <8 x float> undef, float [[TMP40]], i32 0 +; NO-FMA-NEXT: [[TMP42:%.*]] = extractelement <8 x float> [[TMP4]], i32 1 +; NO-FMA-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP42]], i32 1 +; NO-FMA-NEXT: [[TMP44:%.*]] = extractelement <8 x float> [[TMP4]], i32 2 +; NO-FMA-NEXT: [[TMP45:%.*]] = insertelement <8 x float> 
[[TMP43]], float [[TMP44]], i32 2 +; NO-FMA-NEXT: [[TMP46:%.*]] = extractelement <8 x float> [[TMP4]], i32 3 +; NO-FMA-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP46]], i32 3 +; NO-FMA-NEXT: [[TMP48:%.*]] = extractelement <8 x float> [[TMP4]], i32 4 +; NO-FMA-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP48]], i32 4 +; NO-FMA-NEXT: [[TMP50:%.*]] = extractelement <8 x float> [[TMP4]], i32 5 +; NO-FMA-NEXT: [[TMP51:%.*]] = insertelement <8 x float> [[TMP49]], float [[TMP50]], i32 5 +; NO-FMA-NEXT: [[TMP52:%.*]] = extractelement <8 x float> [[TMP4]], i32 6 +; NO-FMA-NEXT: [[TMP53:%.*]] = insertelement <8 x float> [[TMP51]], float [[TMP52]], i32 6 +; NO-FMA-NEXT: [[TMP54:%.*]] = extractelement <8 x float> [[TMP4]], i32 7 +; NO-FMA-NEXT: [[TMP55:%.*]] = insertelement <8 x float> [[TMP53]], float [[TMP54]], i32 7 +; NO-FMA-NEXT: [[TMP56:%.*]] = extractelement <8 x float> [[TMP6]], i32 0 +; NO-FMA-NEXT: [[TMP57:%.*]] = insertelement <8 x float> undef, float [[TMP56]], i32 0 +; NO-FMA-NEXT: [[TMP58:%.*]] = extractelement <8 x float> [[TMP6]], i32 1 +; NO-FMA-NEXT: [[TMP59:%.*]] = insertelement <8 x float> [[TMP57]], float [[TMP58]], i32 1 +; NO-FMA-NEXT: [[TMP60:%.*]] = extractelement <8 x float> [[TMP6]], i32 2 +; NO-FMA-NEXT: [[TMP61:%.*]] = insertelement <8 x float> [[TMP59]], float [[TMP60]], i32 2 +; NO-FMA-NEXT: [[TMP62:%.*]] = extractelement <8 x float> [[TMP6]], i32 3 +; NO-FMA-NEXT: [[TMP63:%.*]] = insertelement <8 x float> [[TMP61]], float [[TMP62]], i32 3 +; NO-FMA-NEXT: [[TMP64:%.*]] = extractelement <8 x float> [[TMP6]], i32 4 +; NO-FMA-NEXT: [[TMP65:%.*]] = insertelement <8 x float> [[TMP63]], float [[TMP64]], i32 4 +; NO-FMA-NEXT: [[TMP66:%.*]] = extractelement <8 x float> [[TMP6]], i32 5 +; NO-FMA-NEXT: [[TMP67:%.*]] = insertelement <8 x float> [[TMP65]], float [[TMP66]], i32 5 +; NO-FMA-NEXT: [[TMP68:%.*]] = extractelement <8 x float> [[TMP6]], i32 6 +; NO-FMA-NEXT: [[TMP69:%.*]] = insertelement <8 x float> [[TMP67]], float [[TMP68]], i32 6 +; NO-FMA-NEXT: [[TMP70:%.*]] = extractelement <8 x float> [[TMP6]], i32 7 +; NO-FMA-NEXT: [[TMP71:%.*]] = insertelement <8 x float> [[TMP69]], float [[TMP70]], i32 7 +; NO-FMA-NEXT: [[TMP72:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP55]], <8 x float> [[TMP71]]) +; NO-FMA-NEXT: store <8 x float> [[TMP39]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; NO-FMA-NEXT: store <8 x float> [[TMP72]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; NO-FMA-NEXT: ret void ; ; FMA256-LABEL: @fma_16f32( Index: test/Transforms/SLPVectorizer/X86/fptosi.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/fptosi.ll +++ test/Transforms/SLPVectorizer/X86/fptosi.ll @@ -37,14 +37,18 @@ ; SSE-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i64 ; SSE-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i64 ; SSE-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i64 -; SSE-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store 
i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CVT1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[CVT2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[CVT3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> undef, i64 [[CVT4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[CVT5]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> undef, i64 [[CVT6]], i32 0 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[CVT7]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; AVX256NODQ-LABEL: @fptosi_8f64_8i64( @@ -64,14 +68,16 @@ ; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i64 ; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i64 ; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i64 -; AVX256NODQ-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[CVT1]], i32 1 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[CVT2]], i32 2 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[CVT3]], i32 3 +; AVX256NODQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256NODQ-NEXT: [[TMP5:%.*]] = insertelement <4 x i64> undef, i64 [[CVT4]], i32 0 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> [[TMP5]], i64 [[CVT5]], i32 1 +; AVX256NODQ-NEXT: 
[[TMP7:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[CVT6]], i32 2 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP7]], i64 [[CVT7]], i32 3 +; AVX256NODQ-NEXT: store <4 x i64> [[TMP8]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256NODQ-NEXT: ret void ; ; AVX512-LABEL: @fptosi_8f64_8i64( @@ -161,30 +167,31 @@ define void @fptosi_8f64_8i16() #0 { ; SSE-LABEL: @fptosi_8f64_8i16( -; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE-NEXT: [[CVT0:%.*]] = fptosi double [[A0]] to i16 -; SSE-NEXT: [[CVT1:%.*]] = fptosi double [[A1]] to i16 -; SSE-NEXT: [[CVT2:%.*]] = fptosi double [[A2]] to i16 -; SSE-NEXT: [[CVT3:%.*]] = fptosi double [[A3]] to i16 -; SSE-NEXT: [[CVT4:%.*]] = fptosi double [[A4]] to i16 -; SSE-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i16 -; SSE-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i16 -; SSE-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i16 -; SSE-NEXT: store i16 [[CVT0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2 -; SSE-NEXT: store i16 [[CVT1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2 -; SSE-NEXT: store i16 [[CVT2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2 -; SSE-NEXT: store i16 [[CVT3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2 -; SSE-NEXT: store i16 [[CVT4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2 -; SSE-NEXT: store i16 [[CVT5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2 -; SSE-NEXT: store i16 [[CVT6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2 -; SSE-NEXT: store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 
8 +; SSE-NEXT: [[TMP5:%.*]] = fptosi <2 x double> [[TMP1]] to <2 x i16> +; SSE-NEXT: [[TMP6:%.*]] = fptosi <2 x double> [[TMP2]] to <2 x i16> +; SSE-NEXT: [[TMP7:%.*]] = fptosi <2 x double> [[TMP3]] to <2 x i16> +; SSE-NEXT: [[TMP8:%.*]] = fptosi <2 x double> [[TMP4]] to <2 x i16> +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <8 x i16> undef, i16 [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <8 x i16> [[TMP10]], i16 [[TMP11]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <2 x i16> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <8 x i16> [[TMP12]], i16 [[TMP13]], i32 2 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP6]], i32 1 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[TMP15]], i32 3 +; SSE-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <8 x i16> [[TMP16]], i16 [[TMP17]], i32 4 +; SSE-NEXT: [[TMP19:%.*]] = extractelement <2 x i16> [[TMP7]], i32 1 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <8 x i16> [[TMP18]], i16 [[TMP19]], i32 5 +; SSE-NEXT: [[TMP21:%.*]] = extractelement <2 x i16> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <8 x i16> [[TMP20]], i16 [[TMP21]], i32 6 +; SSE-NEXT: [[TMP23:%.*]] = extractelement <2 x i16> [[TMP8]], i32 1 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <8 x i16> [[TMP22]], i16 [[TMP23]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP24]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @fptosi_8f64_8i16( @@ -297,14 +304,18 @@ ; SSE-NEXT: [[CVT5:%.*]] = fptosi float [[A5]] to i64 ; SSE-NEXT: [[CVT6:%.*]] = fptosi float [[A6]] to i64 ; SSE-NEXT: [[CVT7:%.*]] = fptosi float [[A7]] to i64 -; SSE-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CVT1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[CVT2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[CVT3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> undef, i64 [[CVT4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[CVT5]], i32 1 +; SSE-NEXT: store <2 x i64> 
[[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> undef, i64 [[CVT6]], i32 0 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[CVT7]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; AVX256NODQ-LABEL: @fptosi_8f32_8i64( @@ -324,14 +335,16 @@ ; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptosi float [[A5]] to i64 ; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptosi float [[A6]] to i64 ; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptosi float [[A7]] to i64 -; AVX256NODQ-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[CVT1]], i32 1 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[CVT2]], i32 2 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[CVT3]], i32 3 +; AVX256NODQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256NODQ-NEXT: [[TMP5:%.*]] = insertelement <4 x i64> undef, i64 [[CVT4]], i32 0 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> [[TMP5]], i64 [[CVT5]], i32 1 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[CVT6]], i32 2 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP7]], i64 [[CVT7]], i32 3 +; AVX256NODQ-NEXT: store <4 x i64> [[TMP8]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256NODQ-NEXT: ret void ; ; AVX512-LABEL: @fptosi_8f32_8i64( Index: test/Transforms/SLPVectorizer/X86/fptoui.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/fptoui.ll +++ test/Transforms/SLPVectorizer/X86/fptoui.ll @@ -37,14 +37,18 @@ ; SSE-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i64 ; SSE-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i64 ; SSE-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i64 -; SSE-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x 
i64]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CVT1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[CVT2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[CVT3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> undef, i64 [[CVT4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[CVT5]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> undef, i64 [[CVT6]], i32 0 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[CVT7]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; AVX256NODQ-LABEL: @fptoui_8f64_8i64( @@ -64,14 +68,16 @@ ; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i64 ; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i64 ; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i64 -; AVX256NODQ-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[CVT1]], i32 1 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[CVT2]], i32 2 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[CVT3]], i32 3 +; AVX256NODQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256NODQ-NEXT: [[TMP5:%.*]] = insertelement <4 x i64> undef, i64 [[CVT4]], i32 0 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = insertelement <4 x 
i64> [[TMP5]], i64 [[CVT5]], i32 1 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[CVT6]], i32 2 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP7]], i64 [[CVT7]], i32 3 +; AVX256NODQ-NEXT: store <4 x i64> [[TMP8]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256NODQ-NEXT: ret void ; ; AVX512-LABEL: @fptoui_8f64_8i64( @@ -134,41 +140,44 @@ ; SSE-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i32 ; SSE-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i32 ; SSE-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i32 -; SSE-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4 -; SSE-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4 -; SSE-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4 -; SSE-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4 -; SSE-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CVT1]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CVT2]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CVT3]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[CVT4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[CVT5]], i32 1 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CVT6]], i32 2 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CVT7]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; AVX256NODQ-LABEL: @fptoui_8f64_8i32( -; AVX256NODQ-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: [[A7:%.*]] = load double, double* 
getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i32 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i32 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i32 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i32 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i32 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i32 -; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i32 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i32 -; AVX256NODQ-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; AVX256NODQ-NEXT: [[TMP5:%.*]] = fptoui <2 x double> [[TMP1]] to <2 x i32> +; AVX256NODQ-NEXT: [[TMP6:%.*]] = fptoui <2 x double> [[TMP2]] to <2 x i32> +; AVX256NODQ-NEXT: [[TMP7:%.*]] = fptoui <2 x double> [[TMP3]] to <2 x i32> +; AVX256NODQ-NEXT: [[TMP8:%.*]] = fptoui <2 x double> [[TMP4]] to <2 x i32> +; AVX256NODQ-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 +; AVX256NODQ-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> undef, i32 [[TMP9]], i32 0 +; AVX256NODQ-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 +; AVX256NODQ-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[TMP11]], i32 1 +; AVX256NODQ-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0 +; AVX256NODQ-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[TMP13]], i32 2 +; AVX256NODQ-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1 +; AVX256NODQ-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[TMP15]], i32 3 +; AVX256NODQ-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP7]], i32 0 +; AVX256NODQ-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP17]], i32 4 +; AVX256NODQ-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1 +; AVX256NODQ-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> 
[[TMP18]], i32 [[TMP19]], i32 5 +; AVX256NODQ-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP8]], i32 0 +; AVX256NODQ-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP21]], i32 6 +; AVX256NODQ-NEXT: [[TMP23:%.*]] = extractelement <2 x i32> [[TMP8]], i32 1 +; AVX256NODQ-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP23]], i32 7 +; AVX256NODQ-NEXT: store <8 x i32> [[TMP24]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4 ; AVX256NODQ-NEXT: ret void ; ; AVX512-LABEL: @fptoui_8f64_8i32( @@ -212,57 +221,59 @@ define void @fptoui_8f64_8i16() #0 { ; SSE-LABEL: @fptoui_8f64_8i16( -; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i16 -; SSE-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i16 -; SSE-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i16 -; SSE-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i16 -; SSE-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i16 -; SSE-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i16 -; SSE-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i16 -; SSE-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i16 -; SSE-NEXT: store i16 [[CVT0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2 -; SSE-NEXT: store i16 [[CVT1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2 -; SSE-NEXT: store i16 [[CVT2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2 -; SSE-NEXT: store i16 [[CVT3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2 -; SSE-NEXT: store i16 [[CVT4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2 -; SSE-NEXT: store i16 [[CVT5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2 -; SSE-NEXT: store i16 [[CVT6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2 -; SSE-NEXT: store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE-NEXT: 
[[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = fptoui <2 x double> [[TMP1]] to <2 x i16> +; SSE-NEXT: [[TMP6:%.*]] = fptoui <2 x double> [[TMP2]] to <2 x i16> +; SSE-NEXT: [[TMP7:%.*]] = fptoui <2 x double> [[TMP3]] to <2 x i16> +; SSE-NEXT: [[TMP8:%.*]] = fptoui <2 x double> [[TMP4]] to <2 x i16> +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <8 x i16> undef, i16 [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <8 x i16> [[TMP10]], i16 [[TMP11]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <2 x i16> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <8 x i16> [[TMP12]], i16 [[TMP13]], i32 2 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP6]], i32 1 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[TMP15]], i32 3 +; SSE-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <8 x i16> [[TMP16]], i16 [[TMP17]], i32 4 +; SSE-NEXT: [[TMP19:%.*]] = extractelement <2 x i16> [[TMP7]], i32 1 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <8 x i16> [[TMP18]], i16 [[TMP19]], i32 5 +; SSE-NEXT: [[TMP21:%.*]] = extractelement <2 x i16> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <8 x i16> [[TMP20]], i16 [[TMP21]], i32 6 +; SSE-NEXT: [[TMP23:%.*]] = extractelement <2 x i16> [[TMP8]], i32 1 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <8 x i16> [[TMP22]], i16 [[TMP23]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP24]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX256NODQ-LABEL: @fptoui_8f64_8i16( -; AVX256NODQ-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i16 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i16 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i16 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i16 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i16 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i16 -; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i16 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i16 -; AVX256NODQ-NEXT: store i16 [[CVT0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, 
i64 0), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; AVX256NODQ-NEXT: [[TMP5:%.*]] = fptoui <2 x double> [[TMP1]] to <2 x i16> +; AVX256NODQ-NEXT: [[TMP6:%.*]] = fptoui <2 x double> [[TMP2]] to <2 x i16> +; AVX256NODQ-NEXT: [[TMP7:%.*]] = fptoui <2 x double> [[TMP3]] to <2 x i16> +; AVX256NODQ-NEXT: [[TMP8:%.*]] = fptoui <2 x double> [[TMP4]] to <2 x i16> +; AVX256NODQ-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i32 0 +; AVX256NODQ-NEXT: [[TMP10:%.*]] = insertelement <8 x i16> undef, i16 [[TMP9]], i32 0 +; AVX256NODQ-NEXT: [[TMP11:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1 +; AVX256NODQ-NEXT: [[TMP12:%.*]] = insertelement <8 x i16> [[TMP10]], i16 [[TMP11]], i32 1 +; AVX256NODQ-NEXT: [[TMP13:%.*]] = extractelement <2 x i16> [[TMP6]], i32 0 +; AVX256NODQ-NEXT: [[TMP14:%.*]] = insertelement <8 x i16> [[TMP12]], i16 [[TMP13]], i32 2 +; AVX256NODQ-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP6]], i32 1 +; AVX256NODQ-NEXT: [[TMP16:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[TMP15]], i32 3 +; AVX256NODQ-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP7]], i32 0 +; AVX256NODQ-NEXT: [[TMP18:%.*]] = insertelement <8 x i16> [[TMP16]], i16 [[TMP17]], i32 4 +; AVX256NODQ-NEXT: [[TMP19:%.*]] = extractelement <2 x i16> [[TMP7]], i32 1 +; AVX256NODQ-NEXT: [[TMP20:%.*]] = insertelement <8 x i16> [[TMP18]], i16 [[TMP19]], i32 5 +; AVX256NODQ-NEXT: [[TMP21:%.*]] = extractelement <2 x i16> [[TMP8]], i32 0 +; AVX256NODQ-NEXT: [[TMP22:%.*]] = insertelement <8 x i16> [[TMP20]], i16 [[TMP21]], i32 6 +; AVX256NODQ-NEXT: [[TMP23:%.*]] = extractelement <2 x i16> [[TMP8]], i32 1 +; AVX256NODQ-NEXT: [[TMP24:%.*]] = insertelement <8 x i16> [[TMP22]], i16 [[TMP23]], i32 7 +; AVX256NODQ-NEXT: store <8 x i16> [[TMP24]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2 ; AVX256NODQ-NEXT: ret void ; ; AVX512-LABEL: @fptoui_8f64_8i16( @@ -381,14 +392,18 @@ ; SSE-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i64 ; SSE-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i64 ; SSE-NEXT: [[CVT7:%.*]] = 
fptoui float [[A7]] to i64 -; SSE-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CVT1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[CVT2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[CVT3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> undef, i64 [[CVT4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[CVT5]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> undef, i64 [[CVT6]], i32 0 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[CVT7]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; AVX256NODQ-LABEL: @fptoui_8f32_8i64( @@ -408,14 +423,16 @@ ; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i64 ; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i64 ; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i64 -; AVX256NODQ-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> 
[[TMP1]], i64 [[CVT1]], i32 1 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[CVT2]], i32 2 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[CVT3]], i32 3 +; AVX256NODQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256NODQ-NEXT: [[TMP5:%.*]] = insertelement <4 x i64> undef, i64 [[CVT4]], i32 0 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> [[TMP5]], i64 [[CVT5]], i32 1 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[CVT6]], i32 2 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP7]], i64 [[CVT7]], i32 3 +; AVX256NODQ-NEXT: store <4 x i64> [[TMP8]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256NODQ-NEXT: ret void ; ; AVX512-LABEL: @fptoui_8f32_8i64( @@ -478,14 +495,16 @@ ; SSE-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i32 ; SSE-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i32 ; SSE-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i32 -; SSE-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4 -; SSE-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4 -; SSE-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4 -; SSE-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4 -; SSE-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CVT1]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CVT2]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CVT3]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[CVT4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[CVT5]], i32 1 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CVT6]], i32 2 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CVT7]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; AVX256NODQ-LABEL: @fptoui_8f32_8i32( @@ -505,14 +524,15 @@ ; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i32 ; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i32 ; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i32 -; AVX256NODQ-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 
2), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CVT1]], i32 1 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CVT2]], i32 2 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CVT3]], i32 3 +; AVX256NODQ-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CVT4]], i32 4 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CVT5]], i32 5 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CVT6]], i32 6 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CVT7]], i32 7 +; AVX256NODQ-NEXT: store <8 x i32> [[TMP8]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4 ; AVX256NODQ-NEXT: ret void ; ; AVX512-LABEL: @fptoui_8f32_8i32( @@ -572,14 +592,15 @@ ; SSE-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i16 ; SSE-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i16 ; SSE-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i16 -; SSE-NEXT: store i16 [[CVT0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2 -; SSE-NEXT: store i16 [[CVT1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2 -; SSE-NEXT: store i16 [[CVT2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2 -; SSE-NEXT: store i16 [[CVT3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2 -; SSE-NEXT: store i16 [[CVT4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2 -; SSE-NEXT: store i16 [[CVT5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2 -; SSE-NEXT: store i16 [[CVT6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2 -; SSE-NEXT: store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> undef, i16 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[CVT1]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[CVT2]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <8 x i16> [[TMP3]], i16 [[CVT3]], i32 3 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <8 x i16> [[TMP4]], i16 [[CVT4]], i32 4 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <8 x i16> [[TMP5]], i16 [[CVT5]], i32 5 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <8 x i16> [[TMP6]], i16 [[CVT6]], i32 6 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <8 x i16> [[TMP7]], i16 [[CVT7]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP8]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @fptoui_8f32_8i16( Index: test/Transforms/SLPVectorizer/X86/fround.ll 
=================================================================== --- test/Transforms/SLPVectorizer/X86/fround.ll +++ test/Transforms/SLPVectorizer/X86/fround.ll @@ -31,8 +31,9 @@ ; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 ; SSE2-NEXT: [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[LD0]]) ; SSE2-NEXT: [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[LD1]]) -; SSE2-NEXT: store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CEIL0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CEIL1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @ceil_2f64( @@ -66,10 +67,12 @@ ; SSE2-NEXT: [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[LD1]]) ; SSE2-NEXT: [[CEIL2:%.*]] = call double @llvm.ceil.f64(double [[LD2]]) ; SSE2-NEXT: [[CEIL3:%.*]] = call double @llvm.ceil.f64(double [[LD3]]) -; SSE2-NEXT: store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[CEIL2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[CEIL3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CEIL0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CEIL1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CEIL2]], i32 0 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CEIL3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @ceil_4f64( @@ -120,14 +123,18 @@ ; SSE2-NEXT: [[CEIL5:%.*]] = call double @llvm.ceil.f64(double [[LD5]]) ; SSE2-NEXT: [[CEIL6:%.*]] = call double @llvm.ceil.f64(double [[LD6]]) ; SSE2-NEXT: [[CEIL7:%.*]] = call double @llvm.ceil.f64(double [[LD7]]) -; SSE2-NEXT: store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[CEIL2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[CEIL3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: store double [[CEIL4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 -; SSE2-NEXT: store double [[CEIL5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE2-NEXT: store double [[CEIL6]], 
double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 -; SSE2-NEXT: store double [[CEIL7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CEIL0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CEIL1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CEIL2]], i32 0 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CEIL3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[CEIL4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[CEIL5]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[CEIL6]], i32 0 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[CEIL7]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @ceil_8f64( @@ -202,8 +209,9 @@ ; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 ; SSE2-NEXT: [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[LD0]]) ; SSE2-NEXT: [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[LD1]]) -; SSE2-NEXT: store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[FLOOR0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[FLOOR1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @floor_2f64( @@ -237,10 +245,12 @@ ; SSE2-NEXT: [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[LD1]]) ; SSE2-NEXT: [[FLOOR2:%.*]] = call double @llvm.floor.f64(double [[LD2]]) ; SSE2-NEXT: [[FLOOR3:%.*]] = call double @llvm.floor.f64(double [[LD3]]) -; SSE2-NEXT: store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[FLOOR2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[FLOOR3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[FLOOR0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[FLOOR1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to 
<2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[FLOOR2]], i32 0 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[FLOOR3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @floor_4f64( @@ -291,14 +301,18 @@ ; SSE2-NEXT: [[FLOOR5:%.*]] = call double @llvm.floor.f64(double [[LD5]]) ; SSE2-NEXT: [[FLOOR6:%.*]] = call double @llvm.floor.f64(double [[LD6]]) ; SSE2-NEXT: [[FLOOR7:%.*]] = call double @llvm.floor.f64(double [[LD7]]) -; SSE2-NEXT: store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[FLOOR2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[FLOOR3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: store double [[FLOOR4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 -; SSE2-NEXT: store double [[FLOOR5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE2-NEXT: store double [[FLOOR6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 -; SSE2-NEXT: store double [[FLOOR7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[FLOOR0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[FLOOR1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[FLOOR2]], i32 0 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[FLOOR3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[FLOOR4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[FLOOR5]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[FLOOR6]], i32 0 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[FLOOR7]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @floor_8f64( @@ -373,8 +387,9 @@ ; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 ; SSE2-NEXT: [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[LD0]]) ; SSE2-NEXT: [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[LD1]]) -; SSE2-NEXT: store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, 
i64 0), align 8 -; SSE2-NEXT: store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[NEARBYINT0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[NEARBYINT1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @nearbyint_2f64( @@ -408,10 +423,12 @@ ; SSE2-NEXT: [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[LD1]]) ; SSE2-NEXT: [[NEARBYINT2:%.*]] = call double @llvm.nearbyint.f64(double [[LD2]]) ; SSE2-NEXT: [[NEARBYINT3:%.*]] = call double @llvm.nearbyint.f64(double [[LD3]]) -; SSE2-NEXT: store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[NEARBYINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[NEARBYINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[NEARBYINT0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[NEARBYINT1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[NEARBYINT2]], i32 0 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[NEARBYINT3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @nearbyint_4f64( @@ -462,14 +479,18 @@ ; SSE2-NEXT: [[NEARBYINT5:%.*]] = call double @llvm.nearbyint.f64(double [[LD5]]) ; SSE2-NEXT: [[NEARBYINT6:%.*]] = call double @llvm.nearbyint.f64(double [[LD6]]) ; SSE2-NEXT: [[NEARBYINT7:%.*]] = call double @llvm.nearbyint.f64(double [[LD7]]) -; SSE2-NEXT: store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[NEARBYINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[NEARBYINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: store double [[NEARBYINT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 -; SSE2-NEXT: store double [[NEARBYINT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE2-NEXT: store double [[NEARBYINT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 -; SSE2-NEXT: store double [[NEARBYINT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[NEARBYINT0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[NEARBYINT1]], i32 
1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[NEARBYINT2]], i32 0 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[NEARBYINT3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[NEARBYINT4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[NEARBYINT5]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[NEARBYINT6]], i32 0 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[NEARBYINT7]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @nearbyint_8f64( @@ -544,8 +565,9 @@ ; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 ; SSE2-NEXT: [[RINT0:%.*]] = call double @llvm.rint.f64(double [[LD0]]) ; SSE2-NEXT: [[RINT1:%.*]] = call double @llvm.rint.f64(double [[LD1]]) -; SSE2-NEXT: store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[RINT0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[RINT1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @rint_2f64( @@ -579,10 +601,12 @@ ; SSE2-NEXT: [[RINT1:%.*]] = call double @llvm.rint.f64(double [[LD1]]) ; SSE2-NEXT: [[RINT2:%.*]] = call double @llvm.rint.f64(double [[LD2]]) ; SSE2-NEXT: [[RINT3:%.*]] = call double @llvm.rint.f64(double [[LD3]]) -; SSE2-NEXT: store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[RINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[RINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[RINT0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[RINT1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[RINT2]], i32 0 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[RINT3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; 
SSE2-NEXT: ret void ; ; SSE41-LABEL: @rint_4f64( @@ -633,14 +657,18 @@ ; SSE2-NEXT: [[RINT5:%.*]] = call double @llvm.rint.f64(double [[LD5]]) ; SSE2-NEXT: [[RINT6:%.*]] = call double @llvm.rint.f64(double [[LD6]]) ; SSE2-NEXT: [[RINT7:%.*]] = call double @llvm.rint.f64(double [[LD7]]) -; SSE2-NEXT: store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[RINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[RINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: store double [[RINT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 -; SSE2-NEXT: store double [[RINT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE2-NEXT: store double [[RINT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 -; SSE2-NEXT: store double [[RINT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[RINT0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[RINT1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[RINT2]], i32 0 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[RINT3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[RINT4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[RINT5]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[RINT6]], i32 0 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[RINT7]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @rint_8f64( @@ -715,8 +743,9 @@ ; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 ; SSE2-NEXT: [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[LD0]]) ; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[LD1]]) -; SSE2-NEXT: store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[TRUNC0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[TRUNC1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x 
double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @trunc_2f64( @@ -750,10 +779,12 @@ ; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[LD1]]) ; SSE2-NEXT: [[TRUNC2:%.*]] = call double @llvm.trunc.f64(double [[LD2]]) ; SSE2-NEXT: [[TRUNC3:%.*]] = call double @llvm.trunc.f64(double [[LD3]]) -; SSE2-NEXT: store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[TRUNC2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[TRUNC3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[TRUNC0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[TRUNC1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TRUNC2]], i32 0 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[TRUNC3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @trunc_4f64( @@ -804,14 +835,18 @@ ; SSE2-NEXT: [[TRUNC5:%.*]] = call double @llvm.trunc.f64(double [[LD5]]) ; SSE2-NEXT: [[TRUNC6:%.*]] = call double @llvm.trunc.f64(double [[LD6]]) ; SSE2-NEXT: [[TRUNC7:%.*]] = call double @llvm.trunc.f64(double [[LD7]]) -; SSE2-NEXT: store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[TRUNC2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[TRUNC3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: store double [[TRUNC4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 -; SSE2-NEXT: store double [[TRUNC5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE2-NEXT: store double [[TRUNC6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 -; SSE2-NEXT: store double [[TRUNC7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[TRUNC0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[TRUNC1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TRUNC2]], i32 0 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[TRUNC3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[TRUNC4]], i32 0 
+; SSE2-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[TRUNC5]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[TRUNC6]], i32 0 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TRUNC7]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @trunc_8f64( @@ -890,10 +925,11 @@ ; SSE2-NEXT: [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[LD1]]) ; SSE2-NEXT: [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[LD2]]) ; SSE2-NEXT: [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[LD3]]) -; SSE2-NEXT: store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[CEIL2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[CEIL0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[CEIL1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[CEIL2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[CEIL3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @ceil_4f32( @@ -941,14 +977,16 @@ ; SSE2-NEXT: [[CEIL5:%.*]] = call float @llvm.ceil.f32(float [[LD5]]) ; SSE2-NEXT: [[CEIL6:%.*]] = call float @llvm.ceil.f32(float [[LD6]]) ; SSE2-NEXT: [[CEIL7:%.*]] = call float @llvm.ceil.f32(float [[LD7]]) -; SSE2-NEXT: store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[CEIL2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[CEIL4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[CEIL5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[CEIL6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[CEIL7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[CEIL0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[CEIL1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[CEIL2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[CEIL3]], i32 3 +; SSE2-NEXT: store <4 x float> 
[[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <4 x float> undef, float [[CEIL4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[CEIL5]], i32 1 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[CEIL6]], i32 2 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[CEIL7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @ceil_8f32( @@ -1027,22 +1065,26 @@ ; SSE2-NEXT: [[CEIL13:%.*]] = call float @llvm.ceil.f32(float [[LD13]]) ; SSE2-NEXT: [[CEIL14:%.*]] = call float @llvm.ceil.f32(float [[LD14]]) ; SSE2-NEXT: [[CEIL15:%.*]] = call float @llvm.ceil.f32(float [[LD15]]) -; SSE2-NEXT: store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[CEIL2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[CEIL4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[CEIL5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[CEIL6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[CEIL7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: store float [[CEIL8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; SSE2-NEXT: store float [[CEIL9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE2-NEXT: store float [[CEIL10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; SSE2-NEXT: store float [[CEIL11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE2-NEXT: store float [[CEIL12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; SSE2-NEXT: store float [[CEIL13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE2-NEXT: store float [[CEIL14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; SSE2-NEXT: store float [[CEIL15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[CEIL0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[CEIL1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[CEIL2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[CEIL3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <4 x float> undef, float [[CEIL4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float 
[[CEIL5]], i32 1 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[CEIL6]], i32 2 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[CEIL7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> undef, float [[CEIL8]], i32 0 +; SSE2-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[CEIL9]], i32 1 +; SSE2-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[CEIL10]], i32 2 +; SSE2-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[CEIL11]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <4 x float> undef, float [[CEIL12]], i32 0 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[CEIL13]], i32 1 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[CEIL14]], i32 2 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[CEIL15]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP16]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @ceil_16f32( @@ -1145,10 +1187,11 @@ ; SSE2-NEXT: [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[LD1]]) ; SSE2-NEXT: [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[LD2]]) ; SSE2-NEXT: [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[LD3]]) -; SSE2-NEXT: store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[FLOOR0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[FLOOR1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[FLOOR2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[FLOOR3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @floor_4f32( @@ -1196,14 +1239,16 @@ ; SSE2-NEXT: [[FLOOR5:%.*]] = call float @llvm.floor.f32(float [[LD5]]) ; SSE2-NEXT: [[FLOOR6:%.*]] = call float @llvm.floor.f32(float [[LD6]]) ; SSE2-NEXT: [[FLOOR7:%.*]] = call float @llvm.floor.f32(float [[LD7]]) -; SSE2-NEXT: store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[FLOOR4]], float* 
getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[FLOOR5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[FLOOR6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[FLOOR7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[FLOOR0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[FLOOR1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[FLOOR2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[FLOOR3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <4 x float> undef, float [[FLOOR4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[FLOOR5]], i32 1 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[FLOOR6]], i32 2 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[FLOOR7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @floor_8f32( @@ -1282,22 +1327,26 @@ ; SSE2-NEXT: [[FLOOR13:%.*]] = call float @llvm.floor.f32(float [[LD13]]) ; SSE2-NEXT: [[FLOOR14:%.*]] = call float @llvm.floor.f32(float [[LD14]]) ; SSE2-NEXT: [[FLOOR15:%.*]] = call float @llvm.floor.f32(float [[LD15]]) -; SSE2-NEXT: store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[FLOOR4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[FLOOR5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[FLOOR6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[FLOOR7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: store float [[FLOOR8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; SSE2-NEXT: store float [[FLOOR9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE2-NEXT: store float [[FLOOR10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; SSE2-NEXT: store float [[FLOOR11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE2-NEXT: store float [[FLOOR12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; SSE2-NEXT: store float [[FLOOR13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 
-; SSE2-NEXT: store float [[FLOOR14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; SSE2-NEXT: store float [[FLOOR15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[FLOOR0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[FLOOR1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[FLOOR2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[FLOOR3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <4 x float> undef, float [[FLOOR4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[FLOOR5]], i32 1 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[FLOOR6]], i32 2 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[FLOOR7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> undef, float [[FLOOR8]], i32 0 +; SSE2-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[FLOOR9]], i32 1 +; SSE2-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[FLOOR10]], i32 2 +; SSE2-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[FLOOR11]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <4 x float> undef, float [[FLOOR12]], i32 0 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[FLOOR13]], i32 1 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[FLOOR14]], i32 2 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[FLOOR15]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP16]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @floor_16f32( @@ -1400,10 +1449,11 @@ ; SSE2-NEXT: [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[LD1]]) ; SSE2-NEXT: [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[LD2]]) ; SSE2-NEXT: [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[LD3]]) -; SSE2-NEXT: store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[NEARBYINT0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[NEARBYINT1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[NEARBYINT2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[NEARBYINT3]], i32 3 +; SSE2-NEXT: store <4 
x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @nearbyint_4f32( @@ -1451,14 +1501,16 @@ ; SSE2-NEXT: [[NEARBYINT5:%.*]] = call float @llvm.nearbyint.f32(float [[LD5]]) ; SSE2-NEXT: [[NEARBYINT6:%.*]] = call float @llvm.nearbyint.f32(float [[LD6]]) ; SSE2-NEXT: [[NEARBYINT7:%.*]] = call float @llvm.nearbyint.f32(float [[LD7]]) -; SSE2-NEXT: store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[NEARBYINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[NEARBYINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[NEARBYINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[NEARBYINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[NEARBYINT0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[NEARBYINT1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[NEARBYINT2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[NEARBYINT3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <4 x float> undef, float [[NEARBYINT4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[NEARBYINT5]], i32 1 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[NEARBYINT6]], i32 2 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[NEARBYINT7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @nearbyint_8f32( @@ -1537,22 +1589,26 @@ ; SSE2-NEXT: [[NEARBYINT13:%.*]] = call float @llvm.nearbyint.f32(float [[LD13]]) ; SSE2-NEXT: [[NEARBYINT14:%.*]] = call float @llvm.nearbyint.f32(float [[LD14]]) ; SSE2-NEXT: [[NEARBYINT15:%.*]] = call float @llvm.nearbyint.f32(float [[LD15]]) -; SSE2-NEXT: store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[NEARBYINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[NEARBYINT5]], float* getelementptr inbounds ([16 x 
float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[NEARBYINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[NEARBYINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: store float [[NEARBYINT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; SSE2-NEXT: store float [[NEARBYINT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE2-NEXT: store float [[NEARBYINT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; SSE2-NEXT: store float [[NEARBYINT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE2-NEXT: store float [[NEARBYINT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; SSE2-NEXT: store float [[NEARBYINT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE2-NEXT: store float [[NEARBYINT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; SSE2-NEXT: store float [[NEARBYINT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[NEARBYINT0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[NEARBYINT1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[NEARBYINT2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[NEARBYINT3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <4 x float> undef, float [[NEARBYINT4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[NEARBYINT5]], i32 1 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[NEARBYINT6]], i32 2 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[NEARBYINT7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> undef, float [[NEARBYINT8]], i32 0 +; SSE2-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[NEARBYINT9]], i32 1 +; SSE2-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[NEARBYINT10]], i32 2 +; SSE2-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[NEARBYINT11]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <4 x float> undef, float [[NEARBYINT12]], i32 0 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[NEARBYINT13]], i32 1 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[NEARBYINT14]], i32 2 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[NEARBYINT15]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP16]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: 
@nearbyint_16f32( @@ -1655,10 +1711,11 @@ ; SSE2-NEXT: [[RINT1:%.*]] = call float @llvm.rint.f32(float [[LD1]]) ; SSE2-NEXT: [[RINT2:%.*]] = call float @llvm.rint.f32(float [[LD2]]) ; SSE2-NEXT: [[RINT3:%.*]] = call float @llvm.rint.f32(float [[LD3]]) -; SSE2-NEXT: store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[RINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[RINT0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[RINT1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[RINT2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[RINT3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @rint_4f32( @@ -1706,14 +1763,16 @@ ; SSE2-NEXT: [[RINT5:%.*]] = call float @llvm.rint.f32(float [[LD5]]) ; SSE2-NEXT: [[RINT6:%.*]] = call float @llvm.rint.f32(float [[LD6]]) ; SSE2-NEXT: [[RINT7:%.*]] = call float @llvm.rint.f32(float [[LD7]]) -; SSE2-NEXT: store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[RINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[RINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[RINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[RINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[RINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[RINT0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[RINT1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[RINT2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[RINT3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <4 x float> undef, float [[RINT4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[RINT5]], i32 1 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[RINT6]], i32 2 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[RINT7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @rint_8f32( @@ 
-1792,22 +1851,26 @@ ; SSE2-NEXT: [[RINT13:%.*]] = call float @llvm.rint.f32(float [[LD13]]) ; SSE2-NEXT: [[RINT14:%.*]] = call float @llvm.rint.f32(float [[LD14]]) ; SSE2-NEXT: [[RINT15:%.*]] = call float @llvm.rint.f32(float [[LD15]]) -; SSE2-NEXT: store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[RINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[RINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[RINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[RINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[RINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: store float [[RINT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; SSE2-NEXT: store float [[RINT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE2-NEXT: store float [[RINT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; SSE2-NEXT: store float [[RINT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE2-NEXT: store float [[RINT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; SSE2-NEXT: store float [[RINT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE2-NEXT: store float [[RINT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; SSE2-NEXT: store float [[RINT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[RINT0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[RINT1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[RINT2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[RINT3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <4 x float> undef, float [[RINT4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[RINT5]], i32 1 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[RINT6]], i32 2 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[RINT7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> undef, float [[RINT8]], i32 0 +; SSE2-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[RINT9]], i32 1 +; SSE2-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[RINT10]], i32 2 +; SSE2-NEXT: [[TMP12:%.*]] 
= insertelement <4 x float> [[TMP11]], float [[RINT11]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <4 x float> undef, float [[RINT12]], i32 0 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[RINT13]], i32 1 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[RINT14]], i32 2 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[RINT15]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP16]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @rint_16f32( @@ -1910,10 +1973,11 @@ ; SSE2-NEXT: [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[LD1]]) ; SSE2-NEXT: [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[LD2]]) ; SSE2-NEXT: [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[LD3]]) -; SSE2-NEXT: store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[TRUNC0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[TRUNC1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[TRUNC2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[TRUNC3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @trunc_4f32( @@ -1961,14 +2025,16 @@ ; SSE2-NEXT: [[TRUNC5:%.*]] = call float @llvm.trunc.f32(float [[LD5]]) ; SSE2-NEXT: [[TRUNC6:%.*]] = call float @llvm.trunc.f32(float [[LD6]]) ; SSE2-NEXT: [[TRUNC7:%.*]] = call float @llvm.trunc.f32(float [[LD7]]) -; SSE2-NEXT: store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[TRUNC4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[TRUNC5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[TRUNC6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[TRUNC7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[TRUNC0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[TRUNC1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] 
= insertelement <4 x float> [[TMP2]], float [[TRUNC2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[TRUNC3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <4 x float> undef, float [[TRUNC4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[TRUNC5]], i32 1 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TRUNC6]], i32 2 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[TRUNC7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @trunc_8f32( @@ -2047,22 +2113,26 @@ ; SSE2-NEXT: [[TRUNC13:%.*]] = call float @llvm.trunc.f32(float [[LD13]]) ; SSE2-NEXT: [[TRUNC14:%.*]] = call float @llvm.trunc.f32(float [[LD14]]) ; SSE2-NEXT: [[TRUNC15:%.*]] = call float @llvm.trunc.f32(float [[LD15]]) -; SSE2-NEXT: store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[TRUNC4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[TRUNC5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[TRUNC6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[TRUNC7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: store float [[TRUNC8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; SSE2-NEXT: store float [[TRUNC9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE2-NEXT: store float [[TRUNC10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; SSE2-NEXT: store float [[TRUNC11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE2-NEXT: store float [[TRUNC12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; SSE2-NEXT: store float [[TRUNC13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE2-NEXT: store float [[TRUNC14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; SSE2-NEXT: store float [[TRUNC15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[TRUNC0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[TRUNC1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[TRUNC2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[TRUNC3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP4]], <4 x float>* 
bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <4 x float> undef, float [[TRUNC4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[TRUNC5]], i32 1 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TRUNC6]], i32 2 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[TRUNC7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> undef, float [[TRUNC8]], i32 0 +; SSE2-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TRUNC9]], i32 1 +; SSE2-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TRUNC10]], i32 2 +; SSE2-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[TRUNC11]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <4 x float> undef, float [[TRUNC12]], i32 0 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[TRUNC13]], i32 1 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TRUNC14]], i32 2 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TRUNC15]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP16]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @trunc_16f32( Index: test/Transforms/SLPVectorizer/X86/powof2div.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/powof2div.ll +++ test/Transforms/SLPVectorizer/X86/powof2div.ll @@ -60,35 +60,39 @@ define void @powof2div_nonuniform(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c){ ; AVX1-LABEL: @powof2div_nonuniform( ; AVX1-NEXT: entry: -; AVX1-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4 -; AVX1-NEXT: [[TMP1:%.*]] = load i32, i32* [[C:%.*]], align 4 -; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] -; AVX1-NEXT: [[DIV:%.*]] = sdiv i32 [[ADD]], 2 -; AVX1-NEXT: store i32 [[DIV]], i32* [[A:%.*]], align 4 -; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1 -; AVX1-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 -; AVX1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 1 -; AVX1-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4 -; AVX1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP3]], [[TMP2]] -; AVX1-NEXT: [[DIV6:%.*]] = sdiv i32 [[ADD5]], 4 -; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1 -; AVX1-NEXT: store i32 [[DIV6]], i32* [[ARRAYIDX7]], align 4 +; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 +; AVX1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 1 +; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1 ; AVX1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 -; AVX1-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX8]], align 4 ; AVX1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2 -; AVX1-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4 -; AVX1-NEXT: [[ADD10:%.*]] = add nsw 
i32 [[TMP5]], [[TMP4]] -; AVX1-NEXT: [[DIV11:%.*]] = sdiv i32 [[ADD10]], 8 ; AVX1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 -; AVX1-NEXT: store i32 [[DIV11]], i32* [[ARRAYIDX12]], align 4 ; AVX1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 -; AVX1-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4 +; AVX1-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>* +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 ; AVX1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 -; AVX1-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX14]], align 4 -; AVX1-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP7]], [[TMP6]] -; AVX1-NEXT: [[DIV16:%.*]] = sdiv i32 [[ADD15]], 16 +; AVX1-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>* +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 +; AVX1-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 +; AVX1-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP6]], i32 1 +; AVX1-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 +; AVX1-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP8]], i32 2 +; AVX1-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 +; AVX1-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP10]], i32 3 +; AVX1-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; AVX1-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> undef, i32 [[TMP12]], i32 0 +; AVX1-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; AVX1-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP14]], i32 1 +; AVX1-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; AVX1-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP16]], i32 2 +; AVX1-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; AVX1-NEXT: [[TMP19:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP18]], i32 3 +; AVX1-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[TMP11]], [[TMP19]] +; AVX1-NEXT: [[TMP21:%.*]] = sdiv <4 x i32> [[TMP20]], ; AVX1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 -; AVX1-NEXT: store i32 [[DIV16]], i32* [[ARRAYIDX17]], align 4 +; AVX1-NEXT: [[TMP22:%.*]] = bitcast i32* [[A]] to <4 x i32>* +; AVX1-NEXT: store <4 x i32> [[TMP21]], <4 x i32>* [[TMP22]], align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @powof2div_nonuniform( Index: test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll +++ test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll @@ -15,32 +15,59 @@ ; Vector cost is 6, Scalar cost is 7 ; SSE: Adding cost -1 for reduction that starts with %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction) define i32 @test_add(i32* nocapture readonly %p) { -; CHECK-LABEL: @test_add( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 -; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 -; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 -; 
CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 -; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[MUL_18:%.*]] = add i32 undef, undef -; CHECK-NEXT: [[MUL_29:%.*]] = add i32 undef, [[MUL_18]] -; CHECK-NEXT: [[MUL_310:%.*]] = add i32 undef, [[MUL_29]] -; CHECK-NEXT: [[MUL_411:%.*]] = add i32 undef, [[MUL_310]] -; CHECK-NEXT: [[MUL_512:%.*]] = add i32 undef, [[MUL_411]] -; CHECK-NEXT: [[MUL_613:%.*]] = add i32 undef, [[MUL_512]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP1]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[MUL_714:%.*]] = add i32 undef, [[MUL_613]] -; CHECK-NEXT: ret i32 [[TMP2]] +; AVX-LABEL: @test_add( +; AVX-NEXT: entry: +; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; AVX-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; AVX-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; AVX-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; AVX-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; AVX-NEXT: [[MUL_18:%.*]] = add i32 undef, undef +; AVX-NEXT: [[MUL_29:%.*]] = add i32 undef, [[MUL_18]] +; AVX-NEXT: [[MUL_310:%.*]] = add i32 undef, [[MUL_29]] +; AVX-NEXT: [[MUL_411:%.*]] = add i32 undef, [[MUL_310]] +; AVX-NEXT: [[MUL_512:%.*]] = add i32 undef, [[MUL_411]] +; AVX-NEXT: [[MUL_613:%.*]] = add i32 undef, [[MUL_512]] +; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP1]], [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; AVX-NEXT: [[MUL_714:%.*]] = add i32 undef, [[MUL_613]] +; AVX-NEXT: ret i32 [[TMP2]] +; +; SSE-LABEL: @test_add( +; SSE-NEXT: entry: +; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; SSE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; SSE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; SSE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; SSE-NEXT: 
[[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; SSE-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; SSE-NEXT: [[MUL_18:%.*]] = add i32 undef, undef +; SSE-NEXT: [[MUL_29:%.*]] = add i32 undef, [[MUL_18]] +; SSE-NEXT: [[MUL_310:%.*]] = add i32 undef, [[MUL_29]] +; SSE-NEXT: [[MUL_411:%.*]] = add i32 undef, [[MUL_310]] +; SSE-NEXT: [[MUL_512:%.*]] = add i32 undef, [[MUL_411]] +; SSE-NEXT: [[MUL_613:%.*]] = add i32 undef, [[MUL_512]] +; SSE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP1]], [[RDX_SHUF]] +; SSE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; SSE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; SSE-NEXT: [[MUL_714:%.*]] = add i32 undef, [[MUL_613]] +; SSE-NEXT: ret i32 [[TMP2]] ; entry: %0 = load i32, i32* %p, align 4 @@ -136,32 +163,59 @@ ; } define i32 @test_and(i32* nocapture readonly %p) { -; CHECK-LABEL: @test_and( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 -; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 -; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 -; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 -; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[MUL_18:%.*]] = and i32 undef, undef -; CHECK-NEXT: [[MUL_29:%.*]] = and i32 undef, [[MUL_18]] -; CHECK-NEXT: [[MUL_310:%.*]] = and i32 undef, [[MUL_29]] -; CHECK-NEXT: [[MUL_411:%.*]] = and i32 undef, [[MUL_310]] -; CHECK-NEXT: [[MUL_512:%.*]] = and i32 undef, [[MUL_411]] -; CHECK-NEXT: [[MUL_613:%.*]] = and i32 undef, [[MUL_512]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = and <8 x i32> [[TMP1]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[MUL_714:%.*]] = and i32 undef, [[MUL_613]] -; CHECK-NEXT: ret i32 [[TMP2]] +; AVX-LABEL: @test_and( +; AVX-NEXT: entry: +; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; AVX-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; AVX-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; AVX-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* 
[[P]], i64 4 +; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; AVX-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; AVX-NEXT: [[MUL_18:%.*]] = and i32 undef, undef +; AVX-NEXT: [[MUL_29:%.*]] = and i32 undef, [[MUL_18]] +; AVX-NEXT: [[MUL_310:%.*]] = and i32 undef, [[MUL_29]] +; AVX-NEXT: [[MUL_411:%.*]] = and i32 undef, [[MUL_310]] +; AVX-NEXT: [[MUL_512:%.*]] = and i32 undef, [[MUL_411]] +; AVX-NEXT: [[MUL_613:%.*]] = and i32 undef, [[MUL_512]] +; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX:%.*]] = and <8 x i32> [[TMP1]], [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; AVX-NEXT: [[MUL_714:%.*]] = and i32 undef, [[MUL_613]] +; AVX-NEXT: ret i32 [[TMP2]] +; +; SSE-LABEL: @test_and( +; SSE-NEXT: entry: +; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; SSE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; SSE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; SSE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; SSE-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; SSE-NEXT: [[MUL_18:%.*]] = and i32 undef, undef +; SSE-NEXT: [[MUL_29:%.*]] = and i32 undef, [[MUL_18]] +; SSE-NEXT: [[MUL_310:%.*]] = and i32 undef, [[MUL_29]] +; SSE-NEXT: [[MUL_411:%.*]] = and i32 undef, [[MUL_310]] +; SSE-NEXT: [[MUL_512:%.*]] = and i32 undef, [[MUL_411]] +; SSE-NEXT: [[MUL_613:%.*]] = and i32 undef, [[MUL_512]] +; SSE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX:%.*]] = and <8 x i32> [[TMP1]], [[RDX_SHUF]] +; SSE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; SSE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; SSE-NEXT: [[MUL_714:%.*]] = and i32 undef, [[MUL_613]] +; SSE-NEXT: ret i32 [[TMP2]] ; entry: %0 = load i32, i32* %p, align 4 @@ -197,32 +251,59 @@ ; } define i32 @test_or(i32* nocapture readonly %p) { -; CHECK-LABEL: @test_or( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 -; 
CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 -; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 -; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 -; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[MUL_18:%.*]] = or i32 undef, undef -; CHECK-NEXT: [[MUL_29:%.*]] = or i32 undef, [[MUL_18]] -; CHECK-NEXT: [[MUL_310:%.*]] = or i32 undef, [[MUL_29]] -; CHECK-NEXT: [[MUL_411:%.*]] = or i32 undef, [[MUL_310]] -; CHECK-NEXT: [[MUL_512:%.*]] = or i32 undef, [[MUL_411]] -; CHECK-NEXT: [[MUL_613:%.*]] = or i32 undef, [[MUL_512]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = or <8 x i32> [[TMP1]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = or <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[MUL_714:%.*]] = or i32 undef, [[MUL_613]] -; CHECK-NEXT: ret i32 [[TMP2]] +; AVX-LABEL: @test_or( +; AVX-NEXT: entry: +; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; AVX-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; AVX-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; AVX-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; AVX-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; AVX-NEXT: [[MUL_18:%.*]] = or i32 undef, undef +; AVX-NEXT: [[MUL_29:%.*]] = or i32 undef, [[MUL_18]] +; AVX-NEXT: [[MUL_310:%.*]] = or i32 undef, [[MUL_29]] +; AVX-NEXT: [[MUL_411:%.*]] = or i32 undef, [[MUL_310]] +; AVX-NEXT: [[MUL_512:%.*]] = or i32 undef, [[MUL_411]] +; AVX-NEXT: [[MUL_613:%.*]] = or i32 undef, [[MUL_512]] +; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX:%.*]] = or <8 x i32> [[TMP1]], [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX2:%.*]] = or <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX4:%.*]] = or <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; AVX-NEXT: [[MUL_714:%.*]] = or i32 undef, [[MUL_613]] +; AVX-NEXT: ret i32 [[TMP2]] +; +; SSE-LABEL: @test_or( +; SSE-NEXT: entry: +; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; SSE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; SSE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; SSE-NEXT: [[ARRAYIDX_4:%.*]] = 
getelementptr inbounds i32, i32* [[P]], i64 4 +; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; SSE-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; SSE-NEXT: [[MUL_18:%.*]] = or i32 undef, undef +; SSE-NEXT: [[MUL_29:%.*]] = or i32 undef, [[MUL_18]] +; SSE-NEXT: [[MUL_310:%.*]] = or i32 undef, [[MUL_29]] +; SSE-NEXT: [[MUL_411:%.*]] = or i32 undef, [[MUL_310]] +; SSE-NEXT: [[MUL_512:%.*]] = or i32 undef, [[MUL_411]] +; SSE-NEXT: [[MUL_613:%.*]] = or i32 undef, [[MUL_512]] +; SSE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX:%.*]] = or <8 x i32> [[TMP1]], [[RDX_SHUF]] +; SSE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX2:%.*]] = or <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; SSE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX4:%.*]] = or <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; SSE-NEXT: [[MUL_714:%.*]] = or i32 undef, [[MUL_613]] +; SSE-NEXT: ret i32 [[TMP2]] ; entry: %0 = load i32, i32* %p, align 4 @@ -258,32 +339,59 @@ ; } define i32 @test_xor(i32* nocapture readonly %p) { -; CHECK-LABEL: @test_xor( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 -; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 -; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 -; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 -; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[MUL_18:%.*]] = xor i32 undef, undef -; CHECK-NEXT: [[MUL_29:%.*]] = xor i32 undef, [[MUL_18]] -; CHECK-NEXT: [[MUL_310:%.*]] = xor i32 undef, [[MUL_29]] -; CHECK-NEXT: [[MUL_411:%.*]] = xor i32 undef, [[MUL_310]] -; CHECK-NEXT: [[MUL_512:%.*]] = xor i32 undef, [[MUL_411]] -; CHECK-NEXT: [[MUL_613:%.*]] = xor i32 undef, [[MUL_512]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = xor <8 x i32> [[TMP1]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = xor <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = xor <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[MUL_714:%.*]] = xor i32 undef, [[MUL_613]] -; CHECK-NEXT: ret i32 [[TMP2]] +; AVX-LABEL: @test_xor( +; AVX-NEXT: entry: +; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; AVX-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; AVX-NEXT: 
[[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; AVX-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; AVX-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; AVX-NEXT: [[MUL_18:%.*]] = xor i32 undef, undef +; AVX-NEXT: [[MUL_29:%.*]] = xor i32 undef, [[MUL_18]] +; AVX-NEXT: [[MUL_310:%.*]] = xor i32 undef, [[MUL_29]] +; AVX-NEXT: [[MUL_411:%.*]] = xor i32 undef, [[MUL_310]] +; AVX-NEXT: [[MUL_512:%.*]] = xor i32 undef, [[MUL_411]] +; AVX-NEXT: [[MUL_613:%.*]] = xor i32 undef, [[MUL_512]] +; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX:%.*]] = xor <8 x i32> [[TMP1]], [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX2:%.*]] = xor <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX4:%.*]] = xor <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; AVX-NEXT: [[MUL_714:%.*]] = xor i32 undef, [[MUL_613]] +; AVX-NEXT: ret i32 [[TMP2]] +; +; SSE-LABEL: @test_xor( +; SSE-NEXT: entry: +; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; SSE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; SSE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; SSE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; SSE-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; SSE-NEXT: [[MUL_18:%.*]] = xor i32 undef, undef +; SSE-NEXT: [[MUL_29:%.*]] = xor i32 undef, [[MUL_18]] +; SSE-NEXT: [[MUL_310:%.*]] = xor i32 undef, [[MUL_29]] +; SSE-NEXT: [[MUL_411:%.*]] = xor i32 undef, [[MUL_310]] +; SSE-NEXT: [[MUL_512:%.*]] = xor i32 undef, [[MUL_411]] +; SSE-NEXT: [[MUL_613:%.*]] = xor i32 undef, [[MUL_512]] +; SSE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX:%.*]] = xor <8 x i32> [[TMP1]], [[RDX_SHUF]] +; SSE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX2:%.*]] = xor <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; SSE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX4:%.*]] = xor <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; SSE-NEXT: [[MUL_714:%.*]] = xor i32 undef, [[MUL_613]] +; SSE-NEXT: ret i32 [[TMP2]] ; entry: %0 = load i32, i32* %p, align 4 @@ -312,25 +420,45 @@ } define i32 @PR37731(<4 x i32>* noalias nocapture dereferenceable(16) %self) unnamed_addr #0 { -; CHECK-LABEL: @PR37731( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[SELF:%.*]], align 
16 -; CHECK-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], -; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i32> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = and <4 x i32> [[TMP0]], -; CHECK-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP4]], -; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP3]], [[TMP5]] -; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[SELF]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = xor i32 undef, undef -; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP7]], undef -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = xor <4 x i32> [[TMP6]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = xor <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = xor i32 [[TMP8]], undef -; CHECK-NEXT: ret i32 [[TMP9]] +; AVX-LABEL: @PR37731( +; AVX-NEXT: entry: +; AVX-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[SELF:%.*]], align 16 +; AVX-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], +; AVX-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], [[TMP0]] +; AVX-NEXT: [[TMP3:%.*]] = lshr <4 x i32> [[TMP2]], +; AVX-NEXT: [[TMP4:%.*]] = and <4 x i32> [[TMP0]], +; AVX-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP4]], +; AVX-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP3]], [[TMP5]] +; AVX-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[SELF]], align 16 +; AVX-NEXT: [[TMP7:%.*]] = xor i32 undef, undef +; AVX-NEXT: [[TMP8:%.*]] = xor i32 [[TMP7]], undef +; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> +; AVX-NEXT: [[BIN_RDX:%.*]] = xor <4 x i32> [[TMP6]], [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> +; AVX-NEXT: [[BIN_RDX2:%.*]] = xor <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 +; AVX-NEXT: [[TMP10:%.*]] = xor i32 [[TMP8]], undef +; AVX-NEXT: ret i32 [[TMP9]] +; +; SSE-LABEL: @PR37731( +; SSE-NEXT: entry: +; SSE-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[SELF:%.*]], align 16 +; SSE-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], +; SSE-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], [[TMP0]] +; SSE-NEXT: [[TMP3:%.*]] = lshr <4 x i32> [[TMP2]], +; SSE-NEXT: [[TMP4:%.*]] = and <4 x i32> [[TMP0]], +; SSE-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP4]], +; SSE-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP3]], [[TMP5]] +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[SELF]], align 16 +; SSE-NEXT: [[TMP7:%.*]] = xor i32 undef, undef +; SSE-NEXT: [[TMP8:%.*]] = xor i32 [[TMP7]], undef +; SSE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> +; SSE-NEXT: [[BIN_RDX:%.*]] = xor <4 x i32> [[TMP6]], [[RDX_SHUF]] +; SSE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> +; SSE-NEXT: [[BIN_RDX2:%.*]] = xor <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = xor i32 [[TMP8]], undef +; SSE-NEXT: ret i32 [[TMP9]] ; entry: %0 = load <4 x i32>, <4 x i32>* %self, align 16 Index: test/Transforms/SLPVectorizer/X86/shift-ashr.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/shift-ashr.ll +++ test/Transforms/SLPVectorizer/X86/shift-ashr.ll @@ -22,73 +22,97 @@ 
define void @ashr_v8i64() { ; SSE-LABEL: @ashr_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[R0:%.*]] = ashr i64 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = ashr i64 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = ashr i64 [[A2]], [[B2]] -; SSE-NEXT: [[R3:%.*]] = ashr i64 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = ashr i64 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = ashr i64 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = ashr i64 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = ashr i64 [[A7]], [[B7]] -; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds 
([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> undef, i64 [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP11]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> undef, i64 [[TMP13]], i32 0 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; SSE-NEXT: [[TMP17:%.*]] = ashr <2 x i64> [[TMP12]], [[TMP16]] +; SSE-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> undef, i64 [[TMP18]], i32 0 +; SSE-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[TMP20]], i32 1 +; SSE-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <2 x i64> undef, i64 [[TMP22]], i32 0 +; SSE-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <2 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; SSE-NEXT: [[TMP26:%.*]] = ashr <2 x i64> [[TMP21]], [[TMP25]] +; SSE-NEXT: [[TMP27:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <2 x i64> undef, i64 [[TMP27]], i32 0 +; SSE-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP30:%.*]] = insertelement <2 x i64> [[TMP28]], i64 [[TMP29]], i32 1 +; SSE-NEXT: [[TMP31:%.*]] = extractelement <2 x i64> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <2 x i64> undef, i64 [[TMP31]], i32 0 +; SSE-NEXT: [[TMP33:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1 +; SSE-NEXT: [[TMP34:%.*]] = insertelement <2 x i64> [[TMP32]], i64 [[TMP33]], i32 1 +; SSE-NEXT: [[TMP35:%.*]] = ashr <2 x i64> [[TMP30]], [[TMP34]] +; SSE-NEXT: [[TMP36:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP37:%.*]] = insertelement <2 x i64> undef, i64 [[TMP36]], i32 0 +; SSE-NEXT: [[TMP38:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP39:%.*]] = insertelement <2 x i64> [[TMP37]], i64 [[TMP38]], i32 1 +; SSE-NEXT: [[TMP40:%.*]] = extractelement <2 x i64> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP41:%.*]] = insertelement <2 x i64> undef, i64 [[TMP40]], i32 0 +; SSE-NEXT: [[TMP42:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1 +; SSE-NEXT: [[TMP43:%.*]] = insertelement <2 x i64> [[TMP41]], i64 [[TMP42]], i32 1 +; SSE-NEXT: [[TMP44:%.*]] = ashr <2 x i64> [[TMP39]], [[TMP43]] +; SSE-NEXT: store <2 x i64> [[TMP17]], 
<2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP26]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP35]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP44]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; AVX1-LABEL: @ashr_v8i64( -; AVX1-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; AVX1-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; AVX1-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; AVX1-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; AVX1-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; AVX1-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; AVX1-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; AVX1-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; AVX1-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; AVX1-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; AVX1-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; AVX1-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; AVX1-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; AVX1-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; AVX1-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; AVX1-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; AVX1-NEXT: [[R0:%.*]] = ashr i64 [[A0]], [[B0]] -; AVX1-NEXT: [[R1:%.*]] = ashr i64 [[A1]], [[B1]] -; AVX1-NEXT: [[R2:%.*]] = ashr i64 [[A2]], [[B2]] -; AVX1-NEXT: [[R3:%.*]] = ashr i64 [[A3]], [[B3]] -; AVX1-NEXT: [[R4:%.*]] = ashr i64 [[A4]], [[B4]] -; AVX1-NEXT: [[R5:%.*]] = ashr i64 [[A5]], [[B5]] -; AVX1-NEXT: [[R6:%.*]] = ashr i64 [[A6]], [[B6]] -; AVX1-NEXT: [[R7:%.*]] = ashr i64 [[A7]], [[B7]] -; AVX1-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; AVX1-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; AVX1-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; AVX1-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; AVX1-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; AVX1-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; AVX1-NEXT: store i64 
[[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; AVX1-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0 +; AVX1-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> undef, i64 [[TMP5]], i32 0 +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP1]], i32 1 +; AVX1-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[TMP7]], i32 1 +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP1]], i32 2 +; AVX1-NEXT: [[TMP10:%.*]] = insertelement <4 x i64> [[TMP8]], i64 [[TMP9]], i32 2 +; AVX1-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP1]], i32 3 +; AVX1-NEXT: [[TMP12:%.*]] = insertelement <4 x i64> [[TMP10]], i64 [[TMP11]], i32 3 +; AVX1-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> undef, i64 [[TMP13]], i32 0 +; AVX1-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; AVX1-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX1-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP17]], i32 2 +; AVX1-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX1-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP19]], i32 3 +; AVX1-NEXT: [[TMP21:%.*]] = ashr <4 x i64> [[TMP12]], [[TMP20]] +; AVX1-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 +; AVX1-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> undef, i64 [[TMP22]], i32 0 +; AVX1-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; AVX1-NEXT: [[TMP25:%.*]] = insertelement <4 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; AVX1-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; AVX1-NEXT: [[TMP27:%.*]] = insertelement <4 x i64> [[TMP25]], i64 [[TMP26]], i32 2 +; AVX1-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; AVX1-NEXT: [[TMP29:%.*]] = insertelement <4 x i64> [[TMP27]], i64 [[TMP28]], i32 3 +; AVX1-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP31:%.*]] = insertelement <4 x i64> undef, i64 [[TMP30]], i32 0 +; AVX1-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 +; AVX1-NEXT: [[TMP33:%.*]] = insertelement <4 x i64> [[TMP31]], i64 [[TMP32]], i32 1 +; AVX1-NEXT: [[TMP34:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 +; AVX1-NEXT: [[TMP35:%.*]] = insertelement <4 x i64> [[TMP33]], i64 [[TMP34]], i32 2 +; AVX1-NEXT: [[TMP36:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 +; AVX1-NEXT: [[TMP37:%.*]] = insertelement <4 x i64> [[TMP35]], i64 [[TMP36]], i32 3 +; AVX1-NEXT: [[TMP38:%.*]] = ashr <4 x i64> [[TMP29]], [[TMP37]] +; AVX1-NEXT: store <4 x i64> [[TMP21]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX1-NEXT: store <4 x i64> [[TMP38]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x 
i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @ashr_v8i64( @@ -157,89 +181,161 @@ define void @ashr_v16i32() { ; SSE-LABEL: @ashr_v16i32( -; SSE-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 -; SSE-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 -; SSE-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 -; SSE-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 -; SSE-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 -; SSE-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 -; SSE-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 -; SSE-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 -; SSE-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 -; SSE-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 -; SSE-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 -; SSE-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 -; SSE-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 -; SSE-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 -; SSE-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 -; SSE-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 -; SSE-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4 -; SSE-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4 -; SSE-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4 -; SSE-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4 -; SSE-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4 -; SSE-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4 -; SSE-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4 -; SSE-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4 -; SSE-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4 -; SSE-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4 -; SSE-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 -; SSE-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 -; SSE-NEXT: [[B12:%.*]] = load i32, i32* 
getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 -; SSE-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 -; SSE-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 -; SSE-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 -; SSE-NEXT: [[R0:%.*]] = ashr i32 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = ashr i32 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = ashr i32 [[A2]], [[B2]] -; SSE-NEXT: [[R3:%.*]] = ashr i32 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = ashr i32 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = ashr i32 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = ashr i32 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = ashr i32 [[A7]], [[B7]] -; SSE-NEXT: [[R8:%.*]] = ashr i32 [[A8]], [[B8]] -; SSE-NEXT: [[R9:%.*]] = ashr i32 [[A9]], [[B9]] -; SSE-NEXT: [[R10:%.*]] = ashr i32 [[A10]], [[B10]] -; SSE-NEXT: [[R11:%.*]] = ashr i32 [[A11]], [[B11]] -; SSE-NEXT: [[R12:%.*]] = ashr i32 [[A12]], [[B12]] -; SSE-NEXT: [[R13:%.*]] = ashr i32 [[A13]], [[B13]] -; SSE-NEXT: [[R14:%.*]] = ashr i32 [[A14]], [[B14]] -; SSE-NEXT: [[R15:%.*]] = ashr i32 [[A15]], [[B15]] -; SSE-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 -; SSE-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 -; SSE-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 -; SSE-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 -; SSE-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 -; SSE-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 -; SSE-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 -; SSE-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 -; SSE-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 -; SSE-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 -; SSE-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 -; SSE-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 -; SSE-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 -; SSE-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 -; SSE-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 -; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x 
i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP11]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP13]], i32 2 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP15]], i32 3 +; SSE-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> undef, i32 [[TMP17]], i32 0 +; SSE-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP19]], i32 1 +; SSE-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP21]], i32 2 +; SSE-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP23]], i32 3 +; SSE-NEXT: [[TMP25:%.*]] = ashr <4 x i32> [[TMP16]], [[TMP24]] +; SSE-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> undef, i32 [[TMP26]], i32 0 +; SSE-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP29:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP28]], i32 1 +; SSE-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; SSE-NEXT: [[TMP31:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[TMP30]], i32 2 +; SSE-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; SSE-NEXT: [[TMP33:%.*]] = insertelement <4 x i32> [[TMP31]], i32 [[TMP32]], i32 3 +; SSE-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP35:%.*]] = insertelement <4 x i32> undef, i32 [[TMP34]], i32 0 +; SSE-NEXT: [[TMP36:%.*]] = extractelement <4 x i32> [[TMP6]], i32 1 +; SSE-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP35]], i32 [[TMP36]], i32 1 +; SSE-NEXT: [[TMP38:%.*]] = extractelement <4 x i32> [[TMP6]], i32 2 +; SSE-NEXT: [[TMP39:%.*]] = insertelement <4 x i32> [[TMP37]], i32 [[TMP38]], i32 2 +; SSE-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[TMP6]], i32 3 +; SSE-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> [[TMP39]], i32 [[TMP40]], i32 3 +; SSE-NEXT: [[TMP42:%.*]] = ashr <4 x i32> [[TMP33]], [[TMP41]] +; SSE-NEXT: [[TMP43:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP44:%.*]] = insertelement <4 x i32> undef, i32 [[TMP43]], i32 0 +; SSE-NEXT: [[TMP45:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP46:%.*]] = 
insertelement <4 x i32> [[TMP44]], i32 [[TMP45]], i32 1 +; SSE-NEXT: [[TMP47:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 +; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x i32> [[TMP46]], i32 [[TMP47]], i32 2 +; SSE-NEXT: [[TMP49:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 +; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x i32> [[TMP48]], i32 [[TMP49]], i32 3 +; SSE-NEXT: [[TMP51:%.*]] = extractelement <4 x i32> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP52:%.*]] = insertelement <4 x i32> undef, i32 [[TMP51]], i32 0 +; SSE-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP7]], i32 1 +; SSE-NEXT: [[TMP54:%.*]] = insertelement <4 x i32> [[TMP52]], i32 [[TMP53]], i32 1 +; SSE-NEXT: [[TMP55:%.*]] = extractelement <4 x i32> [[TMP7]], i32 2 +; SSE-NEXT: [[TMP56:%.*]] = insertelement <4 x i32> [[TMP54]], i32 [[TMP55]], i32 2 +; SSE-NEXT: [[TMP57:%.*]] = extractelement <4 x i32> [[TMP7]], i32 3 +; SSE-NEXT: [[TMP58:%.*]] = insertelement <4 x i32> [[TMP56]], i32 [[TMP57]], i32 3 +; SSE-NEXT: [[TMP59:%.*]] = ashr <4 x i32> [[TMP50]], [[TMP58]] +; SSE-NEXT: [[TMP60:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP61:%.*]] = insertelement <4 x i32> undef, i32 [[TMP60]], i32 0 +; SSE-NEXT: [[TMP62:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP63:%.*]] = insertelement <4 x i32> [[TMP61]], i32 [[TMP62]], i32 1 +; SSE-NEXT: [[TMP64:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 +; SSE-NEXT: [[TMP65:%.*]] = insertelement <4 x i32> [[TMP63]], i32 [[TMP64]], i32 2 +; SSE-NEXT: [[TMP66:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; SSE-NEXT: [[TMP67:%.*]] = insertelement <4 x i32> [[TMP65]], i32 [[TMP66]], i32 3 +; SSE-NEXT: [[TMP68:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP69:%.*]] = insertelement <4 x i32> undef, i32 [[TMP68]], i32 0 +; SSE-NEXT: [[TMP70:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1 +; SSE-NEXT: [[TMP71:%.*]] = insertelement <4 x i32> [[TMP69]], i32 [[TMP70]], i32 1 +; SSE-NEXT: [[TMP72:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2 +; SSE-NEXT: [[TMP73:%.*]] = insertelement <4 x i32> [[TMP71]], i32 [[TMP72]], i32 2 +; SSE-NEXT: [[TMP74:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 +; SSE-NEXT: [[TMP75:%.*]] = insertelement <4 x i32> [[TMP73]], i32 [[TMP74]], i32 3 +; SSE-NEXT: [[TMP76:%.*]] = ashr <4 x i32> [[TMP67]], [[TMP75]] +; SSE-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP42]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP59]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP76]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; AVX1-LABEL: @ashr_v16i32( -; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), 
align 4 -; AVX1-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP9:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP5]] -; AVX1-NEXT: [[TMP10:%.*]] = ashr <4 x i32> [[TMP2]], [[TMP6]] -; AVX1-NEXT: [[TMP11:%.*]] = ashr <4 x i32> [[TMP3]], [[TMP7]] -; AVX1-NEXT: [[TMP12:%.*]] = ashr <4 x i32> [[TMP4]], [[TMP8]] -; AVX1-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; AVX1-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; AVX1-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 -; AVX1-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 +; AVX1-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX1-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX1-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; AVX1-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> undef, i32 [[TMP5]], i32 0 +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; AVX1-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP7]], i32 1 +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 +; AVX1-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[TMP9]], i32 2 +; AVX1-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 +; AVX1-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[TMP11]], i32 3 +; AVX1-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 +; AVX1-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[TMP13]], i32 4 +; AVX1-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 +; AVX1-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[TMP15]], i32 5 +; AVX1-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6 +; AVX1-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP17]], i32 6 +; AVX1-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7 +; AVX1-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP19]], i32 7 +; AVX1-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 +; AVX1-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> undef, i32 [[TMP21]], i32 0 +; AVX1-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 +; AVX1-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP23]], i32 1 +; AVX1-NEXT: [[TMP25:%.*]] = 
extractelement <8 x i32> [[TMP3]], i32 2 +; AVX1-NEXT: [[TMP26:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP25]], i32 2 +; AVX1-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 +; AVX1-NEXT: [[TMP28:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP27]], i32 3 +; AVX1-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 +; AVX1-NEXT: [[TMP30:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP29]], i32 4 +; AVX1-NEXT: [[TMP31:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 +; AVX1-NEXT: [[TMP32:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP31]], i32 5 +; AVX1-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 +; AVX1-NEXT: [[TMP34:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP33]], i32 6 +; AVX1-NEXT: [[TMP35:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 +; AVX1-NEXT: [[TMP36:%.*]] = insertelement <8 x i32> [[TMP34]], i32 [[TMP35]], i32 7 +; AVX1-NEXT: [[TMP37:%.*]] = ashr <8 x i32> [[TMP20]], [[TMP36]] +; AVX1-NEXT: [[TMP38:%.*]] = extractelement <8 x i32> [[TMP2]], i32 0 +; AVX1-NEXT: [[TMP39:%.*]] = insertelement <8 x i32> undef, i32 [[TMP38]], i32 0 +; AVX1-NEXT: [[TMP40:%.*]] = extractelement <8 x i32> [[TMP2]], i32 1 +; AVX1-NEXT: [[TMP41:%.*]] = insertelement <8 x i32> [[TMP39]], i32 [[TMP40]], i32 1 +; AVX1-NEXT: [[TMP42:%.*]] = extractelement <8 x i32> [[TMP2]], i32 2 +; AVX1-NEXT: [[TMP43:%.*]] = insertelement <8 x i32> [[TMP41]], i32 [[TMP42]], i32 2 +; AVX1-NEXT: [[TMP44:%.*]] = extractelement <8 x i32> [[TMP2]], i32 3 +; AVX1-NEXT: [[TMP45:%.*]] = insertelement <8 x i32> [[TMP43]], i32 [[TMP44]], i32 3 +; AVX1-NEXT: [[TMP46:%.*]] = extractelement <8 x i32> [[TMP2]], i32 4 +; AVX1-NEXT: [[TMP47:%.*]] = insertelement <8 x i32> [[TMP45]], i32 [[TMP46]], i32 4 +; AVX1-NEXT: [[TMP48:%.*]] = extractelement <8 x i32> [[TMP2]], i32 5 +; AVX1-NEXT: [[TMP49:%.*]] = insertelement <8 x i32> [[TMP47]], i32 [[TMP48]], i32 5 +; AVX1-NEXT: [[TMP50:%.*]] = extractelement <8 x i32> [[TMP2]], i32 6 +; AVX1-NEXT: [[TMP51:%.*]] = insertelement <8 x i32> [[TMP49]], i32 [[TMP50]], i32 6 +; AVX1-NEXT: [[TMP52:%.*]] = extractelement <8 x i32> [[TMP2]], i32 7 +; AVX1-NEXT: [[TMP53:%.*]] = insertelement <8 x i32> [[TMP51]], i32 [[TMP52]], i32 7 +; AVX1-NEXT: [[TMP54:%.*]] = extractelement <8 x i32> [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP55:%.*]] = insertelement <8 x i32> undef, i32 [[TMP54]], i32 0 +; AVX1-NEXT: [[TMP56:%.*]] = extractelement <8 x i32> [[TMP4]], i32 1 +; AVX1-NEXT: [[TMP57:%.*]] = insertelement <8 x i32> [[TMP55]], i32 [[TMP56]], i32 1 +; AVX1-NEXT: [[TMP58:%.*]] = extractelement <8 x i32> [[TMP4]], i32 2 +; AVX1-NEXT: [[TMP59:%.*]] = insertelement <8 x i32> [[TMP57]], i32 [[TMP58]], i32 2 +; AVX1-NEXT: [[TMP60:%.*]] = extractelement <8 x i32> [[TMP4]], i32 3 +; AVX1-NEXT: [[TMP61:%.*]] = insertelement <8 x i32> [[TMP59]], i32 [[TMP60]], i32 3 +; AVX1-NEXT: [[TMP62:%.*]] = extractelement <8 x i32> [[TMP4]], i32 4 +; AVX1-NEXT: [[TMP63:%.*]] = insertelement <8 x i32> [[TMP61]], i32 [[TMP62]], i32 4 +; AVX1-NEXT: [[TMP64:%.*]] = extractelement <8 x i32> [[TMP4]], i32 5 +; AVX1-NEXT: [[TMP65:%.*]] = insertelement <8 x i32> [[TMP63]], i32 [[TMP64]], i32 5 +; AVX1-NEXT: [[TMP66:%.*]] = extractelement <8 x i32> [[TMP4]], i32 6 +; AVX1-NEXT: [[TMP67:%.*]] = insertelement <8 x i32> [[TMP65]], i32 [[TMP66]], i32 6 +; AVX1-NEXT: [[TMP68:%.*]] = extractelement <8 x i32> [[TMP4]], i32 7 +; AVX1-NEXT: [[TMP69:%.*]] = insertelement <8 x i32> [[TMP67]], i32 [[TMP68]], i32 7 +; AVX1-NEXT: [[TMP70:%.*]] = ashr <8 x i32> [[TMP53]], [[TMP69]] +; 
AVX1-NEXT: store <8 x i32> [[TMP37]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX1-NEXT: store <8 x i32> [[TMP70]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @ashr_v16i32( @@ -340,134 +436,150 @@ define void @ashr_v32i16() { ; SSE-LABEL: @ashr_v32i16( -; SSE-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2 -; SSE-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2 -; SSE-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2 -; SSE-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2 -; SSE-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2 -; SSE-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2 -; SSE-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2 -; SSE-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2 -; SSE-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2 -; SSE-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2 -; SSE-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2 -; SSE-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2 -; SSE-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2 -; SSE-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2 -; SSE-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2 -; SSE-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2 -; SSE-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2 -; SSE-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2 -; SSE-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2 -; SSE-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2 -; SSE-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2 -; SSE-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2 -; SSE-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2 -; SSE-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2 -; SSE-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2 -; SSE-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2 -; SSE-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds 
([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2 -; SSE-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2 -; SSE-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2 -; SSE-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2 -; SSE-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2 -; SSE-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2 -; SSE-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2 -; SSE-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2 -; SSE-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2 -; SSE-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2 -; SSE-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2 -; SSE-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2 -; SSE-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2 -; SSE-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2 -; SSE-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2 -; SSE-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2 -; SSE-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2 -; SSE-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2 -; SSE-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2 -; SSE-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2 -; SSE-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2 -; SSE-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2 -; SSE-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2 -; SSE-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2 -; SSE-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2 -; SSE-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2 -; SSE-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2 -; SSE-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2 -; SSE-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2 -; SSE-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2 -; SSE-NEXT: [[B24:%.*]] = load i16, 
i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2 -; SSE-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2 -; SSE-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2 -; SSE-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2 -; SSE-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2 -; SSE-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2 -; SSE-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2 -; SSE-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2 -; SSE-NEXT: [[R0:%.*]] = ashr i16 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = ashr i16 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = ashr i16 [[A2]], [[B2]] -; SSE-NEXT: [[R3:%.*]] = ashr i16 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = ashr i16 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = ashr i16 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = ashr i16 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = ashr i16 [[A7]], [[B7]] -; SSE-NEXT: [[R8:%.*]] = ashr i16 [[A8]], [[B8]] -; SSE-NEXT: [[R9:%.*]] = ashr i16 [[A9]], [[B9]] -; SSE-NEXT: [[R10:%.*]] = ashr i16 [[A10]], [[B10]] -; SSE-NEXT: [[R11:%.*]] = ashr i16 [[A11]], [[B11]] -; SSE-NEXT: [[R12:%.*]] = ashr i16 [[A12]], [[B12]] -; SSE-NEXT: [[R13:%.*]] = ashr i16 [[A13]], [[B13]] -; SSE-NEXT: [[R14:%.*]] = ashr i16 [[A14]], [[B14]] -; SSE-NEXT: [[R15:%.*]] = ashr i16 [[A15]], [[B15]] -; SSE-NEXT: [[R16:%.*]] = ashr i16 [[A16]], [[B16]] -; SSE-NEXT: [[R17:%.*]] = ashr i16 [[A17]], [[B17]] -; SSE-NEXT: [[R18:%.*]] = ashr i16 [[A18]], [[B18]] -; SSE-NEXT: [[R19:%.*]] = ashr i16 [[A19]], [[B19]] -; SSE-NEXT: [[R20:%.*]] = ashr i16 [[A20]], [[B20]] -; SSE-NEXT: [[R21:%.*]] = ashr i16 [[A21]], [[B21]] -; SSE-NEXT: [[R22:%.*]] = ashr i16 [[A22]], [[B22]] -; SSE-NEXT: [[R23:%.*]] = ashr i16 [[A23]], [[B23]] -; SSE-NEXT: [[R24:%.*]] = ashr i16 [[A24]], [[B24]] -; SSE-NEXT: [[R25:%.*]] = ashr i16 [[A25]], [[B25]] -; SSE-NEXT: [[R26:%.*]] = ashr i16 [[A26]], [[B26]] -; SSE-NEXT: [[R27:%.*]] = ashr i16 [[A27]], [[B27]] -; SSE-NEXT: [[R28:%.*]] = ashr i16 [[A28]], [[B28]] -; SSE-NEXT: [[R29:%.*]] = ashr i16 [[A29]], [[B29]] -; SSE-NEXT: [[R30:%.*]] = ashr i16 [[A30]], [[B30]] -; SSE-NEXT: [[R31:%.*]] = ashr i16 [[A31]], [[B31]] -; SSE-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2 -; SSE-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2 -; SSE-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2 -; SSE-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2 -; SSE-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2 -; SSE-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2 -; SSE-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2 -; SSE-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2 -; SSE-NEXT: store i16 [[R8]], i16* getelementptr inbounds 
([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2 -; SSE-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2 -; SSE-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2 -; SSE-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2 -; SSE-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2 -; SSE-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2 -; SSE-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2 -; SSE-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2 -; SSE-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2 -; SSE-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2 -; SSE-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2 -; SSE-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2 -; SSE-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2 -; SSE-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2 -; SSE-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2 -; SSE-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2 -; SSE-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2 -; SSE-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2 -; SSE-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2 -; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2 -; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2 -; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2 -; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2 -; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 
+; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <8 x i16> undef, i16 [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <8 x i16> [[TMP10]], i16 [[TMP11]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <8 x i16> [[TMP12]], i16 [[TMP13]], i32 2 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[TMP15]], i32 3 +; SSE-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <8 x i16> [[TMP16]], i16 [[TMP17]], i32 4 +; SSE-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <8 x i16> [[TMP18]], i16 [[TMP19]], i32 5 +; SSE-NEXT: [[TMP21:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <8 x i16> [[TMP20]], i16 [[TMP21]], i32 6 +; SSE-NEXT: [[TMP23:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <8 x i16> [[TMP22]], i16 [[TMP23]], i32 7 +; SSE-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP26:%.*]] = insertelement <8 x i16> undef, i16 [[TMP25]], i32 0 +; SSE-NEXT: [[TMP27:%.*]] = extractelement <8 x i16> [[TMP5]], i32 1 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <8 x i16> [[TMP26]], i16 [[TMP27]], i32 1 +; SSE-NEXT: [[TMP29:%.*]] = extractelement <8 x i16> [[TMP5]], i32 2 +; SSE-NEXT: [[TMP30:%.*]] = insertelement <8 x i16> [[TMP28]], i16 [[TMP29]], i32 2 +; SSE-NEXT: [[TMP31:%.*]] = extractelement <8 x i16> [[TMP5]], i32 3 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <8 x i16> [[TMP30]], i16 [[TMP31]], i32 3 +; SSE-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4 +; SSE-NEXT: [[TMP34:%.*]] = insertelement <8 x i16> [[TMP32]], i16 [[TMP33]], i32 4 +; SSE-NEXT: [[TMP35:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5 +; SSE-NEXT: [[TMP36:%.*]] = insertelement <8 x i16> [[TMP34]], i16 [[TMP35]], i32 5 +; SSE-NEXT: [[TMP37:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6 +; SSE-NEXT: [[TMP38:%.*]] = insertelement <8 x i16> [[TMP36]], i16 [[TMP37]], i32 6 +; SSE-NEXT: [[TMP39:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7 +; SSE-NEXT: [[TMP40:%.*]] = insertelement <8 x i16> [[TMP38]], i16 [[TMP39]], i32 7 +; SSE-NEXT: [[TMP41:%.*]] = ashr <8 x i16> [[TMP24]], [[TMP40]] +; SSE-NEXT: [[TMP42:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP43:%.*]] = insertelement <8 x i16> undef, i16 [[TMP42]], i32 0 +; SSE-NEXT: [[TMP44:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP45:%.*]] = insertelement <8 x i16> [[TMP43]], i16 [[TMP44]], i32 1 +; SSE-NEXT: [[TMP46:%.*]] = extractelement <8 x i16> [[TMP2]], i32 2 +; SSE-NEXT: [[TMP47:%.*]] = insertelement <8 x i16> [[TMP45]], i16 [[TMP46]], i32 2 +; SSE-NEXT: [[TMP48:%.*]] = extractelement <8 x i16> [[TMP2]], i32 3 +; SSE-NEXT: [[TMP49:%.*]] = insertelement <8 x i16> [[TMP47]], i16 [[TMP48]], i32 3 +; SSE-NEXT: [[TMP50:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4 +; 
SSE-NEXT: [[TMP51:%.*]] = insertelement <8 x i16> [[TMP49]], i16 [[TMP50]], i32 4 +; SSE-NEXT: [[TMP52:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5 +; SSE-NEXT: [[TMP53:%.*]] = insertelement <8 x i16> [[TMP51]], i16 [[TMP52]], i32 5 +; SSE-NEXT: [[TMP54:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6 +; SSE-NEXT: [[TMP55:%.*]] = insertelement <8 x i16> [[TMP53]], i16 [[TMP54]], i32 6 +; SSE-NEXT: [[TMP56:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +; SSE-NEXT: [[TMP57:%.*]] = insertelement <8 x i16> [[TMP55]], i16 [[TMP56]], i32 7 +; SSE-NEXT: [[TMP58:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP59:%.*]] = insertelement <8 x i16> undef, i16 [[TMP58]], i32 0 +; SSE-NEXT: [[TMP60:%.*]] = extractelement <8 x i16> [[TMP6]], i32 1 +; SSE-NEXT: [[TMP61:%.*]] = insertelement <8 x i16> [[TMP59]], i16 [[TMP60]], i32 1 +; SSE-NEXT: [[TMP62:%.*]] = extractelement <8 x i16> [[TMP6]], i32 2 +; SSE-NEXT: [[TMP63:%.*]] = insertelement <8 x i16> [[TMP61]], i16 [[TMP62]], i32 2 +; SSE-NEXT: [[TMP64:%.*]] = extractelement <8 x i16> [[TMP6]], i32 3 +; SSE-NEXT: [[TMP65:%.*]] = insertelement <8 x i16> [[TMP63]], i16 [[TMP64]], i32 3 +; SSE-NEXT: [[TMP66:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4 +; SSE-NEXT: [[TMP67:%.*]] = insertelement <8 x i16> [[TMP65]], i16 [[TMP66]], i32 4 +; SSE-NEXT: [[TMP68:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5 +; SSE-NEXT: [[TMP69:%.*]] = insertelement <8 x i16> [[TMP67]], i16 [[TMP68]], i32 5 +; SSE-NEXT: [[TMP70:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6 +; SSE-NEXT: [[TMP71:%.*]] = insertelement <8 x i16> [[TMP69]], i16 [[TMP70]], i32 6 +; SSE-NEXT: [[TMP72:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7 +; SSE-NEXT: [[TMP73:%.*]] = insertelement <8 x i16> [[TMP71]], i16 [[TMP72]], i32 7 +; SSE-NEXT: [[TMP74:%.*]] = ashr <8 x i16> [[TMP57]], [[TMP73]] +; SSE-NEXT: [[TMP75:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP76:%.*]] = insertelement <8 x i16> undef, i16 [[TMP75]], i32 0 +; SSE-NEXT: [[TMP77:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP78:%.*]] = insertelement <8 x i16> [[TMP76]], i16 [[TMP77]], i32 1 +; SSE-NEXT: [[TMP79:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[TMP80:%.*]] = insertelement <8 x i16> [[TMP78]], i16 [[TMP79]], i32 2 +; SSE-NEXT: [[TMP81:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[TMP82:%.*]] = insertelement <8 x i16> [[TMP80]], i16 [[TMP81]], i32 3 +; SSE-NEXT: [[TMP83:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; SSE-NEXT: [[TMP84:%.*]] = insertelement <8 x i16> [[TMP82]], i16 [[TMP83]], i32 4 +; SSE-NEXT: [[TMP85:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; SSE-NEXT: [[TMP86:%.*]] = insertelement <8 x i16> [[TMP84]], i16 [[TMP85]], i32 5 +; SSE-NEXT: [[TMP87:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; SSE-NEXT: [[TMP88:%.*]] = insertelement <8 x i16> [[TMP86]], i16 [[TMP87]], i32 6 +; SSE-NEXT: [[TMP89:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; SSE-NEXT: [[TMP90:%.*]] = insertelement <8 x i16> [[TMP88]], i16 [[TMP89]], i32 7 +; SSE-NEXT: [[TMP91:%.*]] = extractelement <8 x i16> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP92:%.*]] = insertelement <8 x i16> undef, i16 [[TMP91]], i32 0 +; SSE-NEXT: [[TMP93:%.*]] = extractelement <8 x i16> [[TMP7]], i32 1 +; SSE-NEXT: [[TMP94:%.*]] = insertelement <8 x i16> [[TMP92]], i16 [[TMP93]], i32 1 +; SSE-NEXT: [[TMP95:%.*]] = extractelement <8 x i16> [[TMP7]], i32 2 +; SSE-NEXT: [[TMP96:%.*]] = insertelement <8 x i16> [[TMP94]], i16 [[TMP95]], i32 2 +; 
SSE-NEXT: [[TMP97:%.*]] = extractelement <8 x i16> [[TMP7]], i32 3 +; SSE-NEXT: [[TMP98:%.*]] = insertelement <8 x i16> [[TMP96]], i16 [[TMP97]], i32 3 +; SSE-NEXT: [[TMP99:%.*]] = extractelement <8 x i16> [[TMP7]], i32 4 +; SSE-NEXT: [[TMP100:%.*]] = insertelement <8 x i16> [[TMP98]], i16 [[TMP99]], i32 4 +; SSE-NEXT: [[TMP101:%.*]] = extractelement <8 x i16> [[TMP7]], i32 5 +; SSE-NEXT: [[TMP102:%.*]] = insertelement <8 x i16> [[TMP100]], i16 [[TMP101]], i32 5 +; SSE-NEXT: [[TMP103:%.*]] = extractelement <8 x i16> [[TMP7]], i32 6 +; SSE-NEXT: [[TMP104:%.*]] = insertelement <8 x i16> [[TMP102]], i16 [[TMP103]], i32 6 +; SSE-NEXT: [[TMP105:%.*]] = extractelement <8 x i16> [[TMP7]], i32 7 +; SSE-NEXT: [[TMP106:%.*]] = insertelement <8 x i16> [[TMP104]], i16 [[TMP105]], i32 7 +; SSE-NEXT: [[TMP107:%.*]] = ashr <8 x i16> [[TMP90]], [[TMP106]] +; SSE-NEXT: [[TMP108:%.*]] = extractelement <8 x i16> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP109:%.*]] = insertelement <8 x i16> undef, i16 [[TMP108]], i32 0 +; SSE-NEXT: [[TMP110:%.*]] = extractelement <8 x i16> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP111:%.*]] = insertelement <8 x i16> [[TMP109]], i16 [[TMP110]], i32 1 +; SSE-NEXT: [[TMP112:%.*]] = extractelement <8 x i16> [[TMP4]], i32 2 +; SSE-NEXT: [[TMP113:%.*]] = insertelement <8 x i16> [[TMP111]], i16 [[TMP112]], i32 2 +; SSE-NEXT: [[TMP114:%.*]] = extractelement <8 x i16> [[TMP4]], i32 3 +; SSE-NEXT: [[TMP115:%.*]] = insertelement <8 x i16> [[TMP113]], i16 [[TMP114]], i32 3 +; SSE-NEXT: [[TMP116:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4 +; SSE-NEXT: [[TMP117:%.*]] = insertelement <8 x i16> [[TMP115]], i16 [[TMP116]], i32 4 +; SSE-NEXT: [[TMP118:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5 +; SSE-NEXT: [[TMP119:%.*]] = insertelement <8 x i16> [[TMP117]], i16 [[TMP118]], i32 5 +; SSE-NEXT: [[TMP120:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6 +; SSE-NEXT: [[TMP121:%.*]] = insertelement <8 x i16> [[TMP119]], i16 [[TMP120]], i32 6 +; SSE-NEXT: [[TMP122:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7 +; SSE-NEXT: [[TMP123:%.*]] = insertelement <8 x i16> [[TMP121]], i16 [[TMP122]], i32 7 +; SSE-NEXT: [[TMP124:%.*]] = extractelement <8 x i16> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP125:%.*]] = insertelement <8 x i16> undef, i16 [[TMP124]], i32 0 +; SSE-NEXT: [[TMP126:%.*]] = extractelement <8 x i16> [[TMP8]], i32 1 +; SSE-NEXT: [[TMP127:%.*]] = insertelement <8 x i16> [[TMP125]], i16 [[TMP126]], i32 1 +; SSE-NEXT: [[TMP128:%.*]] = extractelement <8 x i16> [[TMP8]], i32 2 +; SSE-NEXT: [[TMP129:%.*]] = insertelement <8 x i16> [[TMP127]], i16 [[TMP128]], i32 2 +; SSE-NEXT: [[TMP130:%.*]] = extractelement <8 x i16> [[TMP8]], i32 3 +; SSE-NEXT: [[TMP131:%.*]] = insertelement <8 x i16> [[TMP129]], i16 [[TMP130]], i32 3 +; SSE-NEXT: [[TMP132:%.*]] = extractelement <8 x i16> [[TMP8]], i32 4 +; SSE-NEXT: [[TMP133:%.*]] = insertelement <8 x i16> [[TMP131]], i16 [[TMP132]], i32 4 +; SSE-NEXT: [[TMP134:%.*]] = extractelement <8 x i16> [[TMP8]], i32 5 +; SSE-NEXT: [[TMP135:%.*]] = insertelement <8 x i16> [[TMP133]], i16 [[TMP134]], i32 5 +; SSE-NEXT: [[TMP136:%.*]] = extractelement <8 x i16> [[TMP8]], i32 6 +; SSE-NEXT: [[TMP137:%.*]] = insertelement <8 x i16> [[TMP135]], i16 [[TMP136]], i32 6 +; SSE-NEXT: [[TMP138:%.*]] = extractelement <8 x i16> [[TMP8]], i32 7 +; SSE-NEXT: [[TMP139:%.*]] = insertelement <8 x i16> [[TMP137]], i16 [[TMP138]], i32 7 +; SSE-NEXT: [[TMP140:%.*]] = ashr <8 x i16> [[TMP123]], [[TMP139]] +; SSE-NEXT: store <8 x i16> [[TMP41]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 
+; SSE-NEXT: store <8 x i16> [[TMP74]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP107]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP140]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @ashr_v32i16( Index: test/Transforms/SLPVectorizer/X86/shift-lshr.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/shift-lshr.ll +++ test/Transforms/SLPVectorizer/X86/shift-lshr.ll @@ -41,22 +41,46 @@ ; SSE-NEXT: ret void ; ; AVX1-LABEL: @lshr_v8i64( -; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP1]], [[TMP5]] -; AVX1-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP2]], [[TMP6]] -; AVX1-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP3]], [[TMP7]] -; AVX1-NEXT: [[TMP12:%.*]] = lshr <2 x i64> [[TMP4]], [[TMP8]] -; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0 +; AVX1-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> undef, i64 [[TMP5]], i32 0 +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> 
[[TMP1]], i32 1 +; AVX1-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[TMP7]], i32 1 +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP1]], i32 2 +; AVX1-NEXT: [[TMP10:%.*]] = insertelement <4 x i64> [[TMP8]], i64 [[TMP9]], i32 2 +; AVX1-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP1]], i32 3 +; AVX1-NEXT: [[TMP12:%.*]] = insertelement <4 x i64> [[TMP10]], i64 [[TMP11]], i32 3 +; AVX1-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> undef, i64 [[TMP13]], i32 0 +; AVX1-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; AVX1-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX1-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP17]], i32 2 +; AVX1-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX1-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP19]], i32 3 +; AVX1-NEXT: [[TMP21:%.*]] = lshr <4 x i64> [[TMP12]], [[TMP20]] +; AVX1-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 +; AVX1-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> undef, i64 [[TMP22]], i32 0 +; AVX1-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; AVX1-NEXT: [[TMP25:%.*]] = insertelement <4 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; AVX1-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; AVX1-NEXT: [[TMP27:%.*]] = insertelement <4 x i64> [[TMP25]], i64 [[TMP26]], i32 2 +; AVX1-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; AVX1-NEXT: [[TMP29:%.*]] = insertelement <4 x i64> [[TMP27]], i64 [[TMP28]], i32 3 +; AVX1-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP31:%.*]] = insertelement <4 x i64> undef, i64 [[TMP30]], i32 0 +; AVX1-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 +; AVX1-NEXT: [[TMP33:%.*]] = insertelement <4 x i64> [[TMP31]], i64 [[TMP32]], i32 1 +; AVX1-NEXT: [[TMP34:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 +; AVX1-NEXT: [[TMP35:%.*]] = insertelement <4 x i64> [[TMP33]], i64 [[TMP34]], i32 2 +; AVX1-NEXT: [[TMP36:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 +; AVX1-NEXT: [[TMP37:%.*]] = insertelement <4 x i64> [[TMP35]], i64 [[TMP36]], i32 3 +; AVX1-NEXT: [[TMP38:%.*]] = lshr <4 x i64> [[TMP29]], [[TMP37]] +; AVX1-NEXT: store <4 x i64> [[TMP21]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX1-NEXT: store <4 x i64> [[TMP38]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @lshr_v8i64( @@ -125,70 +149,86 @@ define void @lshr_v16i32() { ; SSE-LABEL: @lshr_v16i32( -; SSE-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 -; SSE-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 -; SSE-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 -; SSE-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 -; SSE-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 -; SSE-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 -; SSE-NEXT: [[A6:%.*]] = load i32, i32* 
getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 -; SSE-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 -; SSE-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 -; SSE-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 -; SSE-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 -; SSE-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 -; SSE-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 -; SSE-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 -; SSE-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 -; SSE-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 -; SSE-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4 -; SSE-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4 -; SSE-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4 -; SSE-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4 -; SSE-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4 -; SSE-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4 -; SSE-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4 -; SSE-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4 -; SSE-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4 -; SSE-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4 -; SSE-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 -; SSE-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 -; SSE-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 -; SSE-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 -; SSE-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 -; SSE-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 -; SSE-NEXT: [[R0:%.*]] = lshr i32 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = lshr i32 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = lshr i32 [[A2]], [[B2]] -; SSE-NEXT: [[R3:%.*]] = lshr i32 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = lshr i32 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = lshr i32 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = lshr i32 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = lshr i32 [[A7]], [[B7]] -; SSE-NEXT: [[R8:%.*]] = lshr i32 [[A8]], [[B8]] -; SSE-NEXT: [[R9:%.*]] = lshr i32 [[A9]], [[B9]] -; 
SSE-NEXT: [[R10:%.*]] = lshr i32 [[A10]], [[B10]] -; SSE-NEXT: [[R11:%.*]] = lshr i32 [[A11]], [[B11]] -; SSE-NEXT: [[R12:%.*]] = lshr i32 [[A12]], [[B12]] -; SSE-NEXT: [[R13:%.*]] = lshr i32 [[A13]], [[B13]] -; SSE-NEXT: [[R14:%.*]] = lshr i32 [[A14]], [[B14]] -; SSE-NEXT: [[R15:%.*]] = lshr i32 [[A15]], [[B15]] -; SSE-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 -; SSE-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 -; SSE-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 -; SSE-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 -; SSE-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 -; SSE-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 -; SSE-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 -; SSE-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 -; SSE-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 -; SSE-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 -; SSE-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 -; SSE-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 -; SSE-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 -; SSE-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 -; SSE-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 -; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; SSE-NEXT: 
[[TMP12:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP11]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP13]], i32 2 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP15]], i32 3 +; SSE-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> undef, i32 [[TMP17]], i32 0 +; SSE-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP19]], i32 1 +; SSE-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP21]], i32 2 +; SSE-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP23]], i32 3 +; SSE-NEXT: [[TMP25:%.*]] = lshr <4 x i32> [[TMP16]], [[TMP24]] +; SSE-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> undef, i32 [[TMP26]], i32 0 +; SSE-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP29:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP28]], i32 1 +; SSE-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; SSE-NEXT: [[TMP31:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[TMP30]], i32 2 +; SSE-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; SSE-NEXT: [[TMP33:%.*]] = insertelement <4 x i32> [[TMP31]], i32 [[TMP32]], i32 3 +; SSE-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP35:%.*]] = insertelement <4 x i32> undef, i32 [[TMP34]], i32 0 +; SSE-NEXT: [[TMP36:%.*]] = extractelement <4 x i32> [[TMP6]], i32 1 +; SSE-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP35]], i32 [[TMP36]], i32 1 +; SSE-NEXT: [[TMP38:%.*]] = extractelement <4 x i32> [[TMP6]], i32 2 +; SSE-NEXT: [[TMP39:%.*]] = insertelement <4 x i32> [[TMP37]], i32 [[TMP38]], i32 2 +; SSE-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[TMP6]], i32 3 +; SSE-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> [[TMP39]], i32 [[TMP40]], i32 3 +; SSE-NEXT: [[TMP42:%.*]] = lshr <4 x i32> [[TMP33]], [[TMP41]] +; SSE-NEXT: [[TMP43:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP44:%.*]] = insertelement <4 x i32> undef, i32 [[TMP43]], i32 0 +; SSE-NEXT: [[TMP45:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x i32> [[TMP44]], i32 [[TMP45]], i32 1 +; SSE-NEXT: [[TMP47:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 +; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x i32> [[TMP46]], i32 [[TMP47]], i32 2 +; SSE-NEXT: [[TMP49:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 +; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x i32> [[TMP48]], i32 [[TMP49]], i32 3 +; SSE-NEXT: [[TMP51:%.*]] = extractelement <4 x i32> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP52:%.*]] = insertelement <4 x i32> undef, i32 [[TMP51]], i32 0 +; SSE-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP7]], i32 1 +; SSE-NEXT: [[TMP54:%.*]] = insertelement <4 x i32> [[TMP52]], i32 [[TMP53]], i32 1 +; SSE-NEXT: [[TMP55:%.*]] = extractelement <4 x i32> [[TMP7]], i32 2 +; SSE-NEXT: [[TMP56:%.*]] = insertelement <4 x i32> [[TMP54]], i32 [[TMP55]], i32 2 +; SSE-NEXT: [[TMP57:%.*]] = extractelement <4 x i32> [[TMP7]], i32 3 +; SSE-NEXT: [[TMP58:%.*]] = insertelement <4 x 
i32> [[TMP56]], i32 [[TMP57]], i32 3 +; SSE-NEXT: [[TMP59:%.*]] = lshr <4 x i32> [[TMP50]], [[TMP58]] +; SSE-NEXT: [[TMP60:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP61:%.*]] = insertelement <4 x i32> undef, i32 [[TMP60]], i32 0 +; SSE-NEXT: [[TMP62:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP63:%.*]] = insertelement <4 x i32> [[TMP61]], i32 [[TMP62]], i32 1 +; SSE-NEXT: [[TMP64:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 +; SSE-NEXT: [[TMP65:%.*]] = insertelement <4 x i32> [[TMP63]], i32 [[TMP64]], i32 2 +; SSE-NEXT: [[TMP66:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; SSE-NEXT: [[TMP67:%.*]] = insertelement <4 x i32> [[TMP65]], i32 [[TMP66]], i32 3 +; SSE-NEXT: [[TMP68:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP69:%.*]] = insertelement <4 x i32> undef, i32 [[TMP68]], i32 0 +; SSE-NEXT: [[TMP70:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1 +; SSE-NEXT: [[TMP71:%.*]] = insertelement <4 x i32> [[TMP69]], i32 [[TMP70]], i32 1 +; SSE-NEXT: [[TMP72:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2 +; SSE-NEXT: [[TMP73:%.*]] = insertelement <4 x i32> [[TMP71]], i32 [[TMP72]], i32 2 +; SSE-NEXT: [[TMP74:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 +; SSE-NEXT: [[TMP75:%.*]] = insertelement <4 x i32> [[TMP73]], i32 [[TMP74]], i32 3 +; SSE-NEXT: [[TMP76:%.*]] = lshr <4 x i32> [[TMP67]], [[TMP75]] +; SSE-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP42]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP59]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP76]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; AVX-LABEL: @lshr_v16i32( @@ -289,134 +329,150 @@ define void @lshr_v32i16() { ; SSE-LABEL: @lshr_v32i16( -; SSE-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2 -; SSE-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2 -; SSE-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2 -; SSE-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2 -; SSE-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2 -; SSE-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2 -; SSE-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2 -; SSE-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2 -; SSE-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2 -; SSE-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2 -; SSE-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2 -; SSE-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2 -; SSE-NEXT: [[A12:%.*]] = load 
i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2 -; SSE-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2 -; SSE-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2 -; SSE-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2 -; SSE-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2 -; SSE-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2 -; SSE-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2 -; SSE-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2 -; SSE-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2 -; SSE-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2 -; SSE-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2 -; SSE-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2 -; SSE-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2 -; SSE-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2 -; SSE-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2 -; SSE-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2 -; SSE-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2 -; SSE-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2 -; SSE-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2 -; SSE-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2 -; SSE-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2 -; SSE-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2 -; SSE-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2 -; SSE-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2 -; SSE-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2 -; SSE-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2 -; SSE-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2 -; SSE-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2 -; SSE-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2 -; SSE-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2 -; 
SSE-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2 -; SSE-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2 -; SSE-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2 -; SSE-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2 -; SSE-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2 -; SSE-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2 -; SSE-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2 -; SSE-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2 -; SSE-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2 -; SSE-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2 -; SSE-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2 -; SSE-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2 -; SSE-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2 -; SSE-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2 -; SSE-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2 -; SSE-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2 -; SSE-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2 -; SSE-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2 -; SSE-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2 -; SSE-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2 -; SSE-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2 -; SSE-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2 -; SSE-NEXT: [[R0:%.*]] = lshr i16 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = lshr i16 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = lshr i16 [[A2]], [[B2]] -; SSE-NEXT: [[R3:%.*]] = lshr i16 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = lshr i16 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = lshr i16 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = lshr i16 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = lshr i16 [[A7]], [[B7]] -; SSE-NEXT: [[R8:%.*]] = lshr i16 [[A8]], [[B8]] -; SSE-NEXT: [[R9:%.*]] = lshr i16 [[A9]], [[B9]] -; SSE-NEXT: [[R10:%.*]] = lshr i16 [[A10]], [[B10]] -; SSE-NEXT: [[R11:%.*]] = lshr i16 [[A11]], [[B11]] -; SSE-NEXT: [[R12:%.*]] = lshr i16 [[A12]], [[B12]] -; SSE-NEXT: [[R13:%.*]] = lshr i16 [[A13]], [[B13]] -; SSE-NEXT: [[R14:%.*]] = lshr i16 [[A14]], [[B14]] -; SSE-NEXT: [[R15:%.*]] = lshr i16 [[A15]], [[B15]] -; SSE-NEXT: [[R16:%.*]] = lshr i16 [[A16]], [[B16]] -; SSE-NEXT: [[R17:%.*]] = lshr i16 [[A17]], 
[[B17]] -; SSE-NEXT: [[R18:%.*]] = lshr i16 [[A18]], [[B18]] -; SSE-NEXT: [[R19:%.*]] = lshr i16 [[A19]], [[B19]] -; SSE-NEXT: [[R20:%.*]] = lshr i16 [[A20]], [[B20]] -; SSE-NEXT: [[R21:%.*]] = lshr i16 [[A21]], [[B21]] -; SSE-NEXT: [[R22:%.*]] = lshr i16 [[A22]], [[B22]] -; SSE-NEXT: [[R23:%.*]] = lshr i16 [[A23]], [[B23]] -; SSE-NEXT: [[R24:%.*]] = lshr i16 [[A24]], [[B24]] -; SSE-NEXT: [[R25:%.*]] = lshr i16 [[A25]], [[B25]] -; SSE-NEXT: [[R26:%.*]] = lshr i16 [[A26]], [[B26]] -; SSE-NEXT: [[R27:%.*]] = lshr i16 [[A27]], [[B27]] -; SSE-NEXT: [[R28:%.*]] = lshr i16 [[A28]], [[B28]] -; SSE-NEXT: [[R29:%.*]] = lshr i16 [[A29]], [[B29]] -; SSE-NEXT: [[R30:%.*]] = lshr i16 [[A30]], [[B30]] -; SSE-NEXT: [[R31:%.*]] = lshr i16 [[A31]], [[B31]] -; SSE-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2 -; SSE-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2 -; SSE-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2 -; SSE-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2 -; SSE-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2 -; SSE-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2 -; SSE-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2 -; SSE-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2 -; SSE-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2 -; SSE-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2 -; SSE-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2 -; SSE-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2 -; SSE-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2 -; SSE-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2 -; SSE-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2 -; SSE-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2 -; SSE-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2 -; SSE-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2 -; SSE-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2 -; SSE-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2 -; SSE-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2 -; SSE-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2 -; SSE-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2 -; SSE-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2 -; SSE-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 
x i16], [32 x i16]* @c16, i32 0, i64 24), align 2 -; SSE-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2 -; SSE-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2 -; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2 -; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2 -; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2 -; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2 -; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <8 x i16> undef, i16 [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <8 x i16> [[TMP10]], i16 [[TMP11]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <8 x i16> [[TMP12]], i16 [[TMP13]], i32 2 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[TMP15]], i32 3 +; SSE-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <8 x i16> [[TMP16]], i16 [[TMP17]], i32 4 +; SSE-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <8 x i16> [[TMP18]], i16 [[TMP19]], i32 5 +; SSE-NEXT: [[TMP21:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <8 x i16> [[TMP20]], i16 [[TMP21]], i32 6 +; SSE-NEXT: [[TMP23:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <8 x i16> [[TMP22]], i16 [[TMP23]], i32 7 +; SSE-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP26:%.*]] = insertelement <8 x i16> undef, i16 [[TMP25]], i32 0 +; SSE-NEXT: [[TMP27:%.*]] = extractelement <8 x i16> [[TMP5]], i32 1 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <8 x i16> [[TMP26]], 
i16 [[TMP27]], i32 1 +; SSE-NEXT: [[TMP29:%.*]] = extractelement <8 x i16> [[TMP5]], i32 2 +; SSE-NEXT: [[TMP30:%.*]] = insertelement <8 x i16> [[TMP28]], i16 [[TMP29]], i32 2 +; SSE-NEXT: [[TMP31:%.*]] = extractelement <8 x i16> [[TMP5]], i32 3 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <8 x i16> [[TMP30]], i16 [[TMP31]], i32 3 +; SSE-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4 +; SSE-NEXT: [[TMP34:%.*]] = insertelement <8 x i16> [[TMP32]], i16 [[TMP33]], i32 4 +; SSE-NEXT: [[TMP35:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5 +; SSE-NEXT: [[TMP36:%.*]] = insertelement <8 x i16> [[TMP34]], i16 [[TMP35]], i32 5 +; SSE-NEXT: [[TMP37:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6 +; SSE-NEXT: [[TMP38:%.*]] = insertelement <8 x i16> [[TMP36]], i16 [[TMP37]], i32 6 +; SSE-NEXT: [[TMP39:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7 +; SSE-NEXT: [[TMP40:%.*]] = insertelement <8 x i16> [[TMP38]], i16 [[TMP39]], i32 7 +; SSE-NEXT: [[TMP41:%.*]] = lshr <8 x i16> [[TMP24]], [[TMP40]] +; SSE-NEXT: [[TMP42:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP43:%.*]] = insertelement <8 x i16> undef, i16 [[TMP42]], i32 0 +; SSE-NEXT: [[TMP44:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP45:%.*]] = insertelement <8 x i16> [[TMP43]], i16 [[TMP44]], i32 1 +; SSE-NEXT: [[TMP46:%.*]] = extractelement <8 x i16> [[TMP2]], i32 2 +; SSE-NEXT: [[TMP47:%.*]] = insertelement <8 x i16> [[TMP45]], i16 [[TMP46]], i32 2 +; SSE-NEXT: [[TMP48:%.*]] = extractelement <8 x i16> [[TMP2]], i32 3 +; SSE-NEXT: [[TMP49:%.*]] = insertelement <8 x i16> [[TMP47]], i16 [[TMP48]], i32 3 +; SSE-NEXT: [[TMP50:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4 +; SSE-NEXT: [[TMP51:%.*]] = insertelement <8 x i16> [[TMP49]], i16 [[TMP50]], i32 4 +; SSE-NEXT: [[TMP52:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5 +; SSE-NEXT: [[TMP53:%.*]] = insertelement <8 x i16> [[TMP51]], i16 [[TMP52]], i32 5 +; SSE-NEXT: [[TMP54:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6 +; SSE-NEXT: [[TMP55:%.*]] = insertelement <8 x i16> [[TMP53]], i16 [[TMP54]], i32 6 +; SSE-NEXT: [[TMP56:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +; SSE-NEXT: [[TMP57:%.*]] = insertelement <8 x i16> [[TMP55]], i16 [[TMP56]], i32 7 +; SSE-NEXT: [[TMP58:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP59:%.*]] = insertelement <8 x i16> undef, i16 [[TMP58]], i32 0 +; SSE-NEXT: [[TMP60:%.*]] = extractelement <8 x i16> [[TMP6]], i32 1 +; SSE-NEXT: [[TMP61:%.*]] = insertelement <8 x i16> [[TMP59]], i16 [[TMP60]], i32 1 +; SSE-NEXT: [[TMP62:%.*]] = extractelement <8 x i16> [[TMP6]], i32 2 +; SSE-NEXT: [[TMP63:%.*]] = insertelement <8 x i16> [[TMP61]], i16 [[TMP62]], i32 2 +; SSE-NEXT: [[TMP64:%.*]] = extractelement <8 x i16> [[TMP6]], i32 3 +; SSE-NEXT: [[TMP65:%.*]] = insertelement <8 x i16> [[TMP63]], i16 [[TMP64]], i32 3 +; SSE-NEXT: [[TMP66:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4 +; SSE-NEXT: [[TMP67:%.*]] = insertelement <8 x i16> [[TMP65]], i16 [[TMP66]], i32 4 +; SSE-NEXT: [[TMP68:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5 +; SSE-NEXT: [[TMP69:%.*]] = insertelement <8 x i16> [[TMP67]], i16 [[TMP68]], i32 5 +; SSE-NEXT: [[TMP70:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6 +; SSE-NEXT: [[TMP71:%.*]] = insertelement <8 x i16> [[TMP69]], i16 [[TMP70]], i32 6 +; SSE-NEXT: [[TMP72:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7 +; SSE-NEXT: [[TMP73:%.*]] = insertelement <8 x i16> [[TMP71]], i16 [[TMP72]], i32 7 +; SSE-NEXT: [[TMP74:%.*]] = lshr <8 x i16> [[TMP57]], [[TMP73]] 
+; SSE-NEXT: [[TMP75:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP76:%.*]] = insertelement <8 x i16> undef, i16 [[TMP75]], i32 0 +; SSE-NEXT: [[TMP77:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP78:%.*]] = insertelement <8 x i16> [[TMP76]], i16 [[TMP77]], i32 1 +; SSE-NEXT: [[TMP79:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[TMP80:%.*]] = insertelement <8 x i16> [[TMP78]], i16 [[TMP79]], i32 2 +; SSE-NEXT: [[TMP81:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[TMP82:%.*]] = insertelement <8 x i16> [[TMP80]], i16 [[TMP81]], i32 3 +; SSE-NEXT: [[TMP83:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; SSE-NEXT: [[TMP84:%.*]] = insertelement <8 x i16> [[TMP82]], i16 [[TMP83]], i32 4 +; SSE-NEXT: [[TMP85:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; SSE-NEXT: [[TMP86:%.*]] = insertelement <8 x i16> [[TMP84]], i16 [[TMP85]], i32 5 +; SSE-NEXT: [[TMP87:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; SSE-NEXT: [[TMP88:%.*]] = insertelement <8 x i16> [[TMP86]], i16 [[TMP87]], i32 6 +; SSE-NEXT: [[TMP89:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; SSE-NEXT: [[TMP90:%.*]] = insertelement <8 x i16> [[TMP88]], i16 [[TMP89]], i32 7 +; SSE-NEXT: [[TMP91:%.*]] = extractelement <8 x i16> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP92:%.*]] = insertelement <8 x i16> undef, i16 [[TMP91]], i32 0 +; SSE-NEXT: [[TMP93:%.*]] = extractelement <8 x i16> [[TMP7]], i32 1 +; SSE-NEXT: [[TMP94:%.*]] = insertelement <8 x i16> [[TMP92]], i16 [[TMP93]], i32 1 +; SSE-NEXT: [[TMP95:%.*]] = extractelement <8 x i16> [[TMP7]], i32 2 +; SSE-NEXT: [[TMP96:%.*]] = insertelement <8 x i16> [[TMP94]], i16 [[TMP95]], i32 2 +; SSE-NEXT: [[TMP97:%.*]] = extractelement <8 x i16> [[TMP7]], i32 3 +; SSE-NEXT: [[TMP98:%.*]] = insertelement <8 x i16> [[TMP96]], i16 [[TMP97]], i32 3 +; SSE-NEXT: [[TMP99:%.*]] = extractelement <8 x i16> [[TMP7]], i32 4 +; SSE-NEXT: [[TMP100:%.*]] = insertelement <8 x i16> [[TMP98]], i16 [[TMP99]], i32 4 +; SSE-NEXT: [[TMP101:%.*]] = extractelement <8 x i16> [[TMP7]], i32 5 +; SSE-NEXT: [[TMP102:%.*]] = insertelement <8 x i16> [[TMP100]], i16 [[TMP101]], i32 5 +; SSE-NEXT: [[TMP103:%.*]] = extractelement <8 x i16> [[TMP7]], i32 6 +; SSE-NEXT: [[TMP104:%.*]] = insertelement <8 x i16> [[TMP102]], i16 [[TMP103]], i32 6 +; SSE-NEXT: [[TMP105:%.*]] = extractelement <8 x i16> [[TMP7]], i32 7 +; SSE-NEXT: [[TMP106:%.*]] = insertelement <8 x i16> [[TMP104]], i16 [[TMP105]], i32 7 +; SSE-NEXT: [[TMP107:%.*]] = lshr <8 x i16> [[TMP90]], [[TMP106]] +; SSE-NEXT: [[TMP108:%.*]] = extractelement <8 x i16> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP109:%.*]] = insertelement <8 x i16> undef, i16 [[TMP108]], i32 0 +; SSE-NEXT: [[TMP110:%.*]] = extractelement <8 x i16> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP111:%.*]] = insertelement <8 x i16> [[TMP109]], i16 [[TMP110]], i32 1 +; SSE-NEXT: [[TMP112:%.*]] = extractelement <8 x i16> [[TMP4]], i32 2 +; SSE-NEXT: [[TMP113:%.*]] = insertelement <8 x i16> [[TMP111]], i16 [[TMP112]], i32 2 +; SSE-NEXT: [[TMP114:%.*]] = extractelement <8 x i16> [[TMP4]], i32 3 +; SSE-NEXT: [[TMP115:%.*]] = insertelement <8 x i16> [[TMP113]], i16 [[TMP114]], i32 3 +; SSE-NEXT: [[TMP116:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4 +; SSE-NEXT: [[TMP117:%.*]] = insertelement <8 x i16> [[TMP115]], i16 [[TMP116]], i32 4 +; SSE-NEXT: [[TMP118:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5 +; SSE-NEXT: [[TMP119:%.*]] = insertelement <8 x i16> [[TMP117]], i16 [[TMP118]], i32 5 +; SSE-NEXT: [[TMP120:%.*]] = extractelement <8 x 
i16> [[TMP4]], i32 6 +; SSE-NEXT: [[TMP121:%.*]] = insertelement <8 x i16> [[TMP119]], i16 [[TMP120]], i32 6 +; SSE-NEXT: [[TMP122:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7 +; SSE-NEXT: [[TMP123:%.*]] = insertelement <8 x i16> [[TMP121]], i16 [[TMP122]], i32 7 +; SSE-NEXT: [[TMP124:%.*]] = extractelement <8 x i16> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP125:%.*]] = insertelement <8 x i16> undef, i16 [[TMP124]], i32 0 +; SSE-NEXT: [[TMP126:%.*]] = extractelement <8 x i16> [[TMP8]], i32 1 +; SSE-NEXT: [[TMP127:%.*]] = insertelement <8 x i16> [[TMP125]], i16 [[TMP126]], i32 1 +; SSE-NEXT: [[TMP128:%.*]] = extractelement <8 x i16> [[TMP8]], i32 2 +; SSE-NEXT: [[TMP129:%.*]] = insertelement <8 x i16> [[TMP127]], i16 [[TMP128]], i32 2 +; SSE-NEXT: [[TMP130:%.*]] = extractelement <8 x i16> [[TMP8]], i32 3 +; SSE-NEXT: [[TMP131:%.*]] = insertelement <8 x i16> [[TMP129]], i16 [[TMP130]], i32 3 +; SSE-NEXT: [[TMP132:%.*]] = extractelement <8 x i16> [[TMP8]], i32 4 +; SSE-NEXT: [[TMP133:%.*]] = insertelement <8 x i16> [[TMP131]], i16 [[TMP132]], i32 4 +; SSE-NEXT: [[TMP134:%.*]] = extractelement <8 x i16> [[TMP8]], i32 5 +; SSE-NEXT: [[TMP135:%.*]] = insertelement <8 x i16> [[TMP133]], i16 [[TMP134]], i32 5 +; SSE-NEXT: [[TMP136:%.*]] = extractelement <8 x i16> [[TMP8]], i32 6 +; SSE-NEXT: [[TMP137:%.*]] = insertelement <8 x i16> [[TMP135]], i16 [[TMP136]], i32 6 +; SSE-NEXT: [[TMP138:%.*]] = extractelement <8 x i16> [[TMP8]], i32 7 +; SSE-NEXT: [[TMP139:%.*]] = insertelement <8 x i16> [[TMP137]], i16 [[TMP138]], i32 7 +; SSE-NEXT: [[TMP140:%.*]] = lshr <8 x i16> [[TMP123]], [[TMP139]] +; SSE-NEXT: store <8 x i16> [[TMP41]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP74]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP107]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP140]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @lshr_v32i16( Index: test/Transforms/SLPVectorizer/X86/shift-shl.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/shift-shl.ll +++ test/Transforms/SLPVectorizer/X86/shift-shl.ll @@ -41,22 +41,46 @@ ; SSE-NEXT: ret void ; ; AVX1-LABEL: @shl_v8i64( -; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: 
[[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP1]], [[TMP5]] -; AVX1-NEXT: [[TMP10:%.*]] = shl <2 x i64> [[TMP2]], [[TMP6]] -; AVX1-NEXT: [[TMP11:%.*]] = shl <2 x i64> [[TMP3]], [[TMP7]] -; AVX1-NEXT: [[TMP12:%.*]] = shl <2 x i64> [[TMP4]], [[TMP8]] -; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0 +; AVX1-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> undef, i64 [[TMP5]], i32 0 +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP1]], i32 1 +; AVX1-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[TMP7]], i32 1 +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP1]], i32 2 +; AVX1-NEXT: [[TMP10:%.*]] = insertelement <4 x i64> [[TMP8]], i64 [[TMP9]], i32 2 +; AVX1-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP1]], i32 3 +; AVX1-NEXT: [[TMP12:%.*]] = insertelement <4 x i64> [[TMP10]], i64 [[TMP11]], i32 3 +; AVX1-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> undef, i64 [[TMP13]], i32 0 +; AVX1-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; AVX1-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX1-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP17]], i32 2 +; AVX1-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX1-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP19]], i32 3 +; AVX1-NEXT: [[TMP21:%.*]] = shl <4 x i64> [[TMP12]], [[TMP20]] +; AVX1-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 +; AVX1-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> undef, i64 [[TMP22]], i32 0 +; AVX1-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; AVX1-NEXT: [[TMP25:%.*]] = insertelement <4 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; AVX1-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; AVX1-NEXT: [[TMP27:%.*]] = insertelement <4 x i64> [[TMP25]], i64 [[TMP26]], i32 2 +; AVX1-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; AVX1-NEXT: [[TMP29:%.*]] = insertelement <4 x i64> [[TMP27]], i64 [[TMP28]], i32 3 +; AVX1-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP31:%.*]] = insertelement <4 x i64> undef, 
i64 [[TMP30]], i32 0 +; AVX1-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 +; AVX1-NEXT: [[TMP33:%.*]] = insertelement <4 x i64> [[TMP31]], i64 [[TMP32]], i32 1 +; AVX1-NEXT: [[TMP34:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 +; AVX1-NEXT: [[TMP35:%.*]] = insertelement <4 x i64> [[TMP33]], i64 [[TMP34]], i32 2 +; AVX1-NEXT: [[TMP36:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 +; AVX1-NEXT: [[TMP37:%.*]] = insertelement <4 x i64> [[TMP35]], i64 [[TMP36]], i32 3 +; AVX1-NEXT: [[TMP38:%.*]] = shl <4 x i64> [[TMP29]], [[TMP37]] +; AVX1-NEXT: store <4 x i64> [[TMP21]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX1-NEXT: store <4 x i64> [[TMP38]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @shl_v8i64( @@ -241,134 +265,150 @@ define void @shl_v32i16() { ; SSE-LABEL: @shl_v32i16( -; SSE-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2 -; SSE-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2 -; SSE-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2 -; SSE-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2 -; SSE-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2 -; SSE-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2 -; SSE-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2 -; SSE-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2 -; SSE-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2 -; SSE-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2 -; SSE-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2 -; SSE-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2 -; SSE-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2 -; SSE-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2 -; SSE-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2 -; SSE-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2 -; SSE-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2 -; SSE-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2 -; SSE-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2 -; SSE-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2 -; SSE-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2 -; SSE-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), 
align 2 -; SSE-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2 -; SSE-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2 -; SSE-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2 -; SSE-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2 -; SSE-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2 -; SSE-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2 -; SSE-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2 -; SSE-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2 -; SSE-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2 -; SSE-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2 -; SSE-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2 -; SSE-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2 -; SSE-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2 -; SSE-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2 -; SSE-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2 -; SSE-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2 -; SSE-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2 -; SSE-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2 -; SSE-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2 -; SSE-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2 -; SSE-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2 -; SSE-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2 -; SSE-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2 -; SSE-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2 -; SSE-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2 -; SSE-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2 -; SSE-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2 -; SSE-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2 -; SSE-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2 -; SSE-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x 
i16]* @b16, i32 0, i64 19), align 2 -; SSE-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2 -; SSE-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2 -; SSE-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2 -; SSE-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2 -; SSE-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2 -; SSE-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2 -; SSE-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2 -; SSE-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2 -; SSE-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2 -; SSE-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2 -; SSE-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2 -; SSE-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2 -; SSE-NEXT: [[R0:%.*]] = shl i16 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = shl i16 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = shl i16 [[A2]], [[B2]] -; SSE-NEXT: [[R3:%.*]] = shl i16 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = shl i16 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = shl i16 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = shl i16 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = shl i16 [[A7]], [[B7]] -; SSE-NEXT: [[R8:%.*]] = shl i16 [[A8]], [[B8]] -; SSE-NEXT: [[R9:%.*]] = shl i16 [[A9]], [[B9]] -; SSE-NEXT: [[R10:%.*]] = shl i16 [[A10]], [[B10]] -; SSE-NEXT: [[R11:%.*]] = shl i16 [[A11]], [[B11]] -; SSE-NEXT: [[R12:%.*]] = shl i16 [[A12]], [[B12]] -; SSE-NEXT: [[R13:%.*]] = shl i16 [[A13]], [[B13]] -; SSE-NEXT: [[R14:%.*]] = shl i16 [[A14]], [[B14]] -; SSE-NEXT: [[R15:%.*]] = shl i16 [[A15]], [[B15]] -; SSE-NEXT: [[R16:%.*]] = shl i16 [[A16]], [[B16]] -; SSE-NEXT: [[R17:%.*]] = shl i16 [[A17]], [[B17]] -; SSE-NEXT: [[R18:%.*]] = shl i16 [[A18]], [[B18]] -; SSE-NEXT: [[R19:%.*]] = shl i16 [[A19]], [[B19]] -; SSE-NEXT: [[R20:%.*]] = shl i16 [[A20]], [[B20]] -; SSE-NEXT: [[R21:%.*]] = shl i16 [[A21]], [[B21]] -; SSE-NEXT: [[R22:%.*]] = shl i16 [[A22]], [[B22]] -; SSE-NEXT: [[R23:%.*]] = shl i16 [[A23]], [[B23]] -; SSE-NEXT: [[R24:%.*]] = shl i16 [[A24]], [[B24]] -; SSE-NEXT: [[R25:%.*]] = shl i16 [[A25]], [[B25]] -; SSE-NEXT: [[R26:%.*]] = shl i16 [[A26]], [[B26]] -; SSE-NEXT: [[R27:%.*]] = shl i16 [[A27]], [[B27]] -; SSE-NEXT: [[R28:%.*]] = shl i16 [[A28]], [[B28]] -; SSE-NEXT: [[R29:%.*]] = shl i16 [[A29]], [[B29]] -; SSE-NEXT: [[R30:%.*]] = shl i16 [[A30]], [[B30]] -; SSE-NEXT: [[R31:%.*]] = shl i16 [[A31]], [[B31]] -; SSE-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2 -; SSE-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2 -; SSE-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2 -; SSE-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), 
align 2 -; SSE-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2 -; SSE-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2 -; SSE-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2 -; SSE-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2 -; SSE-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2 -; SSE-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2 -; SSE-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2 -; SSE-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2 -; SSE-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2 -; SSE-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2 -; SSE-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2 -; SSE-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2 -; SSE-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2 -; SSE-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2 -; SSE-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2 -; SSE-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2 -; SSE-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2 -; SSE-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2 -; SSE-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2 -; SSE-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2 -; SSE-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2 -; SSE-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2 -; SSE-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2 -; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2 -; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2 -; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2 -; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2 -; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* 
getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <8 x i16> undef, i16 [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <8 x i16> [[TMP10]], i16 [[TMP11]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <8 x i16> [[TMP12]], i16 [[TMP13]], i32 2 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[TMP15]], i32 3 +; SSE-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <8 x i16> [[TMP16]], i16 [[TMP17]], i32 4 +; SSE-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <8 x i16> [[TMP18]], i16 [[TMP19]], i32 5 +; SSE-NEXT: [[TMP21:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <8 x i16> [[TMP20]], i16 [[TMP21]], i32 6 +; SSE-NEXT: [[TMP23:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <8 x i16> [[TMP22]], i16 [[TMP23]], i32 7 +; SSE-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP26:%.*]] = insertelement <8 x i16> undef, i16 [[TMP25]], i32 0 +; SSE-NEXT: [[TMP27:%.*]] = extractelement <8 x i16> [[TMP5]], i32 1 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <8 x i16> [[TMP26]], i16 [[TMP27]], i32 1 +; SSE-NEXT: [[TMP29:%.*]] = extractelement <8 x i16> [[TMP5]], i32 2 +; SSE-NEXT: [[TMP30:%.*]] = insertelement <8 x i16> [[TMP28]], i16 [[TMP29]], i32 2 +; SSE-NEXT: [[TMP31:%.*]] = extractelement <8 x i16> [[TMP5]], i32 3 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <8 x i16> [[TMP30]], i16 [[TMP31]], i32 3 +; SSE-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4 +; SSE-NEXT: [[TMP34:%.*]] = insertelement <8 x i16> [[TMP32]], i16 [[TMP33]], i32 4 +; SSE-NEXT: [[TMP35:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5 +; SSE-NEXT: [[TMP36:%.*]] = insertelement <8 x i16> [[TMP34]], i16 [[TMP35]], i32 5 +; SSE-NEXT: [[TMP37:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6 +; SSE-NEXT: [[TMP38:%.*]] = insertelement <8 x i16> [[TMP36]], i16 [[TMP37]], i32 6 +; SSE-NEXT: [[TMP39:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7 +; SSE-NEXT: [[TMP40:%.*]] = insertelement <8 x i16> [[TMP38]], i16 [[TMP39]], i32 7 +; SSE-NEXT: [[TMP41:%.*]] = shl <8 x i16> [[TMP24]], [[TMP40]] +; SSE-NEXT: [[TMP42:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP43:%.*]] = insertelement <8 x i16> undef, i16 [[TMP42]], i32 0 +; SSE-NEXT: 
[[TMP44:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP45:%.*]] = insertelement <8 x i16> [[TMP43]], i16 [[TMP44]], i32 1 +; SSE-NEXT: [[TMP46:%.*]] = extractelement <8 x i16> [[TMP2]], i32 2 +; SSE-NEXT: [[TMP47:%.*]] = insertelement <8 x i16> [[TMP45]], i16 [[TMP46]], i32 2 +; SSE-NEXT: [[TMP48:%.*]] = extractelement <8 x i16> [[TMP2]], i32 3 +; SSE-NEXT: [[TMP49:%.*]] = insertelement <8 x i16> [[TMP47]], i16 [[TMP48]], i32 3 +; SSE-NEXT: [[TMP50:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4 +; SSE-NEXT: [[TMP51:%.*]] = insertelement <8 x i16> [[TMP49]], i16 [[TMP50]], i32 4 +; SSE-NEXT: [[TMP52:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5 +; SSE-NEXT: [[TMP53:%.*]] = insertelement <8 x i16> [[TMP51]], i16 [[TMP52]], i32 5 +; SSE-NEXT: [[TMP54:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6 +; SSE-NEXT: [[TMP55:%.*]] = insertelement <8 x i16> [[TMP53]], i16 [[TMP54]], i32 6 +; SSE-NEXT: [[TMP56:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +; SSE-NEXT: [[TMP57:%.*]] = insertelement <8 x i16> [[TMP55]], i16 [[TMP56]], i32 7 +; SSE-NEXT: [[TMP58:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP59:%.*]] = insertelement <8 x i16> undef, i16 [[TMP58]], i32 0 +; SSE-NEXT: [[TMP60:%.*]] = extractelement <8 x i16> [[TMP6]], i32 1 +; SSE-NEXT: [[TMP61:%.*]] = insertelement <8 x i16> [[TMP59]], i16 [[TMP60]], i32 1 +; SSE-NEXT: [[TMP62:%.*]] = extractelement <8 x i16> [[TMP6]], i32 2 +; SSE-NEXT: [[TMP63:%.*]] = insertelement <8 x i16> [[TMP61]], i16 [[TMP62]], i32 2 +; SSE-NEXT: [[TMP64:%.*]] = extractelement <8 x i16> [[TMP6]], i32 3 +; SSE-NEXT: [[TMP65:%.*]] = insertelement <8 x i16> [[TMP63]], i16 [[TMP64]], i32 3 +; SSE-NEXT: [[TMP66:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4 +; SSE-NEXT: [[TMP67:%.*]] = insertelement <8 x i16> [[TMP65]], i16 [[TMP66]], i32 4 +; SSE-NEXT: [[TMP68:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5 +; SSE-NEXT: [[TMP69:%.*]] = insertelement <8 x i16> [[TMP67]], i16 [[TMP68]], i32 5 +; SSE-NEXT: [[TMP70:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6 +; SSE-NEXT: [[TMP71:%.*]] = insertelement <8 x i16> [[TMP69]], i16 [[TMP70]], i32 6 +; SSE-NEXT: [[TMP72:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7 +; SSE-NEXT: [[TMP73:%.*]] = insertelement <8 x i16> [[TMP71]], i16 [[TMP72]], i32 7 +; SSE-NEXT: [[TMP74:%.*]] = shl <8 x i16> [[TMP57]], [[TMP73]] +; SSE-NEXT: [[TMP75:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP76:%.*]] = insertelement <8 x i16> undef, i16 [[TMP75]], i32 0 +; SSE-NEXT: [[TMP77:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP78:%.*]] = insertelement <8 x i16> [[TMP76]], i16 [[TMP77]], i32 1 +; SSE-NEXT: [[TMP79:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[TMP80:%.*]] = insertelement <8 x i16> [[TMP78]], i16 [[TMP79]], i32 2 +; SSE-NEXT: [[TMP81:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[TMP82:%.*]] = insertelement <8 x i16> [[TMP80]], i16 [[TMP81]], i32 3 +; SSE-NEXT: [[TMP83:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; SSE-NEXT: [[TMP84:%.*]] = insertelement <8 x i16> [[TMP82]], i16 [[TMP83]], i32 4 +; SSE-NEXT: [[TMP85:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; SSE-NEXT: [[TMP86:%.*]] = insertelement <8 x i16> [[TMP84]], i16 [[TMP85]], i32 5 +; SSE-NEXT: [[TMP87:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; SSE-NEXT: [[TMP88:%.*]] = insertelement <8 x i16> [[TMP86]], i16 [[TMP87]], i32 6 +; SSE-NEXT: [[TMP89:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; SSE-NEXT: [[TMP90:%.*]] = 
insertelement <8 x i16> [[TMP88]], i16 [[TMP89]], i32 7 +; SSE-NEXT: [[TMP91:%.*]] = extractelement <8 x i16> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP92:%.*]] = insertelement <8 x i16> undef, i16 [[TMP91]], i32 0 +; SSE-NEXT: [[TMP93:%.*]] = extractelement <8 x i16> [[TMP7]], i32 1 +; SSE-NEXT: [[TMP94:%.*]] = insertelement <8 x i16> [[TMP92]], i16 [[TMP93]], i32 1 +; SSE-NEXT: [[TMP95:%.*]] = extractelement <8 x i16> [[TMP7]], i32 2 +; SSE-NEXT: [[TMP96:%.*]] = insertelement <8 x i16> [[TMP94]], i16 [[TMP95]], i32 2 +; SSE-NEXT: [[TMP97:%.*]] = extractelement <8 x i16> [[TMP7]], i32 3 +; SSE-NEXT: [[TMP98:%.*]] = insertelement <8 x i16> [[TMP96]], i16 [[TMP97]], i32 3 +; SSE-NEXT: [[TMP99:%.*]] = extractelement <8 x i16> [[TMP7]], i32 4 +; SSE-NEXT: [[TMP100:%.*]] = insertelement <8 x i16> [[TMP98]], i16 [[TMP99]], i32 4 +; SSE-NEXT: [[TMP101:%.*]] = extractelement <8 x i16> [[TMP7]], i32 5 +; SSE-NEXT: [[TMP102:%.*]] = insertelement <8 x i16> [[TMP100]], i16 [[TMP101]], i32 5 +; SSE-NEXT: [[TMP103:%.*]] = extractelement <8 x i16> [[TMP7]], i32 6 +; SSE-NEXT: [[TMP104:%.*]] = insertelement <8 x i16> [[TMP102]], i16 [[TMP103]], i32 6 +; SSE-NEXT: [[TMP105:%.*]] = extractelement <8 x i16> [[TMP7]], i32 7 +; SSE-NEXT: [[TMP106:%.*]] = insertelement <8 x i16> [[TMP104]], i16 [[TMP105]], i32 7 +; SSE-NEXT: [[TMP107:%.*]] = shl <8 x i16> [[TMP90]], [[TMP106]] +; SSE-NEXT: [[TMP108:%.*]] = extractelement <8 x i16> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP109:%.*]] = insertelement <8 x i16> undef, i16 [[TMP108]], i32 0 +; SSE-NEXT: [[TMP110:%.*]] = extractelement <8 x i16> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP111:%.*]] = insertelement <8 x i16> [[TMP109]], i16 [[TMP110]], i32 1 +; SSE-NEXT: [[TMP112:%.*]] = extractelement <8 x i16> [[TMP4]], i32 2 +; SSE-NEXT: [[TMP113:%.*]] = insertelement <8 x i16> [[TMP111]], i16 [[TMP112]], i32 2 +; SSE-NEXT: [[TMP114:%.*]] = extractelement <8 x i16> [[TMP4]], i32 3 +; SSE-NEXT: [[TMP115:%.*]] = insertelement <8 x i16> [[TMP113]], i16 [[TMP114]], i32 3 +; SSE-NEXT: [[TMP116:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4 +; SSE-NEXT: [[TMP117:%.*]] = insertelement <8 x i16> [[TMP115]], i16 [[TMP116]], i32 4 +; SSE-NEXT: [[TMP118:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5 +; SSE-NEXT: [[TMP119:%.*]] = insertelement <8 x i16> [[TMP117]], i16 [[TMP118]], i32 5 +; SSE-NEXT: [[TMP120:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6 +; SSE-NEXT: [[TMP121:%.*]] = insertelement <8 x i16> [[TMP119]], i16 [[TMP120]], i32 6 +; SSE-NEXT: [[TMP122:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7 +; SSE-NEXT: [[TMP123:%.*]] = insertelement <8 x i16> [[TMP121]], i16 [[TMP122]], i32 7 +; SSE-NEXT: [[TMP124:%.*]] = extractelement <8 x i16> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP125:%.*]] = insertelement <8 x i16> undef, i16 [[TMP124]], i32 0 +; SSE-NEXT: [[TMP126:%.*]] = extractelement <8 x i16> [[TMP8]], i32 1 +; SSE-NEXT: [[TMP127:%.*]] = insertelement <8 x i16> [[TMP125]], i16 [[TMP126]], i32 1 +; SSE-NEXT: [[TMP128:%.*]] = extractelement <8 x i16> [[TMP8]], i32 2 +; SSE-NEXT: [[TMP129:%.*]] = insertelement <8 x i16> [[TMP127]], i16 [[TMP128]], i32 2 +; SSE-NEXT: [[TMP130:%.*]] = extractelement <8 x i16> [[TMP8]], i32 3 +; SSE-NEXT: [[TMP131:%.*]] = insertelement <8 x i16> [[TMP129]], i16 [[TMP130]], i32 3 +; SSE-NEXT: [[TMP132:%.*]] = extractelement <8 x i16> [[TMP8]], i32 4 +; SSE-NEXT: [[TMP133:%.*]] = insertelement <8 x i16> [[TMP131]], i16 [[TMP132]], i32 4 +; SSE-NEXT: [[TMP134:%.*]] = extractelement <8 x i16> [[TMP8]], i32 5 +; SSE-NEXT: [[TMP135:%.*]] = insertelement <8 x i16> 
[[TMP133]], i16 [[TMP134]], i32 5 +; SSE-NEXT: [[TMP136:%.*]] = extractelement <8 x i16> [[TMP8]], i32 6 +; SSE-NEXT: [[TMP137:%.*]] = insertelement <8 x i16> [[TMP135]], i16 [[TMP136]], i32 6 +; SSE-NEXT: [[TMP138:%.*]] = extractelement <8 x i16> [[TMP8]], i32 7 +; SSE-NEXT: [[TMP139:%.*]] = insertelement <8 x i16> [[TMP137]], i16 [[TMP138]], i32 7 +; SSE-NEXT: [[TMP140:%.*]] = shl <8 x i16> [[TMP123]], [[TMP139]] +; SSE-NEXT: store <8 x i16> [[TMP41]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP74]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP107]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP140]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @shl_v32i16( Index: test/Transforms/SLPVectorizer/X86/sitofp.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/sitofp.ll +++ test/Transforms/SLPVectorizer/X86/sitofp.ll @@ -1,8 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256DQ @@ -26,31 +23,26 @@ ; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 ; SSE-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to double ; SSE-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 ; SSE-NEXT: ret void ; +; AVX-LABEL: @sitofp_2i64_2f64( +; AVX-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 +; AVX-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x double> +; AVX-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x 
double>*), align 64 +; AVX-NEXT: ret void +; ; AVX256NODQ-LABEL: @sitofp_2i64_2f64( ; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 ; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 ; AVX256NODQ-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to double ; AVX256NODQ-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to double -; AVX256NODQ-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; AVX256NODQ-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 ; AVX256NODQ-NEXT: ret void -; -; AVX512-LABEL: @sitofp_2i64_2f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x double> -; AVX512-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @sitofp_2i64_2f64( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 -; AVX256DQ-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x double> -; AVX256DQ-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; AVX256DQ-NEXT: ret void -; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 %cvt0 = sitofp i64 %ld0 to double @@ -70,12 +62,20 @@ ; SSE-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to double ; SSE-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to double ; SSE-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; +; AVX-LABEL: @sitofp_4i64_4f64( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 +; AVX-NEXT: 
[[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double> +; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX-NEXT: ret void +; ; AVX256NODQ-LABEL: @sitofp_4i64_4f64( ; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 ; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 @@ -85,24 +85,12 @@ ; AVX256NODQ-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to double ; AVX256NODQ-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to double ; AVX256NODQ-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to double -; AVX256NODQ-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; AVX256NODQ-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = insertelement <4 x double> undef, double [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[CVT1]], i32 1 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[CVT2]], i32 2 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[CVT3]], i32 3 +; AVX256NODQ-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 ; AVX256NODQ-NEXT: ret void -; -; AVX512-LABEL: @sitofp_4i64_4f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double> -; AVX512-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 -; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @sitofp_4i64_4f64( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; AVX256DQ-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double> -; AVX256DQ-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 -; AVX256DQ-NEXT: ret void -; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 @@ -136,16 +124,35 @@ ; SSE-NEXT: [[CVT5:%.*]] = sitofp i64 [[LD5]] to double ; SSE-NEXT: [[CVT6:%.*]] = sitofp i64 [[LD6]] to double ; SSE-NEXT: [[CVT7:%.*]] = sitofp i64 [[LD7]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store double [[CVT4]], double* 
getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32 -; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16 -; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[CVT4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[CVT5]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[CVT6]], i32 0 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[CVT7]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @sitofp_8i64_8f64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x double> +; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 +; AVX512-NEXT: ret void +; +; AVX256-LABEL: @sitofp_8i64_8f64( +; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 +; AVX256-NEXT: [[TMP3:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double> +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP2]] to <4 x double> +; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 +; AVX256-NEXT: ret void +; ; AVX256NODQ-LABEL: @sitofp_8i64_8f64( ; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 ; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 @@ -163,31 +170,17 @@ ; AVX256NODQ-NEXT: [[CVT5:%.*]] = sitofp i64 [[LD5]] to double ; AVX256NODQ-NEXT: [[CVT6:%.*]] = sitofp i64 [[LD6]] to double ; AVX256NODQ-NEXT: [[CVT7:%.*]] = sitofp i64 [[LD7]] to double -; AVX256NODQ-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], 
[8 x double]* @dst64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; AVX256NODQ-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32 -; AVX256NODQ-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16 -; AVX256NODQ-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = insertelement <4 x double> undef, double [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[CVT1]], i32 1 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[CVT2]], i32 2 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[CVT3]], i32 3 +; AVX256NODQ-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256NODQ-NEXT: [[TMP5:%.*]] = insertelement <4 x double> undef, double [[CVT4]], i32 0 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[CVT5]], i32 1 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double [[CVT6]], i32 2 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = insertelement <4 x double> [[TMP7]], double [[CVT7]], i32 3 +; AVX256NODQ-NEXT: store <4 x double> [[TMP8]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256NODQ-NEXT: ret void -; -; AVX512-LABEL: @sitofp_8i64_8f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x double> -; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 -; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @sitofp_8i64_8f64( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 -; AVX256DQ-NEXT: [[TMP3:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double> -; AVX256DQ-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP2]] to <4 x double> -; AVX256DQ-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 -; AVX256DQ-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 -; AVX256DQ-NEXT: ret void -; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 @@ -221,8 +214,9 @@ 
; CHECK-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4 ; CHECK-NEXT: [[CVT0:%.*]] = sitofp i32 [[LD0]] to double ; CHECK-NEXT: [[CVT1:%.*]] = sitofp i32 [[LD1]] to double -; CHECK-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; CHECK-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 ; CHECK-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64 @@ -244,10 +238,12 @@ ; SSE-NEXT: [[CVT1:%.*]] = sitofp i32 [[LD1]] to double ; SSE-NEXT: [[CVT2:%.*]] = sitofp i32 [[LD2]] to double ; SSE-NEXT: [[CVT3:%.*]] = sitofp i32 [[LD3]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_4i32_4f64( @@ -289,16 +285,26 @@ ; SSE-NEXT: [[CVT5:%.*]] = sitofp i32 [[LD5]] to double ; SSE-NEXT: [[CVT6:%.*]] = sitofp i32 [[LD6]] to double ; SSE-NEXT: [[CVT7:%.*]] = sitofp i32 [[LD7]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32 -; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16 -; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = 
insertelement <2 x double> undef, double [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[CVT4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[CVT5]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[CVT6]], i32 0 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[CVT7]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @sitofp_8i32_8f64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x double> +; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @sitofp_8i32_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 @@ -307,12 +313,6 @@ ; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @sitofp_8i32_8f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x double> -; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64 %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4 @@ -347,8 +347,9 @@ ; CHECK-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 ; CHECK-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to double ; CHECK-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to double -; CHECK-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; CHECK-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = 
insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 ; CHECK-NEXT: ret void ; %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 @@ -370,10 +371,12 @@ ; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to double ; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to double ; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_4i16_4f64( @@ -415,16 +418,26 @@ ; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[LD5]] to double ; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[LD6]] to double ; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[LD7]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32 -; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16 -; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: 
[[TMP5:%.*]] = insertelement <2 x double> undef, double [[CVT4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[CVT5]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[CVT6]], i32 0 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[CVT7]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @sitofp_8i16_8f64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x double> +; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @sitofp_8i16_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 @@ -433,12 +446,6 @@ ; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @sitofp_8i16_8f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x double> -; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 @@ -473,8 +480,9 @@ ; CHECK-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 ; CHECK-NEXT: [[CVT0:%.*]] = sitofp i8 [[LD0]] to double ; CHECK-NEXT: [[CVT1:%.*]] = sitofp i8 [[LD1]] to double -; CHECK-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; CHECK-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 ; CHECK-NEXT: ret void ; %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 @@ -496,10 +504,12 @@ ; SSE-NEXT: [[CVT1:%.*]] = sitofp i8 [[LD1]] to double ; SSE-NEXT: [[CVT2:%.*]] = sitofp i8 [[LD2]] to double ; SSE-NEXT: [[CVT3:%.*]] = sitofp i8 [[LD3]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double 
[[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_4i8_4f64( @@ -541,16 +551,26 @@ ; SSE-NEXT: [[CVT5:%.*]] = sitofp i8 [[LD5]] to double ; SSE-NEXT: [[CVT6:%.*]] = sitofp i8 [[LD6]] to double ; SSE-NEXT: [[CVT7:%.*]] = sitofp i8 [[LD7]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32 -; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16 -; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[CVT4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[CVT5]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[CVT6]], i32 0 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[CVT7]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) 
to <2 x double>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @sitofp_8i8_8f64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x double> +; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @sitofp_8i8_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 @@ -559,12 +579,6 @@ ; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @sitofp_8i8_8f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x double> -; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 @@ -626,39 +640,34 @@ ; SSE-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float ; SSE-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to float ; SSE-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[CVT1]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[CVT2]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[CVT3]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 ; SSE-NEXT: ret void ; +; AVX-LABEL: @sitofp_4i64_4f32( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 +; AVX-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float> +; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; AVX-NEXT: ret void +; ; AVX256NODQ-LABEL: @sitofp_4i64_4f32( -; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 -; AVX256NODQ-NEXT: 
[[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float -; AVX256NODQ-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float -; AVX256NODQ-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to float -; AVX256NODQ-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to float -; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x float> +; AVX256NODQ-NEXT: [[TMP4:%.*]] = sitofp <2 x i64> [[TMP2]] to <2 x float> +; AVX256NODQ-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = insertelement <4 x float> undef, float [[TMP5]], i32 0 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP7]], i32 1 +; AVX256NODQ-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; AVX256NODQ-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP9]], i32 2 +; AVX256NODQ-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; AVX256NODQ-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP11]], i32 3 +; AVX256NODQ-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 ; AVX256NODQ-NEXT: ret void -; -; AVX512-LABEL: @sitofp_4i64_4f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float> -; AVX512-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @sitofp_4i64_4f32( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; AVX256DQ-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float> -; AVX256DQ-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; AVX256DQ-NEXT: ret void -; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 @@ -692,55 +701,47 @@ ; SSE-NEXT: [[CVT5:%.*]] = sitofp i64 [[LD5]] to float ; SSE-NEXT: [[CVT6:%.*]] = sitofp i64 [[LD6]] to float ; SSE-NEXT: [[CVT7:%.*]] = sitofp i64 [[LD7]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* 
getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 -; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 -; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[CVT1]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[CVT2]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[CVT3]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <4 x float> undef, float [[CVT4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[CVT5]], i32 1 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[CVT6]], i32 2 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[CVT7]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; +; AVX-LABEL: @sitofp_8i64_8f32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 +; AVX-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float> +; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX-NEXT: ret void +; ; AVX256NODQ-LABEL: @sitofp_8i64_8f32( -; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 -; AVX256NODQ-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32 -; AVX256NODQ-NEXT: [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16 -; AVX256NODQ-NEXT: [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float -; AVX256NODQ-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float -; AVX256NODQ-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to float -; AVX256NODQ-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to float -; AVX256NODQ-NEXT: [[CVT4:%.*]] = sitofp i64 [[LD4]] to float -; AVX256NODQ-NEXT: 
[[CVT5:%.*]] = sitofp i64 [[LD5]] to float -; AVX256NODQ-NEXT: [[CVT6:%.*]] = sitofp i64 [[LD6]] to float -; AVX256NODQ-NEXT: [[CVT7:%.*]] = sitofp i64 [[LD7]] to float -; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 -; AVX256NODQ-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float> +; AVX256NODQ-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP2]] to <4 x float> +; AVX256NODQ-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = insertelement <8 x float> undef, float [[TMP5]], i32 0 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP6]], float [[TMP7]], i32 1 +; AVX256NODQ-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 +; AVX256NODQ-NEXT: [[TMP10:%.*]] = insertelement <8 x float> [[TMP8]], float [[TMP9]], i32 2 +; AVX256NODQ-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP3]], i32 3 +; AVX256NODQ-NEXT: [[TMP12:%.*]] = insertelement <8 x float> [[TMP10]], float [[TMP11]], i32 3 +; AVX256NODQ-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP4]], i32 0 +; AVX256NODQ-NEXT: [[TMP14:%.*]] = insertelement <8 x float> [[TMP12]], float [[TMP13]], i32 4 +; AVX256NODQ-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP4]], i32 1 +; AVX256NODQ-NEXT: [[TMP16:%.*]] = insertelement <8 x float> [[TMP14]], float [[TMP15]], i32 5 +; AVX256NODQ-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP4]], i32 2 +; AVX256NODQ-NEXT: [[TMP18:%.*]] = insertelement <8 x float> [[TMP16]], float [[TMP17]], i32 6 +; AVX256NODQ-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP4]], i32 3 +; AVX256NODQ-NEXT: [[TMP20:%.*]] = insertelement <8 x float> [[TMP18]], float [[TMP19]], i32 7 +; AVX256NODQ-NEXT: store <8 x float> [[TMP20]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 ; AVX256NODQ-NEXT: ret void -; -; AVX512-LABEL: @sitofp_8i64_8f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float> -; AVX512-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 -; AVX512-NEXT: ret void 
-; -; AVX256DQ-LABEL: @sitofp_8i64_8f32( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 -; AVX256DQ-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float> -; AVX256DQ-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 -; AVX256DQ-NEXT: ret void -; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 @@ -849,6 +850,12 @@ ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @sitofp_16i32_16f32( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @src32 to <16 x i32>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i32> [[TMP1]] to <16 x float> +; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @sitofp_16i32_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32 @@ -857,12 +864,6 @@ ; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @sitofp_16i32_16f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @src32 to <16 x i32>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i32> [[TMP1]] to <16 x float> -; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0 ), align 64 %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1 ), align 4 @@ -996,6 +997,12 @@ ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @sitofp_16i16_16f32( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @src16 to <16 x i16>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i16> [[TMP1]] to <16 x float> +; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @sitofp_16i16_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16 @@ -1004,12 +1011,6 @@ ; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 ; AVX256-NEXT: store <8 x float> 
[[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @sitofp_16i16_16f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @src16 to <16 x i16>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i16> [[TMP1]] to <16 x float> -; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0 ), align 64 %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1 ), align 2 @@ -1143,6 +1144,12 @@ ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @sitofp_16i8_16f32( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @src8 to <16 x i8>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i8> [[TMP1]] to <16 x float> +; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @sitofp_16i8_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8 @@ -1151,12 +1158,6 @@ ; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @sitofp_16i8_16f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @src8 to <16 x i8>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i8> [[TMP1]] to <16 x float> -; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0 ), align 64 %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1 ), align 1 Index: test/Transforms/SLPVectorizer/X86/uitofp.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/uitofp.ll +++ test/Transforms/SLPVectorizer/X86/uitofp.ll @@ -1,8 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ ; RUN: opt < %s -mtriple=x86_64-unknown 
-mcpu=skylake-avx512 -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256DQ @@ -83,6 +80,12 @@ ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @uitofp_8i64_8f64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x double> +; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @uitofp_8i64_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 @@ -91,12 +94,6 @@ ; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @uitofp_8i64_8f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x double> -; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 @@ -131,31 +128,26 @@ ; SSE-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4 ; SSE-NEXT: [[CVT0:%.*]] = uitofp i32 [[LD0]] to double ; SSE-NEXT: [[CVT1:%.*]] = uitofp i32 [[LD1]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 ; SSE-NEXT: ret void ; +; AVX-LABEL: @uitofp_2i32_2f64( +; AVX-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64 +; AVX-NEXT: [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double> +; AVX-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; AVX-NEXT: ret void +; ; AVX256NODQ-LABEL: @uitofp_2i32_2f64( ; AVX256NODQ-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64 ; AVX256NODQ-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr 
inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i32 [[LD0]] to double
; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i32 [[LD1]] to double
-; AVX256NODQ-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; AVX256NODQ-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0
+; AVX256NODQ-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1
+; AVX256NODQ-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
; AVX256NODQ-NEXT: ret void
-;
-; AVX512-LABEL: @uitofp_2i32_2f64(
-; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64
-; AVX512-NEXT: [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double>
-; AVX512-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; AVX512-NEXT: ret void
-;
-; AVX256DQ-LABEL: @uitofp_2i32_2f64(
-; AVX256DQ-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64
-; AVX256DQ-NEXT: [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double>
-; AVX256DQ-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; AVX256DQ-NEXT: ret void
-;
%ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
%ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
%cvt0 = uitofp i32 %ld0 to double
@@ -175,10 +167,12 @@
; SSE-NEXT: [[CVT1:%.*]] = uitofp i32 [[LD1]] to double
; SSE-NEXT: [[CVT2:%.*]] = uitofp i32 [[LD2]] to double
; SSE-NEXT: [[CVT3:%.*]] = uitofp i32 [[LD3]] to double
-; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
-; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
; SSE-NEXT: ret void
;
; AVX-LABEL: @uitofp_4i32_4f64(
@@ -220,16 +214,26 @@
; SSE-NEXT: [[CVT5:%.*]] = uitofp i32 [[LD5]] to double
; SSE-NEXT: [[CVT6:%.*]] = uitofp i32 [[LD6]] to double
; SSE-NEXT: [[CVT7:%.*]] = uitofp i32 [[LD7]] to double
-; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
-; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
-; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
-; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
-; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
-; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
+; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[CVT4]], i32 0
+; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[CVT5]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
+; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[CVT6]], i32 0
+; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[CVT7]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16
; SSE-NEXT: ret void
;
+; AVX512-LABEL: @uitofp_8i32_8f64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x double>
+; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
+; AVX512-NEXT: ret void
+;
; AVX256-LABEL: @uitofp_8i32_8f64(
; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
@@ -238,12 +242,6 @@
; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
; AVX256-NEXT: ret void
-;
-; AVX512-LABEL: @uitofp_8i32_8f64(
-; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
-; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x double>
-; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
-; AVX512-NEXT: ret void
;
%ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
%ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
@@ -278,8 +276,9 @@
; CHECK-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
; CHECK-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to double
; CHECK-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to double
-; CHECK-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; CHECK-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1
+; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
; CHECK-NEXT: ret void
;
%ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
@@ -301,10 +300,12 @@
; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to double
; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[LD2]] to double
; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[LD3]] to double
-; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
-; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
; SSE-NEXT: ret void
;
; AVX-LABEL: @uitofp_4i16_4f64(
@@ -346,16 +347,26 @@
; SSE-NEXT: [[CVT5:%.*]] = uitofp i16 [[LD5]] to double
; SSE-NEXT: [[CVT6:%.*]] = uitofp i16 [[LD6]] to double
; SSE-NEXT: [[CVT7:%.*]] = uitofp i16 [[LD7]] to double
-; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
-; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
-; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
-; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
-; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
-; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
+; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[CVT4]], i32 0
+; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[CVT5]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
+; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[CVT6]], i32 0
+; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[CVT7]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16
; SSE-NEXT: ret void
;
+; AVX512-LABEL: @uitofp_8i16_8f64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x double>
+; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
+; AVX512-NEXT: ret void
+;
; AVX256-LABEL: @uitofp_8i16_8f64(
; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
@@ -364,12 +375,6 @@
; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
; AVX256-NEXT: ret void
-;
-; AVX512-LABEL: @uitofp_8i16_8f64(
-; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
-; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x double>
-; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
-; AVX512-NEXT: ret void
;
%ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
%ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
@@ -404,31 +409,26 @@
; SSE-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
; SSE-NEXT: [[CVT0:%.*]] = uitofp i8 [[LD0]] to double
; SSE-NEXT: [[CVT1:%.*]] = uitofp i8 [[LD1]] to double
-; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
; SSE-NEXT: ret void
;
+; AVX-LABEL: @uitofp_2i8_2f64(
+; AVX-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64
+; AVX-NEXT: [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double>
+; AVX-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; AVX-NEXT: ret void
+;
; AVX256NODQ-LABEL: @uitofp_2i8_2f64(
; AVX256NODQ-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
; AVX256NODQ-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i8 [[LD0]] to double
; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i8 [[LD1]] to double
-; AVX256NODQ-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; AVX256NODQ-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0
+; AVX256NODQ-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1
+; AVX256NODQ-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
; AVX256NODQ-NEXT: ret void
-;
-; AVX512-LABEL: @uitofp_2i8_2f64(
-; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64
-; AVX512-NEXT: [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double>
-; AVX512-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; AVX512-NEXT: ret void
-;
-; AVX256DQ-LABEL: @uitofp_2i8_2f64(
-; AVX256DQ-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64
-; AVX256DQ-NEXT: [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double>
-; AVX256DQ-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; AVX256DQ-NEXT: ret void
-;
%ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
%ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
%cvt0 = uitofp i8 %ld0 to double
%cvt1 = uitofp i8 %ld1 to double
@@ -448,10 +448,12 @@
; SSE-NEXT: [[CVT1:%.*]] = uitofp i8 [[LD1]] to double
; SSE-NEXT: [[CVT2:%.*]] = uitofp i8 [[LD2]] to double
; SSE-NEXT: [[CVT3:%.*]] = uitofp i8 [[LD3]] to double
-; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
-; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
; SSE-NEXT: ret void
;
; AVX-LABEL: @uitofp_4i8_4f64(
@@ -493,16 +495,26 @@
; SSE-NEXT: [[CVT5:%.*]] = uitofp i8 [[LD5]] to double
; SSE-NEXT: [[CVT6:%.*]] = uitofp i8 [[LD6]] to double
; SSE-NEXT: [[CVT7:%.*]] = uitofp i8 [[LD7]] to double
-; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
-; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
-; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
-; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
-; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
-; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
+; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[CVT4]], i32 0
+; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[CVT5]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
+; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[CVT6]], i32 0
+; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[CVT7]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16
; SSE-NEXT: ret void
;
+; AVX512-LABEL: @uitofp_8i8_8f64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x double>
+; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
+; AVX512-NEXT: ret void
+;
; AVX256-LABEL: @uitofp_8i8_8f64(
; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
@@ -511,12 +523,6 @@
; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
; AVX256-NEXT: ret void
-;
-; AVX512-LABEL: @uitofp_8i8_8f64(
-; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
-; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x double>
-; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
-; AVX512-NEXT: ret void
;
%ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
%ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
@@ -578,39 +584,34 @@
; SSE-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
; SSE-NEXT: [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
; SSE-NEXT: [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
-; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[CVT0]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[CVT1]], i32 1
+; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[CVT2]], i32 2
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[CVT3]], i32 3
+; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
; SSE-NEXT: ret void
;
+; AVX-LABEL: @uitofp_4i64_4f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
+; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; AVX-NEXT: ret void
+;
; AVX256NODQ-LABEL: @uitofp_4i64_4f32(
-; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; AVX256NODQ-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; AVX256NODQ-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
-; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
-; AVX256NODQ-NEXT: [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
-; AVX256NODQ-NEXT: [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
-; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; AVX256NODQ-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; AVX256NODQ-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; AVX256NODQ-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; AVX256NODQ-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16
+; AVX256NODQ-NEXT: [[TMP3:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x float>
+; AVX256NODQ-NEXT: [[TMP4:%.*]] = uitofp <2 x i64> [[TMP2]] to <2 x float>
+; AVX256NODQ-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; AVX256NODQ-NEXT: [[TMP6:%.*]] = insertelement <4 x float> undef, float [[TMP5]], i32 0
+; AVX256NODQ-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; AVX256NODQ-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP7]], i32 1
+; AVX256NODQ-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; AVX256NODQ-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP9]], i32 2
+; AVX256NODQ-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; AVX256NODQ-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP11]], i32 3
+; AVX256NODQ-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
; AVX256NODQ-NEXT: ret void
-;
-; AVX512-LABEL: @uitofp_4i64_4f32(
-; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
-; AVX512-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
-; AVX512-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; AVX512-NEXT: ret void
-;
-; AVX256DQ-LABEL: @uitofp_4i64_4f32(
-; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
-; AVX256DQ-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
-; AVX256DQ-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; AVX256DQ-NEXT: ret void
-;
%ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
%ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
%ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
@@ -644,55 +645,47 @@
; SSE-NEXT: [[CVT5:%.*]] = uitofp i64 [[LD5]] to float
; SSE-NEXT: [[CVT6:%.*]] = uitofp i64 [[LD6]] to float
; SSE-NEXT: [[CVT7:%.*]] = uitofp i64 [[LD7]] to float
-; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
-; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
-; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[CVT0]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[CVT1]], i32 1
+; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[CVT2]], i32 2
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[CVT3]], i32 3
+; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT: [[TMP5:%.*]] = insertelement <4 x float> undef, float [[CVT4]], i32 0
+; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[CVT5]], i32 1
+; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[CVT6]], i32 2
+; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[CVT7]], i32 3
+; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
; SSE-NEXT: ret void
;
+; AVX-LABEL: @uitofp_8i64_8f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float>
+; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX-NEXT: ret void
+;
; AVX256NODQ-LABEL: @uitofp_8i64_8f32(
-; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; AVX256NODQ-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; AVX256NODQ-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; AVX256NODQ-NEXT: [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
-; AVX256NODQ-NEXT: [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
-; AVX256NODQ-NEXT: [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
-; AVX256NODQ-NEXT: [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
-; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
-; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
-; AVX256NODQ-NEXT: [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
-; AVX256NODQ-NEXT: [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
-; AVX256NODQ-NEXT: [[CVT4:%.*]] = uitofp i64 [[LD4]] to float
-; AVX256NODQ-NEXT: [[CVT5:%.*]] = uitofp i64 [[LD5]] to float
-; AVX256NODQ-NEXT: [[CVT6:%.*]] = uitofp i64 [[LD6]] to float
-; AVX256NODQ-NEXT: [[CVT7:%.*]] = uitofp i64 [[LD7]] to float
-; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; AVX256NODQ-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; AVX256NODQ-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; AVX256NODQ-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
-; AVX256NODQ-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; AVX256NODQ-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
-; AVX256NODQ-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; AVX256NODQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX256NODQ-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
+; AVX256NODQ-NEXT: [[TMP3:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
+; AVX256NODQ-NEXT: [[TMP4:%.*]] = uitofp <4 x i64> [[TMP2]] to <4 x float>
+; AVX256NODQ-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; AVX256NODQ-NEXT: [[TMP6:%.*]] = insertelement <8 x float> undef, float [[TMP5]], i32 0
+; AVX256NODQ-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; AVX256NODQ-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP6]], float [[TMP7]], i32 1
+; AVX256NODQ-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; AVX256NODQ-NEXT: [[TMP10:%.*]] = insertelement <8 x float> [[TMP8]], float [[TMP9]], i32 2
+; AVX256NODQ-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; AVX256NODQ-NEXT: [[TMP12:%.*]] = insertelement <8 x float> [[TMP10]], float [[TMP11]], i32 3
+; AVX256NODQ-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
+; AVX256NODQ-NEXT: [[TMP14:%.*]] = insertelement <8 x float> [[TMP12]], float [[TMP13]], i32 4
+; AVX256NODQ-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+; AVX256NODQ-NEXT: [[TMP16:%.*]] = insertelement <8 x float> [[TMP14]], float [[TMP15]], i32 5
+; AVX256NODQ-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
+; AVX256NODQ-NEXT: [[TMP18:%.*]] = insertelement <8 x float> [[TMP16]], float [[TMP17]], i32 6
+; AVX256NODQ-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
+; AVX256NODQ-NEXT: [[TMP20:%.*]] = insertelement <8 x float> [[TMP18]], float [[TMP19]], i32 7
+; AVX256NODQ-NEXT: store <8 x float> [[TMP20]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; AVX256NODQ-NEXT: ret void
-;
-; AVX512-LABEL: @uitofp_8i64_8f32(
-; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
-; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float>
-; AVX512-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
-; AVX512-NEXT: ret void
-;
-; AVX256DQ-LABEL: @uitofp_8i64_8f32(
-; AVX256DQ-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
-; AVX256DQ-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float>
-; AVX256DQ-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
-; AVX256DQ-NEXT: ret void
-;
%ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
%ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
%ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
@@ -801,6 +794,12 @@
; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
; SSE-NEXT: ret void
;
+; AVX512-LABEL: @uitofp_16i32_16f32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @src32 to <16 x i32>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i32> [[TMP1]] to <16 x float>
+; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
+; AVX512-NEXT: ret void
+;
; AVX256-LABEL: @uitofp_16i32_16f32(
; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32
@@ -809,12 +808,6 @@
; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
; AVX256-NEXT: ret void
-;
-; AVX512-LABEL: @uitofp_16i32_16f32(
-; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @src32 to <16 x i32>*), align 64
-; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i32> [[TMP1]] to <16 x float>
-; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
-; AVX512-NEXT: ret void
;
%ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0 ), align 64
%ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1 ), align 4
@@ -948,6 +941,12 @@
; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
; SSE-NEXT: ret void
;
+; AVX512-LABEL: @uitofp_16i16_16f32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @src16 to <16 x i16>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i16> [[TMP1]] to <16 x float>
+; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
+; AVX512-NEXT: ret void
+;
; AVX256-LABEL: @uitofp_16i16_16f32(
; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16
@@ -956,12 +955,6 @@
; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
; AVX256-NEXT: ret void
-;
-; AVX512-LABEL: @uitofp_16i16_16f32(
-; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @src16 to <16 x i16>*), align 64
-; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i16> [[TMP1]] to <16 x float>
-; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
-; AVX512-NEXT: ret void
;
%ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0 ), align 64
%ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1 ), align 2
@@ -1095,6 +1088,12 @@
; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
; SSE-NEXT: ret void
;
+; AVX512-LABEL: @uitofp_16i8_16f32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @src8 to <16 x i8>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i8> [[TMP1]] to <16 x float>
+; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
+; AVX512-NEXT: ret void
+;
; AVX256-LABEL: @uitofp_16i8_16f32(
; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8
@@ -1103,12 +1102,6 @@
; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
; AVX256-NEXT: ret void
-;
-; AVX512-LABEL: @uitofp_16i8_16f32(
-; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @src8 to <16 x i8>*), align 64
-; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i8> [[TMP1]] to <16 x float>
-; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
-; AVX512-NEXT: ret void
;
%ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0 ), align 64
%ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1 ), align 1