Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -124,6 +124,15 @@
     cl::desc(
         "Attempt to vectorize horizontal reductions feeding into a store"));
 
+static cl::opt<bool>
+    SLPThrottle("slp-throttle", cl::init(true), cl::Hidden,
+                cl::desc("Enable partial tree vectorization with throttling"));
+
+static cl::opt<int>
+    SLPThrottleBudget("slp-throttling-budget", cl::init(32), cl::Hidden,
+                      cl::desc("Limit the total number of nodes for cost "
+                               "recalculations during throttling"));
+
 static cl::opt
     MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
                            cl::desc("Attempt to vectorize for this register size in bits"));
@@ -595,11 +604,62 @@
   /// \returns the cost incurred by unwanted spills and fills, caused by
   /// holding live values over call sites.
-  InstructionCost getSpillCost() const;
+  InstructionCost getSpillCost();
+
+  /// \returns the cost of extracting vectorized elements.
+  InstructionCost getExtractCost() const;
+
+  /// \returns the cost of gathering canceled elements to be used
+  /// by vectorized operations during throttling.
+  InstructionCost getInsertCost();
+
+  struct TECostComparator {
+    bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
+      return LHS->Cost > RHS->Cost;
+    }
+  };
+  using TEVectorizableSet = std::set<TreeEntry *, TECostComparator>;
+
+  /// Find a subtree of the whole tree suitable to be vectorized. When
+  /// vectorizing the whole tree is not profitable, we can consider vectorizing
+  /// part of that tree. The SLP algorithm looks for operations to vectorize
+  /// starting from seed instructions at the bottom and follows the chains of
+  /// dependencies to the top of the SLP graph, grouping potentially
+  /// vectorizable operations in scalar form into bundles.
+  /// For example:
+  ///
+  ///        vector form
+  ///             |
+  ///  vector form    vector form
+  ///            \      /
+  ///           vector form
+  ///
+  /// The total cost is not profitable to vectorize, hence all operations are
+  /// in scalar form.
+  ///
+  /// Here is the same tree after the SLP throttling transformation:
+  ///
+  ///        vector form
+  ///             |
+  ///  vector form    gathered nodes
+  ///            \      /
+  ///           vector form
+  ///
+  /// So, we can throttle some operations in such a way that it is still
+  /// profitable to vectorize part of the tree, while vectorizing the whole
+  /// tree does not make sense.
+  /// More details:
+  /// https://www.cl.cam.ac.uk/~tmj32/papers/docs/porpodas15-pact.pdf
+  bool findSubTree(TEVectorizableSet &Vec, InstructionCost TreeCost,
+                   InstructionCost UserCost);
+
+  /// Get the raw cost summary of all elements of the tree.
+  InstructionCost getRawTreeCost();
 
   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
-  InstructionCost getTreeCost();
+  InstructionCost getTreeCost(bool TreeReduce = false,
+                              InstructionCost UserCost = 0);
 
   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
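A note on the throttling strategy described in the findSubTree comment above: when the whole tree is not profitable, the most expensive vectorizable entries are proposed for gathering one by one, in decreasing cost order, until the remaining subtree becomes profitable. The following standalone C++ sketch illustrates only that decision loop and is not part of the patch; the Node struct, the example costs, and CostThreshold are illustrative assumptions. The real code (findSubTree/getTreeCost) additionally recomputes extract, spill, and insert costs after each cancellation and honors the -slp-throttling-budget limit.

// Minimal sketch of cost-driven throttling, assuming simplified Node/cost
// types (not the real TreeEntry/InstructionCost classes).
#include <algorithm>
#include <iostream>
#include <vector>

struct Node {
  const char *Name;
  int Cost;              // > 0 means this bundle makes the tree more expensive.
  bool Gathered = false; // true once throttling cancels its vectorization.
};

int main() {
  // A small SLP graph: the root is profitable, one operand subtree is not.
  std::vector<Node> Tree = {{"root store bundle", -4},
                            {"profitable add bundle", -2},
                            {"gather-heavy operand bundle", 8}};
  const int CostThreshold = 0; // vectorize only if the total cost is negative

  // Total cost of everything that is still planned to be vectorized.
  auto Total = [&] {
    int Sum = 0;
    for (const Node &N : Tree)
      if (!N.Gathered)
        Sum += N.Cost;
    return Sum;
  };

  // Candidates: non-root nodes with positive cost, most expensive first.
  std::vector<Node *> Candidates;
  for (size_t I = 1; I < Tree.size(); ++I)
    if (Tree[I].Cost > 0)
      Candidates.push_back(&Tree[I]);
  std::sort(Candidates.begin(), Candidates.end(),
            [](const Node *L, const Node *R) { return L->Cost > R->Cost; });

  // Cancel nodes one at a time until the remaining tree becomes profitable.
  for (Node *N : Candidates) {
    if (Total() < CostThreshold)
      break;
    N->Gathered = true; // this bundle will be gathered, not vectorized
  }

  std::cout << "total cost after throttling: " << Total() << "\n";
  for (const Node &N : Tree)
    std::cout << N.Name << (N.Gathered ? " -> gathered\n" : " -> vectorized\n");
  return 0;
}

Run as-is, the sketch reports the gather-heavy bundle as gathered while the root and the profitable bundle stay vectorized, mirroring the second diagram above.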
@@ -620,6 +680,8 @@
     ScalarToTreeEntry.clear();
     MustGather.clear();
     ExternalUses.clear();
+    InternalTreeUses.clear();
+    ProposedToGather.clear();
     NumOpsWantToKeepOrder.clear();
     NumOpsWantToKeepOriginalOrder = 0;
     for (auto &Iter : BlocksSchedules) {
@@ -628,6 +690,9 @@
     }
     MinBWs.clear();
     InstrElementSize.clear();
+    NoCallInst = true;
+    RawTreeCost = 0;
+    IsCostSumReady = false;
   }
 
   unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -790,6 +855,9 @@
   /// may not be necessary.
   bool isLoadCombineCandidate() const;
 
+  /// Cut the tree to make it partially vectorizable.
+  void cutTree();
+
   OptimizationRemarkEmitter *getORE() { return ORE; }
 
   /// This structure holds any data we need about the edges being traversed
@@ -1606,6 +1674,9 @@
     /// Does this entry require reordering?
     SmallVector ReorderIndices;
 
+    /// Cost of this tree entry.
+    InstructionCost Cost = 0;
+
     /// Points back to the VectorizableTree.
     ///
     /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
@@ -1618,6 +1689,9 @@
     /// have multiple users so the data structure is not truly a tree.
     SmallVector UserTreeIndices;
 
+    /// Tree entries that this entry uses (its operand entries).
+    TinyPtrVector<TreeEntry *> UseEntries;
+
    /// The index of this treeEntry in VectorizableTree.
    int Idx = -1;
 
@@ -1850,8 +1924,10 @@
       MustGather.insert(VL.begin(), VL.end());
     }
 
-    if (UserTreeIdx.UserTE)
+    if (UserTreeIdx.UserTE) {
       Last->UserTreeIndices.push_back(UserTreeIdx);
+      VectorizableTree[UserTreeIdx.UserTE->Idx]->UseEntries.push_back(Last);
+    }
 
     return Last;
   }
@@ -1901,6 +1977,9 @@
   };
   using UserList = SmallVector;
 
+  /// \returns the cost of extracting the vectorized elements.
+  InstructionCost getExtractOperationCost(const ExternalUser &EU) const;
+
   /// Checks if two instructions may access the same memory.
   ///
   /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
@@ -1951,6 +2030,25 @@
   /// after vectorization.
   UserList ExternalUses;
 
+  /// Tree entries that should not be vectorized due to throttling.
+  SmallPtrSet ProposedToGather;
+
+  /// Raw cost of all elements in the tree.
+  InstructionCost RawTreeCost = 0;
+
+  /// Indicates that no CallInst was found in the tree, so we don't need to
+  /// calculate the spill cost.
+  bool NoCallInst = true;
+
+  /// True if we have already calculated the cost for the tree.
+  bool IsCostSumReady = false;
+
+  /// Current operation width to vectorize.
+  unsigned BundleWidth = 0;
+
+  /// Maps in-tree users to their uses of values proposed to be vectorized.
+  SmallDenseMap<Value *, UserList> InternalTreeUses;
+
   /// Values used only by @llvm.assume calls.
   SmallPtrSet EphValues;
 
@@ -2293,6 +2391,9 @@
   /// Sets all instruction in the scheduling region to un-scheduled.
   void resetSchedule();
 
+  /// Make the scheduling region smaller.
+  void reduceSchedulingRegion(Instruction *Start, Instruction *End);
+
   BasicBlock *BB;
 
   /// Simple memory allocation for ScheduleData.
@@ -2355,6 +2456,9 @@
   /// performed in a basic block.
   void scheduleBlock(BlockScheduling *BS);
 
+  /// Remove canceled operations from the list proposed for scheduling.
+  void removeFromScheduling(BlockScheduling *BS);
+
   /// List of users to ignore during scheduling and that don't need extracting.
   ArrayRef UserIgnoreList;
 
@@ -2569,7 +2673,7 @@
   buildTree_rec(Roots, 0, EdgeInfo());
 
   // Collect the values that we need to extract from the tree.
-  for (auto &TEPtr : VectorizableTree) {
+  for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) {
     TreeEntry *Entry = TEPtr.get();
 
     // No need to handle users of gathered values.
@@ -2602,6 +2706,7 @@
         // Some in-tree scalars will remain as scalar in vectorized
         // instructions.
If that is the case, the one in Lane 0 will // be used. + InternalTreeUses[U].emplace_back(Scalar, U, FoundLane); if (UseScalar != U || UseEntry->State == TreeEntry::ScatterVectorize || !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { @@ -3328,6 +3433,50 @@ } } +void BoUpSLP::cutTree() { + SmallVector VecNodes; + + for (std::unique_ptr &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + if (Entry->State != TreeEntry::Vectorize && + Entry->State != TreeEntry::ScatterVectorize) + continue; + // For all canceled operations we should consider the possibility of + // use by with non-canceled operations and for that, it requires + // to populate ExternalUser list with canceled elements. + for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { + Value *Scalar = Entry->Scalars[Lane]; + for (User *U : Scalar->users()) { + LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); + TreeEntry *UserTE = getTreeEntry(U); + if (!UserTE || ProposedToGather.count(UserTE) == 0) + continue; + // Ignore users in the user ignore list. + auto *UserInst = dyn_cast(U); + if (!UserInst) + continue; + + if (is_contained(UserIgnoreList, UserInst)) + continue; + LLVM_DEBUG(dbgs() << "SLP: Need extract to canceled operation :" << *U + << " from lane " << Lane << " from " << *Scalar + << ".\n"); + ExternalUses.emplace_back(Scalar, U, Lane); + } + } + } + // Canceling unprofitable elements. + for (TreeEntry *Entry : ProposedToGather) { + for (Value *V : Entry->Scalars) { + ScalarToTreeEntry.erase(V); +#ifndef NDEBUG + LLVM_DEBUG(dbgs() << "SLP: Remove scalar " << *V + << " out of proposed to vectorize.\n"); +#endif + } + } +} + unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { unsigned N = 1; Type *EltTy = T; @@ -4122,12 +4271,11 @@ return true; } -InstructionCost BoUpSLP::getSpillCost() const { +InstructionCost BoUpSLP::getSpillCost() { // Walk from the bottom of the tree to the top, tracking which values are // live. When we see a call instruction that is not part of our tree, // query TTI to see if there is a cost to keeping values live over it // (for example, if spills and fills are required). - unsigned BundleWidth = VectorizableTree.front()->Scalars.size(); InstructionCost Cost = 0; SmallPtrSet LiveValues; @@ -4192,6 +4340,7 @@ } if (NumCalls) { + NoCallInst = false; SmallVector V; for (auto *II : LiveValues) V.push_back(FixedVectorType::get(II->getType(), BundleWidth)); @@ -4204,15 +4353,110 @@ return Cost; } -InstructionCost BoUpSLP::getTreeCost() { - InstructionCost Cost = 0; +InstructionCost BoUpSLP::getExtractOperationCost(const ExternalUser &EU) const { + // Uses by ephemeral values are free (because the ephemeral value will be + // removed prior to code generation, and so the extraction will be + // removed as well). + if (EphValues.count(EU.User)) + return 0; + + // If we plan to rewrite the tree in a smaller type, we will need to sign + // extend the extracted value back to the original type. Here, we account + // for the extract and the added cost of the sign extend if needed. + auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth); + Value *ScalarRoot = VectorizableTree.front()->Scalars[0]; + + auto It = MinBWs.find(ScalarRoot); + if (It != MinBWs.end()) { + uint64_t Width = It->second.first; + bool Signed = It->second.second; + auto *MinTy = IntegerType::get(F->getContext(), Width); + unsigned ExtOp = Signed ? 
Instruction::SExt : Instruction::ZExt; + VecTy = FixedVectorType::get(MinTy, BundleWidth); + return (TTI->getExtractWithExtendCost(ExtOp, EU.Scalar->getType(), VecTy, + EU.Lane)); + } + return TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); +} + +InstructionCost BoUpSLP::getExtractCost() const { + InstructionCost ExtractCost = 0; + SmallPtrSet ExtractCostCalculated; + // Consider the possibility of extracting vectorized + // values for canceled elements use. + for (TreeEntry *Entry : ProposedToGather) { + for (Value *V : Entry->Scalars) { + // Consider the possibility of extracting vectorized + // values for canceled elements use. + auto It = InternalTreeUses.find(V); + if (It != InternalTreeUses.end()) { + const UserList &UL = It->second; + for (const ExternalUser &IU : UL) + ExtractCost += getExtractOperationCost(IU); + } + } + } + for (const ExternalUser &EU : ExternalUses) { + // We only add extract cost once for the same scalar. + if (!ExtractCostCalculated.insert(EU.Scalar).second) + continue; + + ExtractCost += getExtractOperationCost(EU); + } + return ExtractCost; +} + +InstructionCost BoUpSLP::getInsertCost() { + InstructionCost InsertCost = 0; + for (TreeEntry *Entry : ProposedToGather) { + // Avoid already vectorized TreeEntries, it is already in a vector form and + // we don't need to gather those operations. + if (ProposedToGather.count(Entry) == 0) + continue; + for (Value *V : Entry->Scalars) { + auto *Inst = cast(V); + if (llvm::any_of(Inst->users(), [this](User *Op) { + return ScalarToTreeEntry.count(Op) > 0; + })) { + InsertCost += getEntryCost(Entry); + break; + } + } + } + return InsertCost; +} + +bool BoUpSLP::findSubTree(TEVectorizableSet &Vec, + InstructionCost TreeCost, + InstructionCost UserCost) { + for (const std::unique_ptr &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + // Ignore any non-vectoriable entries, entries with low cost, + // or root entry. + if ((Entry->State != TreeEntry::Vectorize && + Entry->State != TreeEntry::ScatterVectorize) || + Entry->Cost <= 0 || !Entry->Idx) + continue; + Vec.insert(Entry); + } + InstructionCost Sum = 0; + for (TreeEntry *Entry : Vec) + Sum += Entry->Cost; + // Avoid reducing the tree if there is no potential room to reduce. + if ((TreeCost - UserCost - Sum) >= -SLPCostThreshold) + return false; + + return (Vec.size() > 0); +} + +InstructionCost BoUpSLP::getRawTreeCost() { + InstructionCost CostSum = 0; + BundleWidth = VectorizableTree.front()->Scalars.size(); LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << VectorizableTree.size() << ".\n"); - unsigned BundleWidth = VectorizableTree[0]->Scalars.size(); - - for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { - TreeEntry &TE = *VectorizableTree[I].get(); + for (std::unique_ptr &TEPtr : VectorizableTree) { + TreeEntry &TE = *TEPtr.get(); // We create duplicate tree entries for gather sequences that have multiple // uses. However, we should not compute the cost of duplicate sequences. @@ -4227,69 +4471,103 @@ // existing heuristics based on tree size may yield different results. 
// if (TE.State == TreeEntry::NeedToGather && - std::any_of(std::next(VectorizableTree.begin(), I + 1), - VectorizableTree.end(), - [TE](const std::unique_ptr &EntryPtr) { - return EntryPtr->State == TreeEntry::NeedToGather && - EntryPtr->isSame(TE.Scalars); - })) + llvm::any_of(llvm::drop_begin(VectorizableTree, TE.Idx + 1), + [TE](const std::unique_ptr &EntryPtr) { + return EntryPtr->State == TreeEntry::NeedToGather && + EntryPtr->isSame(TE.Scalars); + })) continue; - InstructionCost C = getEntryCost(&TE); - Cost += C; - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + TE.Cost = getEntryCost(&TE); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << TE.Cost << " for bundle that starts with " << *TE.Scalars[0] - << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); + << ".\n"); + CostSum += TE.Cost; + LLVM_DEBUG(dbgs() << "SLP: Current total cost = " << CostSum << "\n"); } - SmallPtrSet ExtractCostCalculated; - InstructionCost ExtractCost = 0; - for (ExternalUser &EU : ExternalUses) { - // We only add extract cost once for the same scalar. - if (!ExtractCostCalculated.insert(EU.Scalar).second) - continue; - - // Uses by ephemeral values are free (because the ephemeral value will be - // removed prior to code generation, and so the extraction will be - // removed as well). - if (EphValues.count(EU.User)) + for (std::unique_ptr &TEPtr : VectorizableTree) { + TreeEntry *TE = TEPtr.get(); + if (TE->State != TreeEntry::Vectorize && + TE->State != TreeEntry::ScatterVectorize) continue; + InstructionCost GatherCost = 0; + for (TreeEntry *Gather : TE->UseEntries) + if (Gather->State != TreeEntry::Vectorize && + Gather->State != TreeEntry::ScatterVectorize) + GatherCost += Gather->Cost; + TE->Cost += GatherCost; + } + return CostSum; +} - // If we plan to rewrite the tree in a smaller type, we will need to sign - // extend the extracted value back to the original type. Here, we account - // for the extract and the added cost of the sign extend if needed. - auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth); - auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; - if (MinBWs.count(ScalarRoot)) { - auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); - auto Extend = - MinBWs[ScalarRoot].second ? 
Instruction::SExt : Instruction::ZExt; - VecTy = FixedVectorType::get(MinTy, BundleWidth); - ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), - VecTy, EU.Lane); - } else { - ExtractCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); - } +InstructionCost BoUpSLP::getTreeCost(bool TreeReduce, + InstructionCost UserCost) { + InstructionCost CostSum; + if (!IsCostSumReady) { + CostSum = getRawTreeCost(); + RawTreeCost = CostSum; + } else { + CostSum = RawTreeCost; } - InstructionCost SpillCost = getSpillCost(); - Cost += SpillCost + ExtractCost; + InstructionCost ExtractCost = getExtractCost(); + InstructionCost SpillCost = 0; + if (!NoCallInst || !IsCostSumReady) + SpillCost = getSpillCost(); + assert((!NoCallInst || getSpillCost() == 0) && "Incorrect spill cost"); + if (!IsCostSumReady) + IsCostSumReady = true; + InstructionCost InsertCost = getInsertCost(); + InstructionCost Cost = + CostSum + ExtractCost + SpillCost + InsertCost - UserCost; #ifndef NDEBUG SmallString<256> Str; - { - raw_svector_ostream OS(Str); - OS << "SLP: Spill Cost = " << SpillCost << ".\n" - << "SLP: Extract Cost = " << ExtractCost << ".\n" - << "SLP: Total Cost = " << Cost << ".\n"; - } + raw_svector_ostream OS(Str); + OS << "SLP: Spill Cost = " << SpillCost << ".\n" + << "SLP: Extract Cost = " << ExtractCost << ".\n" + << "SLP: Insert Cost = " << InsertCost << ".\n" + << "SLP: Total Cost = " << Cost << ".\n"; LLVM_DEBUG(dbgs() << Str); if (ViewSLPTree) ViewGraph(this, "SLP" + F->getName(), false, Str); #endif - + if (SLPThrottle && TreeReduce && (Cost - UserCost) >= -SLPCostThreshold) { + TEVectorizableSet Vec; + if (!findSubTree(Vec, Cost, UserCost)) + return Cost; + if (!NoCallInst && Vec.size() > SLPThrottleBudget) { + std::set::iterator It = + Vec.begin(); + std::advance(It, (unsigned)SLPThrottleBudget); + Vec.erase(It, Vec.end()); + } + + for (TreeEntry *T : Vec) { + ProposedToGather.insert(T); + T->State = TreeEntry::NeedToGather; + for (Value *V : T->Scalars) { + MustGather.insert(V); + ExternalUses.erase( + llvm::remove_if(ExternalUses, + [V](ExternalUser &EU) { return EU.Scalar == V; }), + ExternalUses.end()); + } + CostSum -= T->Cost; + ExtractCost = getExtractCost(); + if (!NoCallInst) + SpillCost = getSpillCost(); + assert((!NoCallInst || getSpillCost() == 0) && "Incorrect spill cost"); + InsertCost = getInsertCost(); + Cost = CostSum + ExtractCost + SpillCost + InsertCost - UserCost; + if (Cost < -SLPCostThreshold) { + cutTree(); + return Cost; + } + } + ProposedToGather.clear(); + } return Cost; } @@ -5136,12 +5414,25 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { // All blocks must be scheduled before any instructions are inserted. for (auto &BSIter : BlocksSchedules) { - scheduleBlock(BSIter.second.get()); + BlockScheduling *BS = BSIter.second.get(); + // Remove all Schedule Data from all nodes that we have changed + // vectorization decision. + if (!ProposedToGather.empty()) + removeFromScheduling(BS); + scheduleBlock(BS); } Builder.SetInsertPoint(&F->getEntryBlock().front()); auto *VectorRoot = vectorizeTree(VectorizableTree[0].get()); + for (std::unique_ptr &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + if ((Entry->State == TreeEntry::Vectorize || + Entry->State == TreeEntry::ScatterVectorize) && + !Entry->VectorizedValue) + vectorizeTree(Entry); + } + // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. 
InstCombine will then rewrite the entire expression. We // sign extend the extracted values below. @@ -5271,7 +5562,9 @@ #ifndef NDEBUG Type *Ty = Scalar->getType(); - if (!Ty->isVoidTy()) { + // The tree might not be fully vectorized, so we don't have to + // check every user. + if (!Ty->isVoidTy() && ProposedToGather.empty()) { for (User *U : Scalar->users()) { LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); @@ -5496,6 +5789,7 @@ BundleMember->FirstInBundle = BundleMember; ScheduleData *Next = BundleMember->NextInBundle; BundleMember->NextInBundle = nullptr; + BundleMember->TE = nullptr; BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps; if (BundleMember->UnscheduledDepsInBundle == 0) { ReadyInsts.insert(BundleMember); @@ -5764,6 +6058,85 @@ ReadyInsts.clear(); } +void BoUpSLP::BlockScheduling::reduceSchedulingRegion(Instruction *Start, + Instruction *End) { + if (Start) + ScheduleStart = Start; + if (End) + ScheduleEnd = End; +} + +void BoUpSLP::removeFromScheduling(BlockScheduling *BS) { + bool Removed = false; + SmallPtrSet Gathers; + SmallPtrSet Reduced; + Instruction *Start = nullptr; + + // We can reduce the number of instructions to be considered for scheduling, + // after cutting the tree. Here we shrink the scheduling area from the top, + // consecutively, untill we encounter the required instruction. There might be + // unnecessary NeedToGather nodes with the relationship only to other + // NeedToGather nodes and unmap instructions in chains, we could safely + // delete those. + for (std::unique_ptr &TEPtr : reverse(VectorizableTree)) { + TreeEntry *TE = TEPtr.get(); + if (TE->State != TreeEntry::NeedToGather || !TE->getOpcode() || + TE->getMainOp()->getParent() != BS->BB) + continue; + for (const EdgeInfo &EI : TE->UserTreeIndices) { + if (EI.UserTE && (EI.UserTE->State != TreeEntry::NeedToGather)) { + auto InstructionsOnly = + make_filter_range(TE->Scalars, Instruction::classof); + for (Value *V : InstructionsOnly) + Gathers.insert(cast(V)); + break; + } + } + } + + for (Instruction *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + if (!getTreeEntry(I) && !Gathers.count(I)) { + Reduced.insert(I); + } else { + Start = I; + break; + } + } + + BS->reduceSchedulingRegion(Start, nullptr); + + for (TreeEntry *Entry : ProposedToGather) { + ScheduleData *SD = BS->getScheduleData(Entry->Scalars[0]); + if (SD && SD->isPartOfBundle()) { + if (!Removed) { + Removed = true; + BS->resetSchedule(); + } + SD->IsScheduled = false; + BS->cancelScheduling(Entry->Scalars, SD->OpValue); + } + } + if (!Removed) + return; + + if (Reduced.size()) { + for (Instruction *I : Reduced) { + ScheduleData *SD = BS->getScheduleData(I); + if (SD) + SD->SchedulingRegionID = -1; + } + } + BS->resetSchedule(); + BS->initialFillReadyList(BS->ReadyInsts); + for (Instruction *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end()) + continue; + BS->doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); }); + } +} + void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!BS->ScheduleStart) return; @@ -6293,7 +6666,7 @@ R.computeMinimumValueSizes(); - InstructionCost Cost = R.getTreeCost(); + InstructionCost Cost = R.getTreeCost(true); LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF =" << VF << "\n"); if (Cost < -SLPCostThreshold) { @@ -6499,6 +6872,7 @@ // Check that all of the parts are instructions of the same type, // we permit an alternate opcode via 
InstructionsState. InstructionsState S = getSameOpcode(VL); + if (!S.getOpcode()) return false; @@ -6593,7 +6967,7 @@ continue; R.computeMinimumValueSizes(); - InstructionCost Cost = R.getTreeCost(); + InstructionCost UserCost = 0; CandidateFound = true; if (CompensateUseCost) { // TODO: Use TTI's getScalarizationOverhead for sequence of inserts @@ -6623,7 +6997,6 @@ // Switching to the TTI interface might help a bit. // Alternative solution could be pattern-match to detect a no-op or // shuffle. - InstructionCost UserCost = 0; for (unsigned Lane = 0; Lane < OpsWidth; Lane++) { auto *IE = cast(InsertUses[I + Lane]); if (auto *CI = dyn_cast(IE->getOperand(2))) @@ -6632,8 +7005,8 @@ } LLVM_DEBUG(dbgs() << "SLP: Compensate cost of users by: " << UserCost << ".\n"); - Cost -= UserCost; } + InstructionCost Cost = R.getTreeCost(true, UserCost); MinCost = std::min(MinCost, Cost); Index: llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -S -slp-vectorizer -instcombine -pass-remarks-output=%t | FileCheck %s +; RUN: opt < %s -S -slp-vectorizer -instcombine -pass-remarks-output=%t -slp-throttle=false | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=REMARK %s -; RUN: opt < %s -S -aa-pipeline=basic-aa -passes='slp-vectorizer,instcombine' -pass-remarks-output=%t | FileCheck %s +; RUN: opt < %s -S -aa-pipeline=basic-aa -passes='slp-vectorizer,instcombine' -pass-remarks-output=%t -slp-throttle=false | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=REMARK %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" Index: llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -204,11 +204,15 @@ ; MAX-COST-LABEL: @PR32038( ; MAX-COST-NEXT: entry: ; MAX-COST-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1 -; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer ; MAX-COST-NEXT: [[P4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 -; MAX-COST-NEXT: [[P5:%.*]] = icmp eq i8 [[P4]], 0 ; MAX-COST-NEXT: [[P6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 -; MAX-COST-NEXT: [[P7:%.*]] = icmp eq i8 [[P6]], 0 +; MAX-COST-NEXT: [[TMP1:%.*]] = extractelement <2 x i8> [[TMP0]], i32 0 +; MAX-COST-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i32 0 +; MAX-COST-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP0]], i32 1 +; MAX-COST-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[TMP3]], i32 1 +; MAX-COST-NEXT: [[TMP5:%.*]] = insertelement <4 x i8> [[TMP4]], i8 [[P4]], i32 2 +; MAX-COST-NEXT: [[TMP6:%.*]] = insertelement <4 x i8> [[TMP5]], i8 [[P6]], i32 3 +; MAX-COST-NEXT: [[TMP7:%.*]] = icmp eq <4 x i8> [[TMP6]], zeroinitializer ; MAX-COST-NEXT: [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 ; MAX-COST-NEXT: [[P9:%.*]] = icmp eq i8 [[P8]], 0 ; MAX-COST-NEXT: [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 
x i8], [80 x i8]* @a, i64 0, i64 6), align 2 @@ -220,19 +224,21 @@ ; MAX-COST-NEXT: br label [[FOR_BODY:%.*]] ; MAX-COST: for.body: ; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; MAX-COST-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 -; MAX-COST-NEXT: [[TMP3:%.*]] = insertelement <4 x i1> poison, i1 [[TMP2]], i32 0 -; MAX-COST-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 -; MAX-COST-NEXT: [[TMP5:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[TMP4]], i32 1 -; MAX-COST-NEXT: [[TMP6:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[P5]], i32 2 -; MAX-COST-NEXT: [[TMP7:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[P7]], i32 3 -; MAX-COST-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> , <4 x i32> +; MAX-COST-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3 +; MAX-COST-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP7]], i32 0 +; MAX-COST-NEXT: [[TMP10:%.*]] = insertelement <4 x i1> poison, i1 [[TMP9]], i32 0 +; MAX-COST-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP7]], i32 1 +; MAX-COST-NEXT: [[TMP12:%.*]] = insertelement <4 x i1> [[TMP10]], i1 [[TMP11]], i32 1 +; MAX-COST-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP7]], i32 2 +; MAX-COST-NEXT: [[TMP14:%.*]] = insertelement <4 x i1> [[TMP12]], i1 [[TMP13]], i32 2 +; MAX-COST-NEXT: [[TMP15:%.*]] = insertelement <4 x i1> [[TMP14]], i1 [[TMP8]], i32 3 +; MAX-COST-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> , <4 x i32> ; MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) -; MAX-COST-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[P27]] -; MAX-COST-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[P29]] -; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP11]], -5 +; MAX-COST-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP16]]) +; MAX-COST-NEXT: [[TMP18:%.*]] = add i32 [[TMP17]], [[P27]] +; MAX-COST-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], [[P29]] +; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP19]], -5 ; MAX-COST-NEXT: [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[P32:%.*]] = add i32 [[OP_EXTRA]], [[P31]] ; MAX-COST-NEXT: [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80 Index: llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll @@ -9,6 +9,16 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux" +; YAML: --- !Passed +; YAML-NEXT: Pass: slp-vectorizer +; YAML-NEXT: Name: VectorizedList +; YAML-NEXT: Function: test_select +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'SLP vectorized with cost ' +; YAML-NEXT: - Cost: '3' +; YAML-NEXT: - String: ' and with tree size ' +; YAML-NEXT: - TreeSize: '5' + ; YAML: --- !Passed ; YAML-NEXT: Pass: slp-vectorizer ; YAML-NEXT: Name: VectorizedHorizontalReduction @@ -19,6 +29,17 @@ ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '8' +; YAML: --- !Passed +; YAML-NEXT: Pass: slp-vectorizer +; YAML-NEXT: Name: VectorizedList +; YAML-NEXT: Function: test_select +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'SLP vectorized with cost ' +; YAML-NEXT: - Cost: '-1' +; YAML-NEXT: - String: ' and with tree size ' +; YAML-NEXT: - TreeSize: '3' + + 
define i32 @test_select(i32* noalias nocapture readonly %blk1, i32* noalias nocapture readonly %blk2, i32 %lx, i32 %h) { ; CHECK-LABEL: @test_select( ; CHECK-NEXT: entry: @@ -28,35 +49,40 @@ ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[LX:%.*]] to i64 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[S_026:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[J_025:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[P2_024:%.*]] = phi i32* [ [[BLK2:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR29:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[P1_023:%.*]] = phi i32* [ [[BLK1:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_LR_PH]] ], [ [[TMP17:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 1 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 1 ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 2 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 2 ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 3 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P1_023]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P1_023]] to <4 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 ; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[P2_024]] to <4 x i32>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]]) -; CHECK-NEXT: [[OP_EXTRA]] = add nsw i32 [[TMP8]], [[S_026]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[P2_024]] to <4 x i32>* +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <4 x i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <4 x i32> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i32, i32* [[P1_023]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR29]] = getelementptr inbounds i32, i32* [[P2_024]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[J_025]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[H]] +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 1, i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = add nsw <2 x i32> [[TMP12]], [[TMP0]] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP14]], [[H]] +; CHECK-NEXT: 
[[TMP15:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> poison, i32 [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP17]] = insertelement <2 x i32> [[TMP16]], i32 [[TMP14]], i32 1 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]] ; CHECK: for.end.loopexit: ; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: -; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP15]], [[FOR_END_LOOPEXIT]] ] ; CHECK-NEXT: ret i32 [[S_0_LCSSA]] ; entry: Index: llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll +++ llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll @@ -233,14 +233,16 @@ ; GFX9-NEXT: ret void ; ; VI-LABEL: @canonicalize_v2f16( -; VI-NEXT: [[I0:%.*]] = load half, half addrspace(3)* [[A:%.*]], align 2 -; VI-NEXT: [[CANONICALIZE0:%.*]] = call half @llvm.canonicalize.f16(half [[I0]]) -; VI-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds half, half addrspace(3)* [[A]], i64 1 -; VI-NEXT: [[I3:%.*]] = load half, half addrspace(3)* [[ARRAYIDX3]], align 2 -; VI-NEXT: [[CANONICALIZE1:%.*]] = call half @llvm.canonicalize.f16(half [[I3]]) -; VI-NEXT: store half [[CANONICALIZE0]], half addrspace(3)* [[C:%.*]], align 2 -; VI-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds half, half addrspace(3)* [[C]], i64 1 -; VI-NEXT: store half [[CANONICALIZE1]], half addrspace(3)* [[ARRAYIDX5]], align 2 +; VI-NEXT: [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)* +; VI-NEXT: [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2 +; VI-NEXT: [[TMP3:%.*]] = extractelement <2 x half> [[TMP2]], i32 0 +; VI-NEXT: [[CANONICALIZE0:%.*]] = call half @llvm.canonicalize.f16(half [[TMP3]]) +; VI-NEXT: [[TMP4:%.*]] = extractelement <2 x half> [[TMP2]], i32 1 +; VI-NEXT: [[CANONICALIZE1:%.*]] = call half @llvm.canonicalize.f16(half [[TMP4]]) +; VI-NEXT: [[TMP5:%.*]] = insertelement <2 x half> poison, half [[CANONICALIZE0]], i32 0 +; VI-NEXT: [[TMP6:%.*]] = insertelement <2 x half> [[TMP5]], half [[CANONICALIZE1]], i32 1 +; VI-NEXT: [[TMP7:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)* +; VI-NEXT: store <2 x half> [[TMP6]], <2 x half> addrspace(3)* [[TMP7]], align 2 ; VI-NEXT: ret void ; %i0 = load half, half addrspace(3)* %a, align 2 Index: llvm/test/Transforms/SLPVectorizer/X86/PR31847.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/PR31847.ll +++ llvm/test/Transforms/SLPVectorizer/X86/PR31847.ll @@ -24,53 +24,53 @@ ; CHECK-NEXT: [[Y_045:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_1:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP4]] to i32 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[CONV]], -128 ; CHECK-NEXT: [[TMP5:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1 ; CHECK-NEXT: [[CONV3:%.*]] = zext i8 [[TMP5]] to i32 -; CHECK-NEXT: [[SUB4:%.*]] = add nsw i32 [[CONV3]], -128 -; CHECK-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[SUB]], -1 -; CHECK-NEXT: [[SUB7:%.*]] = sub nsw i32 128, [[CONV]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP5]], i32 [[SUB]], i32 [[SUB7]] -; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[SUB4]], -1 -; CHECK-NEXT: [[SUB12:%.*]] = sub nsw i32 128, [[CONV3]] -; 
CHECK-NEXT: [[COND14:%.*]] = select i1 [[CMP8]], i32 [[SUB4]], i32 [[SUB12]] -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[COND14]], [[COND]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[CONV3]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[CONV]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <2 x i32> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt <2 x i32> [[TMP8]], +; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <2 x i32> , [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP9]], <2 x i32> [[TMP8]], <2 x i32> [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP13]] ; CHECK-NEXT: [[IDX_NEG:%.*]] = sub nsw i32 0, [[ADD]] ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[D1_DATA_046]], i32 [[IDX_NEG]] -; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[ADD_PTR]], align 1 -; CHECK-NEXT: [[CONV15:%.*]] = zext i8 [[TMP6]] to i32 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[ADD_PTR]], align 1 +; CHECK-NEXT: [[CONV15:%.*]] = zext i8 [[TMP14]] to i32 ; CHECK-NEXT: [[ADD16:%.*]] = add nsw i32 [[CONV15]], [[INTENSITY:%.*]] ; CHECK-NEXT: [[CONV17:%.*]] = trunc i32 [[ADD16]] to i8 ; CHECK-NEXT: store i8 [[CONV17]], i8* [[ADD_PTR]], align 1 ; CHECK-NEXT: [[ADD_PTR18:%.*]] = getelementptr inbounds i8, i8* [[D1_DATA_046]], i32 [[ADD]] -; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[ADD_PTR18]], align 1 -; CHECK-NEXT: [[NOT_TOBOOL:%.*]] = icmp eq i8 [[TMP7]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = load i8, i8* [[ADD_PTR18]], align 1 +; CHECK-NEXT: [[NOT_TOBOOL:%.*]] = icmp eq i8 [[TMP15]], 0 ; CHECK-NEXT: [[CONV21:%.*]] = zext i1 [[NOT_TOBOOL]] to i8 ; CHECK-NEXT: store i8 [[CONV21]], i8* [[ADD_PTR18]], align 1 ; CHECK-NEXT: [[ADD_PTR23:%.*]] = getelementptr inbounds i8, i8* [[D1_DATA_046]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP8]] to i32 -; CHECK-NEXT: [[SUB_1:%.*]] = add nsw i32 [[CONV_1]], -128 -; CHECK-NEXT: [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1 -; CHECK-NEXT: [[CONV3_1:%.*]] = zext i8 [[TMP9]] to i32 -; CHECK-NEXT: [[SUB4_1:%.*]] = add nsw i32 [[CONV3_1]], -128 -; CHECK-NEXT: [[CMP5_1:%.*]] = icmp sgt i32 [[SUB_1]], -1 -; CHECK-NEXT: [[SUB7_1:%.*]] = sub nsw i32 128, [[CONV_1]] -; CHECK-NEXT: [[COND_1:%.*]] = select i1 [[CMP5_1]], i32 [[SUB_1]], i32 [[SUB7_1]] -; CHECK-NEXT: [[CMP8_1:%.*]] = icmp sgt i32 [[SUB4_1]], -1 -; CHECK-NEXT: [[SUB12_1:%.*]] = sub nsw i32 128, [[CONV3_1]] -; CHECK-NEXT: [[COND14_1:%.*]] = select i1 [[CMP8_1]], i32 [[SUB4_1]], i32 [[SUB12_1]] -; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[COND14_1]], [[COND_1]] +; CHECK-NEXT: [[TMP16:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP16]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CONV3_1:%.*]] = zext i8 [[TMP17]] to i32 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> poison, i32 [[CONV3_1]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP18]], i32 [[CONV_1]], i32 1 +; CHECK-NEXT: [[TMP20:%.*]] = add nsw <2 x i32> [[TMP19]], +; CHECK-NEXT: [[TMP21:%.*]] = icmp sgt <2 x i32> [[TMP20]], +; CHECK-NEXT: [[TMP22:%.*]] = sub nsw <2 x i32> , [[TMP19]] +; CHECK-NEXT: [[TMP23:%.*]] = select <2 x i1> [[TMP21]], <2 x i32> [[TMP20]], <2 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[TMP23]], i32 
0 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i32> [[TMP23]], i32 1 +; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] ; CHECK-NEXT: [[IDX_NEG_1:%.*]] = sub nsw i32 0, [[ADD_1]] ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR23]], i32 [[IDX_NEG_1]] -; CHECK-NEXT: [[TMP10:%.*]] = load i8, i8* [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[CONV15_1:%.*]] = zext i8 [[TMP10]] to i32 +; CHECK-NEXT: [[TMP26:%.*]] = load i8, i8* [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[CONV15_1:%.*]] = zext i8 [[TMP26]] to i32 ; CHECK-NEXT: [[ADD16_1:%.*]] = add nsw i32 [[CONV15_1]], [[INTENSITY]] ; CHECK-NEXT: [[CONV17_1:%.*]] = trunc i32 [[ADD16_1]] to i8 ; CHECK-NEXT: store i8 [[CONV17_1]], i8* [[ADD_PTR_1]], align 1 ; CHECK-NEXT: [[ADD_PTR18_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR23]], i32 [[ADD_1]] -; CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[ADD_PTR18_1]], align 1 -; CHECK-NEXT: [[NOT_TOBOOL_1:%.*]] = icmp eq i8 [[TMP11]], 0 +; CHECK-NEXT: [[TMP27:%.*]] = load i8, i8* [[ADD_PTR18_1]], align 1 +; CHECK-NEXT: [[NOT_TOBOOL_1:%.*]] = icmp eq i8 [[TMP27]], 0 ; CHECK-NEXT: [[CONV21_1:%.*]] = zext i1 [[NOT_TOBOOL_1]] to i8 ; CHECK-NEXT: store i8 [[CONV21_1]], i8* [[ADD_PTR18_1]], align 1 ; CHECK-NEXT: [[ADD_PTR23_1]] = getelementptr inbounds i8, i8* [[ADD_PTR23]], i32 [[TMP1]] Index: llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -7,49 +7,65 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP15:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) -; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP4]], [[TMP0:%.*]] -; CHECK-NEXT: [[OP_EXTRA1:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA2:%.*]] = and i32 [[OP_EXTRA1]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA3:%.*]] = and i32 [[OP_EXTRA2]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA4:%.*]] = and i32 [[OP_EXTRA3]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA5:%.*]] = and i32 [[OP_EXTRA4]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA6:%.*]] = and i32 [[OP_EXTRA5]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA7:%.*]] = and i32 [[OP_EXTRA6]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA8:%.*]] = and i32 [[OP_EXTRA7]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA9:%.*]] = and i32 [[OP_EXTRA8]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA10:%.*]] = and i32 [[OP_EXTRA9]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA11:%.*]] = and i32 [[OP_EXTRA10]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA12:%.*]] = and i32 [[OP_EXTRA11]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA13:%.*]] = and i32 [[OP_EXTRA12]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA14:%.*]] = and i32 [[OP_EXTRA13]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA15:%.*]] = and i32 [[OP_EXTRA14]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA16:%.*]] = and i32 [[OP_EXTRA15]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA17:%.*]] = and i32 [[OP_EXTRA16]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA18:%.*]] = and i32 [[OP_EXTRA17]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA19:%.*]] = and i32 [[OP_EXTRA18]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA20:%.*]] = and i32 [[OP_EXTRA19]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA21:%.*]] = and i32 [[OP_EXTRA20]], 
[[TMP0]] -; CHECK-NEXT: [[OP_EXTRA22:%.*]] = and i32 [[OP_EXTRA21]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA23:%.*]] = and i32 [[OP_EXTRA22]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA24:%.*]] = and i32 [[OP_EXTRA23]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_EXTRA26]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 14910, i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = and <2 x i32> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 -; CHECK-NEXT: [[TMP15]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP14]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP19:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[VAL_0:%.*]] = add i32 [[TMP2]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[VAL_1:%.*]] = and i32 [[TMP3]], [[VAL_0]] +; CHECK-NEXT: [[VAL_2:%.*]] = and i32 [[VAL_1]], [[TMP0:%.*]] +; CHECK-NEXT: [[VAL_3:%.*]] = and i32 [[VAL_2]], [[TMP0]] +; CHECK-NEXT: [[VAL_4:%.*]] = and i32 [[VAL_3]], [[TMP0]] +; CHECK-NEXT: [[VAL_5:%.*]] = and i32 [[VAL_4]], [[TMP0]] +; CHECK-NEXT: [[VAL_6:%.*]] = add i32 [[TMP3]], 55 +; CHECK-NEXT: [[VAL_7:%.*]] = and i32 [[VAL_5]], [[VAL_6]] +; CHECK-NEXT: [[VAL_8:%.*]] = and i32 [[VAL_7]], [[TMP0]] +; CHECK-NEXT: [[VAL_9:%.*]] = and i32 [[VAL_8]], [[TMP0]] +; CHECK-NEXT: [[VAL_10:%.*]] = and i32 [[VAL_9]], [[TMP0]] +; CHECK-NEXT: [[VAL_11:%.*]] = add i32 [[TMP3]], 285 +; CHECK-NEXT: [[VAL_12:%.*]] = and i32 [[VAL_10]], [[VAL_11]] +; CHECK-NEXT: [[VAL_13:%.*]] = and i32 [[VAL_12]], [[TMP0]] +; CHECK-NEXT: [[VAL_14:%.*]] = and i32 [[VAL_13]], [[TMP0]] +; CHECK-NEXT: [[VAL_15:%.*]] = and i32 [[VAL_14]], [[TMP0]] +; CHECK-NEXT: [[VAL_16:%.*]] = and i32 [[VAL_15]], [[TMP0]] +; CHECK-NEXT: [[VAL_17:%.*]] = and i32 [[VAL_16]], [[TMP0]] +; CHECK-NEXT: [[VAL_18:%.*]] = add i32 [[TMP3]], 1240 +; CHECK-NEXT: [[VAL_19:%.*]] = and i32 [[VAL_17]], [[VAL_18]] +; CHECK-NEXT: [[VAL_20:%.*]] = add i32 [[TMP3]], 1496 +; CHECK-NEXT: [[VAL_21:%.*]] = and i32 [[VAL_19]], [[VAL_20]] +; CHECK-NEXT: [[VAL_22:%.*]] = and i32 [[VAL_21]], [[TMP0]] +; CHECK-NEXT: [[VAL_23:%.*]] = and i32 [[VAL_22]], [[TMP0]] +; CHECK-NEXT: [[VAL_24:%.*]] = and i32 [[VAL_23]], [[TMP0]] +; CHECK-NEXT: [[VAL_25:%.*]] = and i32 [[VAL_24]], [[TMP0]] +; CHECK-NEXT: [[VAL_26:%.*]] = and i32 [[VAL_25]], [[TMP0]] +; CHECK-NEXT: [[VAL_27:%.*]] = and i32 [[VAL_26]], [[TMP0]] +; CHECK-NEXT: [[VAL_28:%.*]] = and i32 [[VAL_27]], [[TMP0]] +; CHECK-NEXT: [[VAL_29:%.*]] = and i32 [[VAL_28]], [[TMP0]] +; CHECK-NEXT: [[VAL_30:%.*]] = and i32 [[VAL_29]], [[TMP0]] +; CHECK-NEXT: [[VAL_31:%.*]] = and i32 [[VAL_30]], [[TMP0]] +; CHECK-NEXT: [[VAL_32:%.*]] = and i32 [[VAL_31]], [[TMP0]] +; CHECK-NEXT: [[VAL_33:%.*]] = and i32 [[VAL_32]], [[TMP0]] +; CHECK-NEXT: [[VAL_34:%.*]] = add i32 [[TMP3]], 
8555 +; CHECK-NEXT: [[VAL_35:%.*]] = and i32 [[VAL_33]], [[VAL_34]] +; CHECK-NEXT: [[VAL_36:%.*]] = and i32 [[VAL_35]], [[TMP0]] +; CHECK-NEXT: [[VAL_37:%.*]] = and i32 [[VAL_36]], [[TMP0]] +; CHECK-NEXT: [[VAL_38:%.*]] = and i32 [[VAL_37]], [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i32> [[TMP5]], +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0 +; CHECK-NEXT: [[VAL_40:%.*]] = and i32 [[VAL_38]], [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[VAL_40]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 14910, i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = and <2 x i32> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i32> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> poison, i32 [[TMP16]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1 +; CHECK-NEXT: [[TMP19]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP18]], i32 1 ; CHECK-NEXT: br label [[LOOP]] ; ; FORCE_REDUCTION-LABEL: @Test( Index: llvm/test/Transforms/SLPVectorizer/X86/addsub.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/addsub.ll +++ llvm/test/Transforms/SLPVectorizer/X86/addsub.ll @@ -348,22 +348,24 @@ define void @no_vec_shuff_reorder() #0 { ; CHECK-LABEL: @no_vec_shuff_reorder( -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]] -; CHECK-NEXT: store float [[TMP3]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4 -; CHECK-NEXT: [[TMP6:%.*]] = fsub float [[TMP4]], [[TMP5]] -; CHECK-NEXT: store float [[TMP6]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP9:%.*]] = fadd float [[TMP7]], [[TMP8]] -; CHECK-NEXT: store float [[TMP9]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4 -; CHECK-NEXT: [[TMP11:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4 -; CHECK-NEXT: [[TMP12:%.*]] = fsub float 
[[TMP10]], [[TMP11]] -; CHECK-NEXT: store float [[TMP12]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([4 x float]* @fa to <2 x float>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([4 x float]* @fb to <2 x float>*), align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = fadd float [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP11:%.*]] = fsub float [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP14]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP8]], i32 2 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP11]], i32 3 +; CHECK-NEXT: store <4 x float> [[TMP17]], <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4 ; CHECK-NEXT: ret void ; %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4 Index: llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll +++ llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll @@ -283,70 +283,82 @@ ; SLM-NEXT: ret void ; ; AVX1-LABEL: @smul_v16i32( -; AVX1-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 -; AVX1-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 -; AVX1-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 -; AVX1-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 -; AVX1-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 -; AVX1-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 -; AVX1-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 -; AVX1-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 -; AVX1-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 -; AVX1-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x 
i32], [16 x i32]* @a32, i32 0, i64 10), align 4 -; AVX1-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 -; AVX1-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 -; AVX1-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 -; AVX1-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 -; AVX1-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 -; AVX1-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4 -; AVX1-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4 -; AVX1-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4 -; AVX1-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4 -; AVX1-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4 -; AVX1-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4 -; AVX1-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4 -; AVX1-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4 -; AVX1-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4 -; AVX1-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 -; AVX1-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 -; AVX1-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 -; AVX1-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 -; AVX1-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 -; AVX1-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 -; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A0]], i32 [[B0]], i32 3) -; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A1]], i32 [[B1]], i32 3) -; AVX1-NEXT: [[R2:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A2]], i32 [[B2]], i32 3) -; AVX1-NEXT: [[R3:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A3]], i32 [[B3]], i32 3) -; AVX1-NEXT: [[R4:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A4]], i32 [[B4]], i32 3) -; AVX1-NEXT: [[R5:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A5]], i32 [[B5]], i32 3) -; AVX1-NEXT: [[R6:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A6]], i32 [[B6]], i32 3) -; AVX1-NEXT: [[R7:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A7]], i32 [[B7]], i32 3) -; AVX1-NEXT: [[R8:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A8]], i32 [[B8]], i32 3) -; AVX1-NEXT: [[R9:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A9]], i32 [[B9]], i32 3) -; AVX1-NEXT: [[R10:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A10]], i32 [[B10]], i32 3) -; AVX1-NEXT: [[R11:%.*]] = call i32 
@llvm.smul.fix.i32(i32 [[A11]], i32 [[B11]], i32 3) -; AVX1-NEXT: [[R12:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A12]], i32 [[B12]], i32 3) -; AVX1-NEXT: [[R13:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A13]], i32 [[B13]], i32 3) -; AVX1-NEXT: [[R14:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A14]], i32 [[B14]], i32 3) -; AVX1-NEXT: [[R15:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A15]], i32 [[B15]], i32 3) -; AVX1-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 -; AVX1-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 -; AVX1-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 -; AVX1-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 -; AVX1-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 -; AVX1-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 -; AVX1-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 -; AVX1-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 -; AVX1-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 -; AVX1-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 -; AVX1-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 -; AVX1-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 -; AVX1-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 -; AVX1-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 -; AVX1-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 -; AVX1-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; AVX1-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> 
[[TMP5]], i32 0 +; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP9]], i32 [[TMP10]], i32 3) +; AVX1-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; AVX1-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1 +; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP11]], i32 [[TMP12]], i32 3) +; AVX1-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; AVX1-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2 +; AVX1-NEXT: [[R2:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP13]], i32 [[TMP14]], i32 3) +; AVX1-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; AVX1-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3 +; AVX1-NEXT: [[R3:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP15]], i32 [[TMP16]], i32 3) +; AVX1-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; AVX1-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 +; AVX1-NEXT: [[R4:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP17]], i32 [[TMP18]], i32 3) +; AVX1-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; AVX1-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP6]], i32 1 +; AVX1-NEXT: [[R5:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP19]], i32 [[TMP20]], i32 3) +; AVX1-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; AVX1-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP6]], i32 2 +; AVX1-NEXT: [[R6:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP21]], i32 [[TMP22]], i32 3) +; AVX1-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; AVX1-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP6]], i32 3 +; AVX1-NEXT: [[R7:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP23]], i32 [[TMP24]], i32 3) +; AVX1-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 +; AVX1-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP7]], i32 0 +; AVX1-NEXT: [[R8:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP25]], i32 [[TMP26]], i32 3) +; AVX1-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 +; AVX1-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP7]], i32 1 +; AVX1-NEXT: [[R9:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP27]], i32 [[TMP28]], i32 3) +; AVX1-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 +; AVX1-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP7]], i32 2 +; AVX1-NEXT: [[R10:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP29]], i32 [[TMP30]], i32 3) +; AVX1-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 +; AVX1-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP7]], i32 3 +; AVX1-NEXT: [[R11:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP31]], i32 [[TMP32]], i32 3) +; AVX1-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0 +; AVX1-NEXT: [[R12:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP33]], i32 [[TMP34]], i32 3) +; AVX1-NEXT: [[TMP35:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1 +; AVX1-NEXT: [[TMP36:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1 +; AVX1-NEXT: [[R13:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP35]], i32 [[TMP36]], i32 3) +; AVX1-NEXT: [[TMP37:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 +; AVX1-NEXT: [[TMP38:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2 +; AVX1-NEXT: [[R14:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP37]], i32 [[TMP38]], i32 3) +; AVX1-NEXT: [[TMP39:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; AVX1-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 +; AVX1-NEXT: 
[[R15:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP39]], i32 [[TMP40]], i32 3) +; AVX1-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> poison, i32 [[R0]], i32 0 +; AVX1-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[R1]], i32 1 +; AVX1-NEXT: [[TMP43:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[R2]], i32 2 +; AVX1-NEXT: [[TMP44:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[R3]], i32 3 +; AVX1-NEXT: store <4 x i32> [[TMP44]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP45:%.*]] = insertelement <4 x i32> poison, i32 [[R4]], i32 0 +; AVX1-NEXT: [[TMP46:%.*]] = insertelement <4 x i32> [[TMP45]], i32 [[R5]], i32 1 +; AVX1-NEXT: [[TMP47:%.*]] = insertelement <4 x i32> [[TMP46]], i32 [[R6]], i32 2 +; AVX1-NEXT: [[TMP48:%.*]] = insertelement <4 x i32> [[TMP47]], i32 [[R7]], i32 3 +; AVX1-NEXT: store <4 x i32> [[TMP48]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP49:%.*]] = insertelement <4 x i32> poison, i32 [[R8]], i32 0 +; AVX1-NEXT: [[TMP50:%.*]] = insertelement <4 x i32> [[TMP49]], i32 [[R9]], i32 1 +; AVX1-NEXT: [[TMP51:%.*]] = insertelement <4 x i32> [[TMP50]], i32 [[R10]], i32 2 +; AVX1-NEXT: [[TMP52:%.*]] = insertelement <4 x i32> [[TMP51]], i32 [[R11]], i32 3 +; AVX1-NEXT: store <4 x i32> [[TMP52]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP53:%.*]] = insertelement <4 x i32> poison, i32 [[R12]], i32 0 +; AVX1-NEXT: [[TMP54:%.*]] = insertelement <4 x i32> [[TMP53]], i32 [[R13]], i32 1 +; AVX1-NEXT: [[TMP55:%.*]] = insertelement <4 x i32> [[TMP54]], i32 [[R14]], i32 2 +; AVX1-NEXT: [[TMP56:%.*]] = insertelement <4 x i32> [[TMP55]], i32 [[R15]], i32 3 +; AVX1-NEXT: store <4 x i32> [[TMP56]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @smul_v16i32( @@ -1212,70 +1224,82 @@ ; SLM-NEXT: ret void ; ; AVX1-LABEL: @umul_v16i32( -; AVX1-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 -; AVX1-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 -; AVX1-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 -; AVX1-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 -; AVX1-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 -; AVX1-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 -; AVX1-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 -; AVX1-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 -; AVX1-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 -; AVX1-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 -; AVX1-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 -; AVX1-NEXT: 
[[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 -; AVX1-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 -; AVX1-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 -; AVX1-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 -; AVX1-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4 -; AVX1-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4 -; AVX1-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4 -; AVX1-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4 -; AVX1-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4 -; AVX1-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4 -; AVX1-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4 -; AVX1-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4 -; AVX1-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4 -; AVX1-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 -; AVX1-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 -; AVX1-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 -; AVX1-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 -; AVX1-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 -; AVX1-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 -; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A0]], i32 [[B0]], i32 3) -; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A1]], i32 [[B1]], i32 3) -; AVX1-NEXT: [[R2:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A2]], i32 [[B2]], i32 3) -; AVX1-NEXT: [[R3:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A3]], i32 [[B3]], i32 3) -; AVX1-NEXT: [[R4:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A4]], i32 [[B4]], i32 3) -; AVX1-NEXT: [[R5:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A5]], i32 [[B5]], i32 3) -; AVX1-NEXT: [[R6:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A6]], i32 [[B6]], i32 3) -; AVX1-NEXT: [[R7:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A7]], i32 [[B7]], i32 3) -; AVX1-NEXT: [[R8:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A8]], i32 [[B8]], i32 3) -; AVX1-NEXT: [[R9:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A9]], i32 [[B9]], i32 3) -; AVX1-NEXT: [[R10:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A10]], i32 [[B10]], i32 3) -; AVX1-NEXT: [[R11:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A11]], i32 [[B11]], i32 3) -; AVX1-NEXT: [[R12:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A12]], i32 [[B12]], i32 3) -; AVX1-NEXT: [[R13:%.*]] = call i32 
@llvm.umul.fix.i32(i32 [[A13]], i32 [[B13]], i32 3) -; AVX1-NEXT: [[R14:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A14]], i32 [[B14]], i32 3) -; AVX1-NEXT: [[R15:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A15]], i32 [[B15]], i32 3) -; AVX1-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 -; AVX1-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 -; AVX1-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 -; AVX1-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 -; AVX1-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 -; AVX1-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 -; AVX1-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 -; AVX1-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 -; AVX1-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 -; AVX1-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 -; AVX1-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 -; AVX1-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 -; AVX1-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 -; AVX1-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 -; AVX1-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 -; AVX1-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; AVX1-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP5]], i32 0 +; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP9]], i32 [[TMP10]], i32 3) +; AVX1-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 
+; AVX1-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1 +; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP11]], i32 [[TMP12]], i32 3) +; AVX1-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; AVX1-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2 +; AVX1-NEXT: [[R2:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP13]], i32 [[TMP14]], i32 3) +; AVX1-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; AVX1-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3 +; AVX1-NEXT: [[R3:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP15]], i32 [[TMP16]], i32 3) +; AVX1-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; AVX1-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 +; AVX1-NEXT: [[R4:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP17]], i32 [[TMP18]], i32 3) +; AVX1-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; AVX1-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP6]], i32 1 +; AVX1-NEXT: [[R5:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP19]], i32 [[TMP20]], i32 3) +; AVX1-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; AVX1-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP6]], i32 2 +; AVX1-NEXT: [[R6:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP21]], i32 [[TMP22]], i32 3) +; AVX1-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; AVX1-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP6]], i32 3 +; AVX1-NEXT: [[R7:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP23]], i32 [[TMP24]], i32 3) +; AVX1-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 +; AVX1-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP7]], i32 0 +; AVX1-NEXT: [[R8:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP25]], i32 [[TMP26]], i32 3) +; AVX1-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 +; AVX1-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP7]], i32 1 +; AVX1-NEXT: [[R9:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP27]], i32 [[TMP28]], i32 3) +; AVX1-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 +; AVX1-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP7]], i32 2 +; AVX1-NEXT: [[R10:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP29]], i32 [[TMP30]], i32 3) +; AVX1-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 +; AVX1-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP7]], i32 3 +; AVX1-NEXT: [[R11:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP31]], i32 [[TMP32]], i32 3) +; AVX1-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0 +; AVX1-NEXT: [[R12:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP33]], i32 [[TMP34]], i32 3) +; AVX1-NEXT: [[TMP35:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1 +; AVX1-NEXT: [[TMP36:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1 +; AVX1-NEXT: [[R13:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP35]], i32 [[TMP36]], i32 3) +; AVX1-NEXT: [[TMP37:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 +; AVX1-NEXT: [[TMP38:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2 +; AVX1-NEXT: [[R14:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP37]], i32 [[TMP38]], i32 3) +; AVX1-NEXT: [[TMP39:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; AVX1-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 +; AVX1-NEXT: [[R15:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP39]], i32 [[TMP40]], i32 3) +; AVX1-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> poison, i32 [[R0]], i32 0 +; AVX1-NEXT: [[TMP42:%.*]] = 
insertelement <4 x i32> [[TMP41]], i32 [[R1]], i32 1 +; AVX1-NEXT: [[TMP43:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[R2]], i32 2 +; AVX1-NEXT: [[TMP44:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[R3]], i32 3 +; AVX1-NEXT: store <4 x i32> [[TMP44]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP45:%.*]] = insertelement <4 x i32> poison, i32 [[R4]], i32 0 +; AVX1-NEXT: [[TMP46:%.*]] = insertelement <4 x i32> [[TMP45]], i32 [[R5]], i32 1 +; AVX1-NEXT: [[TMP47:%.*]] = insertelement <4 x i32> [[TMP46]], i32 [[R6]], i32 2 +; AVX1-NEXT: [[TMP48:%.*]] = insertelement <4 x i32> [[TMP47]], i32 [[R7]], i32 3 +; AVX1-NEXT: store <4 x i32> [[TMP48]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP49:%.*]] = insertelement <4 x i32> poison, i32 [[R8]], i32 0 +; AVX1-NEXT: [[TMP50:%.*]] = insertelement <4 x i32> [[TMP49]], i32 [[R9]], i32 1 +; AVX1-NEXT: [[TMP51:%.*]] = insertelement <4 x i32> [[TMP50]], i32 [[R10]], i32 2 +; AVX1-NEXT: [[TMP52:%.*]] = insertelement <4 x i32> [[TMP51]], i32 [[R11]], i32 3 +; AVX1-NEXT: store <4 x i32> [[TMP52]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP53:%.*]] = insertelement <4 x i32> poison, i32 [[R12]], i32 0 +; AVX1-NEXT: [[TMP54:%.*]] = insertelement <4 x i32> [[TMP53]], i32 [[R13]], i32 1 +; AVX1-NEXT: [[TMP55:%.*]] = insertelement <4 x i32> [[TMP54]], i32 [[R14]], i32 2 +; AVX1-NEXT: [[TMP56:%.*]] = insertelement <4 x i32> [[TMP55]], i32 [[R15]], i32 3 +; AVX1-NEXT: store <4 x i32> [[TMP56]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @umul_v16i32( Index: llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll +++ llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll @@ -15,8 +15,10 @@ ; CHECK-NEXT: [[A_AND:%.*]] = and i64 [[A_CAST]], 42 ; CHECK-NEXT: [[B_AND:%.*]] = and i64 [[B_CAST]], 42 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, i64* [[PTR:%.*]], i32 1 -; CHECK-NEXT: store i64 [[A_AND]], i64* [[PTR]], align 8 -; CHECK-NEXT: store i64 [[B_AND]], i64* [[GEP]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> poison, i64 [[A_AND]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> [[TMP0]], i64 [[B_AND]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[PTR]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP1]], <2 x i64>* [[TMP2]], align 8 ; CHECK-NEXT: ret void ; entry: Index: llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll +++ llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll @@ -16,38 +16,59 @@ define void @splat(i8 %a, i8 %b, i8 %c) { ; SSE-LABEL: @splat( -; SSE-NEXT: [[TMP1:%.*]] = xor i8 [[C:%.*]], [[A:%.*]] -; SSE-NEXT: store i8 [[TMP1]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 0), align 16 -; SSE-NEXT: [[TMP2:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP2]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 1), align 1 -; SSE-NEXT: [[TMP3:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP3]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 2), align 1 -; 
SSE-NEXT: [[TMP4:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP4]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 3), align 1 -; SSE-NEXT: [[TMP5:%.*]] = xor i8 [[C]], [[A]] -; SSE-NEXT: store i8 [[TMP5]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 4), align 1 -; SSE-NEXT: [[TMP6:%.*]] = xor i8 [[C]], [[B:%.*]] -; SSE-NEXT: store i8 [[TMP6]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 5), align 1 -; SSE-NEXT: [[TMP7:%.*]] = xor i8 [[C]], [[A]] -; SSE-NEXT: store i8 [[TMP7]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 6), align 1 -; SSE-NEXT: [[TMP8:%.*]] = xor i8 [[C]], [[B]] -; SSE-NEXT: store i8 [[TMP8]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 7), align 1 -; SSE-NEXT: [[TMP9:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP9]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 8), align 1 -; SSE-NEXT: [[TMP10:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP10]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 9), align 1 -; SSE-NEXT: [[TMP11:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP11]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 10), align 1 -; SSE-NEXT: [[TMP12:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP12]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 11), align 1 -; SSE-NEXT: [[TMP13:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP13]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 12), align 1 -; SSE-NEXT: [[TMP14:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP14]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 13), align 1 -; SSE-NEXT: [[TMP15:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP15]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 14), align 1 -; SSE-NEXT: [[TMP16:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP16]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 15), align 1 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[C]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[C]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[C]], i32 3 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[C]], i32 4 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[C]], i32 5 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <16 x i8> [[TMP6]], i8 [[C]], i32 6 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <16 x i8> [[TMP7]], i8 [[C]], i32 7 +; SSE-NEXT: [[TMP9:%.*]] = insertelement <16 x i8> [[TMP8]], i8 [[C]], i32 8 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <16 x i8> [[TMP9]], i8 [[C]], i32 9 +; SSE-NEXT: [[TMP11:%.*]] = insertelement <16 x i8> [[TMP10]], i8 [[C]], i32 10 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <16 x i8> [[TMP11]], i8 [[C]], i32 11 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[C]], i32 12 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[C]], i32 13 +; SSE-NEXT: [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[C]], i32 14 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <16 x i8> [[TMP15]], i8 [[C]], i32 15 +; SSE-NEXT: [[TMP17:%.*]] = insertelement <2 x i8> poison, i8 [[A:%.*]], i32 0 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <2 x i8> [[TMP17]], i8 [[B:%.*]], i32 1 +; SSE-NEXT: [[SHUFFLE:%.*]] = 
shufflevector <2 x i8> [[TMP18]], <2 x i8> poison, <16 x i32> +; SSE-NEXT: [[TMP19:%.*]] = xor <16 x i8> [[TMP16]], [[SHUFFLE]] +; SSE-NEXT: [[TMP20:%.*]] = extractelement <16 x i8> [[TMP19]], i32 15 +; SSE-NEXT: [[TMP21:%.*]] = extractelement <16 x i8> [[TMP19]], i32 0 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <16 x i8> poison, i8 [[TMP21]], i32 0 +; SSE-NEXT: [[TMP23:%.*]] = extractelement <16 x i8> [[TMP19]], i32 1 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <16 x i8> [[TMP22]], i8 [[TMP23]], i32 1 +; SSE-NEXT: [[TMP25:%.*]] = extractelement <16 x i8> [[TMP19]], i32 2 +; SSE-NEXT: [[TMP26:%.*]] = insertelement <16 x i8> [[TMP24]], i8 [[TMP25]], i32 2 +; SSE-NEXT: [[TMP27:%.*]] = extractelement <16 x i8> [[TMP19]], i32 3 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP27]], i32 3 +; SSE-NEXT: [[TMP29:%.*]] = extractelement <16 x i8> [[TMP19]], i32 4 +; SSE-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP28]], i8 [[TMP29]], i32 4 +; SSE-NEXT: [[TMP31:%.*]] = extractelement <16 x i8> [[TMP19]], i32 5 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <16 x i8> [[TMP30]], i8 [[TMP31]], i32 5 +; SSE-NEXT: [[TMP33:%.*]] = extractelement <16 x i8> [[TMP19]], i32 6 +; SSE-NEXT: [[TMP34:%.*]] = insertelement <16 x i8> [[TMP32]], i8 [[TMP33]], i32 6 +; SSE-NEXT: [[TMP35:%.*]] = extractelement <16 x i8> [[TMP19]], i32 7 +; SSE-NEXT: [[TMP36:%.*]] = insertelement <16 x i8> [[TMP34]], i8 [[TMP35]], i32 7 +; SSE-NEXT: [[TMP37:%.*]] = extractelement <16 x i8> [[TMP19]], i32 8 +; SSE-NEXT: [[TMP38:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP37]], i32 8 +; SSE-NEXT: [[TMP39:%.*]] = extractelement <16 x i8> [[TMP19]], i32 9 +; SSE-NEXT: [[TMP40:%.*]] = insertelement <16 x i8> [[TMP38]], i8 [[TMP39]], i32 9 +; SSE-NEXT: [[TMP41:%.*]] = extractelement <16 x i8> [[TMP19]], i32 10 +; SSE-NEXT: [[TMP42:%.*]] = insertelement <16 x i8> [[TMP40]], i8 [[TMP41]], i32 10 +; SSE-NEXT: [[TMP43:%.*]] = extractelement <16 x i8> [[TMP19]], i32 11 +; SSE-NEXT: [[TMP44:%.*]] = insertelement <16 x i8> [[TMP42]], i8 [[TMP43]], i32 11 +; SSE-NEXT: [[TMP45:%.*]] = extractelement <16 x i8> [[TMP19]], i32 12 +; SSE-NEXT: [[TMP46:%.*]] = insertelement <16 x i8> [[TMP44]], i8 [[TMP45]], i32 12 +; SSE-NEXT: [[TMP47:%.*]] = extractelement <16 x i8> [[TMP19]], i32 13 +; SSE-NEXT: [[TMP48:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP47]], i32 13 +; SSE-NEXT: [[TMP49:%.*]] = extractelement <16 x i8> [[TMP19]], i32 14 +; SSE-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP48]], i8 [[TMP49]], i32 14 +; SSE-NEXT: [[TMP51:%.*]] = insertelement <16 x i8> [[TMP50]], i8 [[TMP20]], i32 15 +; SSE-NEXT: store <16 x i8> [[TMP51]], <16 x i8>* bitcast ([32 x i8]* @cle to <16 x i8>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @splat( Index: llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll +++ llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll @@ -11,7 +11,6 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[_M_CUR2_I_I:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__FIRST:%.*]], i64 0, i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_CUR2_I_I]], align 8 -; CHECK-NEXT: [[_M_FIRST3_I_I:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__FIRST]], i64 0, i32 1 
; CHECK-NEXT: [[_M_CUR2_I_I81:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__LAST:%.*]], i64 0, i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = load double*, double** [[_M_CUR2_I_I81]], align 8 ; CHECK-NEXT: [[_M_FIRST3_I_I83:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__LAST]], i64 0, i32 1 @@ -26,8 +25,10 @@ ; CHECK: _ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit: ; CHECK-NEXT: [[TMP3:%.*]] = phi double* [ [[TMP2]], [[ENTRY:%.*]] ], [ [[TMP2]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ] ; CHECK-NEXT: [[TMP4:%.*]] = phi double* [ [[TMP0]], [[ENTRY]] ], [ [[TMP1]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ] -; CHECK-NEXT: store double* [[TMP4]], double** [[_M_CUR2_I_I]], align 8 -; CHECK-NEXT: store double* [[TMP3]], double** [[_M_FIRST3_I_I]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double*> poison, double* [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double*> [[TMP5]], double* [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double** [[_M_CUR2_I_I]] to <2 x double*>* +; CHECK-NEXT: store <2 x double*> [[TMP6]], <2 x double*>* [[TMP7]], align 8 ; CHECK-NEXT: br i1 undef, label [[IF_THEN_I55:%.*]], label [[WHILE_COND:%.*]] ; CHECK: if.then.i55: ; CHECK-NEXT: br label [[WHILE_COND]] Index: llvm/test/Transforms/SLPVectorizer/X86/cse.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/cse.ll +++ llvm/test/Transforms/SLPVectorizer/X86/cse.ll @@ -21,18 +21,24 @@ ; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[G]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[G]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[G]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP6]], 4.000000e+00 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[MUL11]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP8]], +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP5]], 4.000000e+00 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[MUL11]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP7]], ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[G]], i64 3 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[ARRAYIDX9]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> poison, double [[TMP10]], i32 0 +; CHECK-NEXT: 
[[TMP12:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x double> [[TMP11]], double [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x double> [[TMP13]], double [[TMP14]], i32 2 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x double> [[TMP15]], double [[TMP9]], i32 3 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast double* [[G]] to <4 x double>* +; CHECK-NEXT: store <4 x double> [[TMP16]], <4 x double>* [[TMP17]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: Index: llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll +++ llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll @@ -103,18 +103,20 @@ ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @ctlz_4i32( -; AVX2-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 -; AVX2-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4 -; AVX2-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4 -; AVX2-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4 -; AVX2-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) -; AVX2-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false) -; AVX2-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) -; AVX2-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) -; AVX2-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; AVX2-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; AVX2-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; AVX2-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4 +; AVX2-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; AVX2-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP2]], i1 false) +; AVX2-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; AVX2-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP3]], i1 false) +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; AVX2-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP4]], i1 false) +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; AVX2-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP5]], i1 false) +; AVX2-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[CTLZ0]], i32 0 +; AVX2-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTLZ1]], i32 1 +; AVX2-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTLZ2]], i32 2 +; AVX2-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[CTLZ3]], i32 3 +; AVX2-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; AVX2-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 @@ -559,18 +561,20 @@ ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @ctlz_undef_4i32( -; AVX2-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds 
([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 -; AVX2-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4 -; AVX2-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4 -; AVX2-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4 -; AVX2-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 true) -; AVX2-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 true) -; AVX2-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 true) -; AVX2-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 true) -; AVX2-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; AVX2-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; AVX2-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; AVX2-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4 +; AVX2-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; AVX2-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP2]], i1 true) +; AVX2-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; AVX2-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP3]], i1 true) +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; AVX2-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP4]], i1 true) +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; AVX2-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP5]], i1 true) +; AVX2-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[CTLZ0]], i32 0 +; AVX2-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTLZ1]], i32 1 +; AVX2-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTLZ2]], i32 2 +; AVX2-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[CTLZ3]], i32 3 +; AVX2-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; AVX2-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 Index: llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll +++ llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll @@ -123,33 +123,37 @@ ; SSE2-NEXT: ret void ; ; SSE42-LABEL: @ctpop_4i32( -; SSE42-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 -; SSE42-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4 -; SSE42-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4 -; SSE42-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4 -; SSE42-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]]) -; SSE42-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]]) -; SSE42-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]]) -; SSE42-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]]) -; SSE42-NEXT: store i32 [[CTPOP0]], i32* 
getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; SSE42-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; SSE42-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; SSE42-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; SSE42-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4 +; SSE42-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; SSE42-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP2]]) +; SSE42-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; SSE42-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP3]]) +; SSE42-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; SSE42-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP4]]) +; SSE42-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; SSE42-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP5]]) +; SSE42-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[CTPOP0]], i32 0 +; SSE42-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTPOP1]], i32 1 +; SSE42-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTPOP2]], i32 2 +; SSE42-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[CTPOP3]], i32 3 +; SSE42-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; SSE42-NEXT: ret void ; ; AVX-LABEL: @ctpop_4i32( -; AVX-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 -; AVX-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4 -; AVX-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4 -; AVX-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4 -; AVX-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]]) -; AVX-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]]) -; AVX-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]]) -; AVX-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]]) -; AVX-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; AVX-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; AVX-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; AVX-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; AVX-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP2]]) +; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; AVX-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP3]]) +; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; AVX-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP4]]) +; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; AVX-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP5]]) +; AVX-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 
[[CTPOP0]], i32 0 +; AVX-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTPOP1]], i32 1 +; AVX-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTPOP2]], i32 2 +; AVX-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[CTPOP3]], i32 3 +; AVX-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; AVX-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 @@ -178,57 +182,63 @@ ; SSE2-NEXT: ret void ; ; SSE42-LABEL: @ctpop_8i32( -; SSE42-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2 -; SSE42-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2 -; SSE42-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2 -; SSE42-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2 -; SSE42-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2 -; SSE42-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2 -; SSE42-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2 -; SSE42-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2 -; SSE42-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]]) -; SSE42-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]]) -; SSE42-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]]) -; SSE42-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]]) -; SSE42-NEXT: [[CTPOP4:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD4]]) -; SSE42-NEXT: [[CTPOP5:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD5]]) -; SSE42-NEXT: [[CTPOP6:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD6]]) -; SSE42-NEXT: [[CTPOP7:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD7]]) -; SSE42-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; SSE42-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; SSE42-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; SSE42-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; SSE42-NEXT: store i32 [[CTPOP4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; SSE42-NEXT: store i32 [[CTPOP5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; SSE42-NEXT: store i32 [[CTPOP6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; SSE42-NEXT: store i32 [[CTPOP7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; SSE42-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; SSE42-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP3]]) +; SSE42-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; SSE42-NEXT: 
[[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP4]]) +; SSE42-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; SSE42-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP5]]) +; SSE42-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; SSE42-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP6]]) +; SSE42-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; SSE42-NEXT: [[CTPOP4:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP7]]) +; SSE42-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; SSE42-NEXT: [[CTPOP5:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP8]]) +; SSE42-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; SSE42-NEXT: [[CTPOP6:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP9]]) +; SSE42-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; SSE42-NEXT: [[CTPOP7:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP10]]) +; SSE42-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[CTPOP0]], i32 0 +; SSE42-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CTPOP1]], i32 1 +; SSE42-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CTPOP2]], i32 2 +; SSE42-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[CTPOP3]], i32 3 +; SSE42-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> poison, i32 [[CTPOP4]], i32 0 +; SSE42-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[CTPOP5]], i32 1 +; SSE42-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[CTPOP6]], i32 2 +; SSE42-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[CTPOP7]], i32 3 +; SSE42-NEXT: store <4 x i32> [[TMP18]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE42-NEXT: ret void ; ; AVX1-LABEL: @ctpop_8i32( -; AVX1-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2 -; AVX1-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2 -; AVX1-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2 -; AVX1-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2 -; AVX1-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2 -; AVX1-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2 -; AVX1-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2 -; AVX1-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2 -; AVX1-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]]) -; AVX1-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]]) -; AVX1-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]]) -; AVX1-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]]) -; AVX1-NEXT: [[CTPOP4:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD4]]) -; AVX1-NEXT: [[CTPOP5:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD5]]) -; AVX1-NEXT: [[CTPOP6:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD6]]) -; AVX1-NEXT: [[CTPOP7:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD7]]) -; AVX1-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 
-; AVX1-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; AVX1-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; AVX1-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; AVX1-NEXT: store i32 [[CTPOP4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; AVX1-NEXT: store i32 [[CTPOP5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; AVX1-NEXT: store i32 [[CTPOP6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; AVX1-NEXT: store i32 [[CTPOP7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; AVX1-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2 +; AVX1-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; AVX1-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP2]]) +; AVX1-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; AVX1-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP3]]) +; AVX1-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 +; AVX1-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP4]]) +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 +; AVX1-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP5]]) +; AVX1-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 +; AVX1-NEXT: [[CTPOP4:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP6]]) +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 +; AVX1-NEXT: [[CTPOP5:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP7]]) +; AVX1-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6 +; AVX1-NEXT: [[CTPOP6:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP8]]) +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7 +; AVX1-NEXT: [[CTPOP7:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP9]]) +; AVX1-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> poison, i32 [[CTPOP0]], i32 0 +; AVX1-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[CTPOP1]], i32 1 +; AVX1-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[CTPOP2]], i32 2 +; AVX1-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[CTPOP3]], i32 3 +; AVX1-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[CTPOP4]], i32 4 +; AVX1-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[CTPOP5]], i32 5 +; AVX1-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[CTPOP6]], i32 6 +; AVX1-NEXT: [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[CTPOP7]], i32 7 +; AVX1-NEXT: store <8 x i32> [[TMP17]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @ctpop_8i32( Index: llvm/test/Transforms/SLPVectorizer/X86/cttz.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/cttz.ll +++ llvm/test/Transforms/SLPVectorizer/X86/cttz.ll @@ -103,18 +103,20 @@ ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @cttz_4i32( -; AVX2-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 -; AVX2-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4 -; AVX2-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 
4 -; AVX2-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4 -; AVX2-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 false) -; AVX2-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 false) -; AVX2-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 false) -; AVX2-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 false) -; AVX2-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; AVX2-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; AVX2-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; AVX2-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4 +; AVX2-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; AVX2-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP2]], i1 false) +; AVX2-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; AVX2-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP3]], i1 false) +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; AVX2-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP4]], i1 false) +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; AVX2-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP5]], i1 false) +; AVX2-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[CTTZ0]], i32 0 +; AVX2-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTTZ1]], i32 1 +; AVX2-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTTZ2]], i32 2 +; AVX2-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[CTTZ3]], i32 3 +; AVX2-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; AVX2-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 @@ -559,18 +561,20 @@ ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @cttz_undef_4i32( -; AVX2-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 -; AVX2-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4 -; AVX2-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4 -; AVX2-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4 -; AVX2-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 true) -; AVX2-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 true) -; AVX2-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 true) -; AVX2-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 true) -; AVX2-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; AVX2-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; AVX2-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; AVX2-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 
x i32]* @src32 to <4 x i32>*), align 4 +; AVX2-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; AVX2-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP2]], i1 true) +; AVX2-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; AVX2-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP3]], i1 true) +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; AVX2-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP4]], i1 true) +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; AVX2-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP5]], i1 true) +; AVX2-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[CTTZ0]], i32 0 +; AVX2-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTTZ1]], i32 1 +; AVX2-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTTZ2]], i32 2 +; AVX2-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[CTTZ3]], i32 3 +; AVX2-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; AVX2-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 Index: llvm/test/Transforms/SLPVectorizer/X86/fma.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/fma.ll +++ llvm/test/Transforms/SLPVectorizer/X86/fma.ll @@ -26,16 +26,20 @@ define void @fma_2f64() #0 { ; NO-FMA-LABEL: @fma_2f64( -; NO-FMA-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[B0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[B1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[C0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[C1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[FMA0:%.*]] = call double @llvm.fma.f64(double [[A0]], double [[B0]], double [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call double @llvm.fma.f64(double [[A1]], double [[B1]], double [[C1]]) -; NO-FMA-NEXT: store double [[FMA0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: store double [[FMA1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; NO-FMA-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; NO-FMA-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; NO-FMA-NEXT: [[FMA0:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[TMP5]], double [[TMP6]]) +; NO-FMA-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; NO-FMA-NEXT: [[TMP8:%.*]] 
= extractelement <2 x double> [[TMP2]], i32 1 +; NO-FMA-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; NO-FMA-NEXT: [[FMA1:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP9]]) +; NO-FMA-NEXT: [[TMP10:%.*]] = insertelement <2 x double> poison, double [[FMA0]], i32 0 +; NO-FMA-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[FMA1]], i32 1 +; NO-FMA-NEXT: store <2 x double> [[TMP11]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 ; NO-FMA-NEXT: ret void ; ; FMA-LABEL: @fma_2f64( @@ -61,26 +65,34 @@ define void @fma_4f64() #0 { ; NO-FMA-LABEL: @fma_4f64( -; NO-FMA-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 8 -; NO-FMA-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 3), align 8 -; NO-FMA-NEXT: [[B0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[B1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[B2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2), align 8 -; NO-FMA-NEXT: [[B3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 3), align 8 -; NO-FMA-NEXT: [[C0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[C1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[C2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2), align 8 -; NO-FMA-NEXT: [[C3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 3), align 8 -; NO-FMA-NEXT: [[FMA0:%.*]] = call double @llvm.fma.f64(double [[A0]], double [[B0]], double [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call double @llvm.fma.f64(double [[A1]], double [[B1]], double [[C1]]) -; NO-FMA-NEXT: [[FMA2:%.*]] = call double @llvm.fma.f64(double [[A2]], double [[B2]], double [[C2]]) -; NO-FMA-NEXT: [[FMA3:%.*]] = call double @llvm.fma.f64(double [[A3]], double [[B3]], double [[C3]]) -; NO-FMA-NEXT: store double [[FMA0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: store double [[FMA1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: store double [[FMA2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; NO-FMA-NEXT: store double [[FMA3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x 
double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2) to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; NO-FMA-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; NO-FMA-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 +; NO-FMA-NEXT: [[FMA0:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP9]]) +; NO-FMA-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; NO-FMA-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; NO-FMA-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 +; NO-FMA-NEXT: [[FMA1:%.*]] = call double @llvm.fma.f64(double [[TMP10]], double [[TMP11]], double [[TMP12]]) +; NO-FMA-NEXT: [[TMP13:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; NO-FMA-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 +; NO-FMA-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 +; NO-FMA-NEXT: [[FMA2:%.*]] = call double @llvm.fma.f64(double [[TMP13]], double [[TMP14]], double [[TMP15]]) +; NO-FMA-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; NO-FMA-NEXT: [[TMP17:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; NO-FMA-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 +; NO-FMA-NEXT: [[FMA3:%.*]] = call double @llvm.fma.f64(double [[TMP16]], double [[TMP17]], double [[TMP18]]) +; NO-FMA-NEXT: [[TMP19:%.*]] = insertelement <2 x double> poison, double [[FMA0]], i32 0 +; NO-FMA-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[FMA1]], i32 1 +; NO-FMA-NEXT: store <2 x double> [[TMP20]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP21:%.*]] = insertelement <2 x double> poison, double [[FMA2]], i32 0 +; NO-FMA-NEXT: [[TMP22:%.*]] = insertelement <2 x double> [[TMP21]], double [[FMA3]], i32 1 +; NO-FMA-NEXT: store <2 x double> [[TMP22]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; NO-FMA-NEXT: ret void ; ; FMA-LABEL: @fma_4f64( @@ -116,46 +128,62 @@ define void @fma_8f64() #0 { ; NO-FMA-LABEL: @fma_8f64( -; NO-FMA-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[A6:%.*]] = load double, double* 
getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[B0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[B1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[B2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[B3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[B4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[B5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[B6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[B7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[C0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[C1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[C2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[C3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[C4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[C5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[C6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[C7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[FMA0:%.*]] = call double @llvm.fma.f64(double [[A0]], double [[B0]], double [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call double @llvm.fma.f64(double [[A1]], double [[B1]], double [[C1]]) -; NO-FMA-NEXT: [[FMA2:%.*]] = call double @llvm.fma.f64(double [[A2]], double [[B2]], double [[C2]]) -; NO-FMA-NEXT: [[FMA3:%.*]] = call double @llvm.fma.f64(double [[A3]], double [[B3]], double [[C3]]) -; NO-FMA-NEXT: [[FMA4:%.*]] = call double @llvm.fma.f64(double [[A4]], double [[B4]], double [[C4]]) -; NO-FMA-NEXT: [[FMA5:%.*]] = call double @llvm.fma.f64(double [[A5]], double [[B5]], double [[C5]]) -; NO-FMA-NEXT: [[FMA6:%.*]] = call double @llvm.fma.f64(double [[A6]], double [[B6]], double [[C6]]) -; NO-FMA-NEXT: [[FMA7:%.*]] = call double @llvm.fma.f64(double [[A7]], double [[B7]], double [[C7]]) -; NO-FMA-NEXT: store double [[FMA0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 4 -; NO-FMA-NEXT: store double [[FMA1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 4 -; NO-FMA-NEXT: store double [[FMA2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, 
i64 2), align 4 -; NO-FMA-NEXT: store double [[FMA3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 4 -; NO-FMA-NEXT: store double [[FMA4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 4 -; NO-FMA-NEXT: store double [[FMA5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 4 -; NO-FMA-NEXT: store double [[FMA6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 4 -; NO-FMA-NEXT: store double [[FMA7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 4 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP9:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2) to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP11:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP12:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 6) to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP13:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; NO-FMA-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 +; NO-FMA-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[TMP9]], i32 0 +; NO-FMA-NEXT: [[FMA0:%.*]] = call double @llvm.fma.f64(double [[TMP13]], double [[TMP14]], double [[TMP15]]) +; NO-FMA-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; NO-FMA-NEXT: [[TMP17:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 +; NO-FMA-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[TMP9]], i32 1 +; NO-FMA-NEXT: [[FMA1:%.*]] = call double @llvm.fma.f64(double [[TMP16]], double [[TMP17]], double [[TMP18]]) +; NO-FMA-NEXT: [[TMP19:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; NO-FMA-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 +; NO-FMA-NEXT: [[TMP21:%.*]] = 
extractelement <2 x double> [[TMP10]], i32 0 +; NO-FMA-NEXT: [[FMA2:%.*]] = call double @llvm.fma.f64(double [[TMP19]], double [[TMP20]], double [[TMP21]]) +; NO-FMA-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; NO-FMA-NEXT: [[TMP23:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 +; NO-FMA-NEXT: [[TMP24:%.*]] = extractelement <2 x double> [[TMP10]], i32 1 +; NO-FMA-NEXT: [[FMA3:%.*]] = call double @llvm.fma.f64(double [[TMP22]], double [[TMP23]], double [[TMP24]]) +; NO-FMA-NEXT: [[TMP25:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; NO-FMA-NEXT: [[TMP26:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 +; NO-FMA-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[TMP11]], i32 0 +; NO-FMA-NEXT: [[FMA4:%.*]] = call double @llvm.fma.f64(double [[TMP25]], double [[TMP26]], double [[TMP27]]) +; NO-FMA-NEXT: [[TMP28:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; NO-FMA-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 +; NO-FMA-NEXT: [[TMP30:%.*]] = extractelement <2 x double> [[TMP11]], i32 1 +; NO-FMA-NEXT: [[FMA5:%.*]] = call double @llvm.fma.f64(double [[TMP28]], double [[TMP29]], double [[TMP30]]) +; NO-FMA-NEXT: [[TMP31:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 +; NO-FMA-NEXT: [[TMP32:%.*]] = extractelement <2 x double> [[TMP8]], i32 0 +; NO-FMA-NEXT: [[TMP33:%.*]] = extractelement <2 x double> [[TMP12]], i32 0 +; NO-FMA-NEXT: [[FMA6:%.*]] = call double @llvm.fma.f64(double [[TMP31]], double [[TMP32]], double [[TMP33]]) +; NO-FMA-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; NO-FMA-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[TMP8]], i32 1 +; NO-FMA-NEXT: [[TMP36:%.*]] = extractelement <2 x double> [[TMP12]], i32 1 +; NO-FMA-NEXT: [[FMA7:%.*]] = call double @llvm.fma.f64(double [[TMP34]], double [[TMP35]], double [[TMP36]]) +; NO-FMA-NEXT: [[TMP37:%.*]] = insertelement <2 x double> poison, double [[FMA0]], i32 0 +; NO-FMA-NEXT: [[TMP38:%.*]] = insertelement <2 x double> [[TMP37]], double [[FMA1]], i32 1 +; NO-FMA-NEXT: store <2 x double> [[TMP38]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP39:%.*]] = insertelement <2 x double> poison, double [[FMA2]], i32 0 +; NO-FMA-NEXT: [[TMP40:%.*]] = insertelement <2 x double> [[TMP39]], double [[FMA3]], i32 1 +; NO-FMA-NEXT: store <2 x double> [[TMP40]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP41:%.*]] = insertelement <2 x double> poison, double [[FMA4]], i32 0 +; NO-FMA-NEXT: [[TMP42:%.*]] = insertelement <2 x double> [[TMP41]], double [[FMA5]], i32 1 +; NO-FMA-NEXT: store <2 x double> [[TMP42]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP43:%.*]] = insertelement <2 x double> poison, double [[FMA6]], i32 0 +; NO-FMA-NEXT: [[TMP44:%.*]] = insertelement <2 x double> [[TMP43]], double [[FMA7]], i32 1 +; NO-FMA-NEXT: store <2 x double> [[TMP44]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4 ; NO-FMA-NEXT: ret void ; ; FMA256-LABEL: @fma_8f64( @@ -224,26 +252,30 @@ define void @fma_4f32() #0 { ; NO-FMA-LABEL: @fma_4f32( -; NO-FMA-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[A1:%.*]] = load float, float* 
getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[B0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[B1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[B2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[B3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[C0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[C1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[C2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[C3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[FMA0:%.*]] = call float @llvm.fma.f32(float [[A0]], float [[B0]], float [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call float @llvm.fma.f32(float [[A1]], float [[B1]], float [[C1]]) -; NO-FMA-NEXT: [[FMA2:%.*]] = call float @llvm.fma.f32(float [[A2]], float [[B2]], float [[C2]]) -; NO-FMA-NEXT: [[FMA3:%.*]] = call float @llvm.fma.f32(float [[A3]], float [[B3]], float [[C3]]) -; NO-FMA-NEXT: store float [[FMA0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: store float [[FMA1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: store float [[FMA2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: store float [[FMA3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; NO-FMA-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; NO-FMA-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 +; NO-FMA-NEXT: [[FMA0:%.*]] = call float @llvm.fma.f32(float [[TMP4]], float [[TMP5]], float [[TMP6]]) +; NO-FMA-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; NO-FMA-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; NO-FMA-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 +; NO-FMA-NEXT: [[FMA1:%.*]] = call float @llvm.fma.f32(float [[TMP7]], float [[TMP8]], float [[TMP9]]) +; NO-FMA-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; NO-FMA-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; NO-FMA-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 +; 
NO-FMA-NEXT: [[FMA2:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]]) +; NO-FMA-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; NO-FMA-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; NO-FMA-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP3]], i32 3 +; NO-FMA-NEXT: [[FMA3:%.*]] = call float @llvm.fma.f32(float [[TMP13]], float [[TMP14]], float [[TMP15]]) +; NO-FMA-NEXT: [[TMP16:%.*]] = insertelement <4 x float> poison, float [[FMA0]], i32 0 +; NO-FMA-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[FMA1]], i32 1 +; NO-FMA-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[FMA2]], i32 2 +; NO-FMA-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float [[FMA3]], i32 3 +; NO-FMA-NEXT: store <4 x float> [[TMP19]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 ; NO-FMA-NEXT: ret void ; ; FMA-LABEL: @fma_4f32( @@ -279,46 +311,54 @@ define void @fma_8f32() #0 { ; NO-FMA-LABEL: @fma_8f32( -; NO-FMA-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[B0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[B1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[B2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[B3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[B4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[B5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[B6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[B7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[C0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[C1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[C2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 2), align 4 -; 
NO-FMA-NEXT: [[C3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[C4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[C5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[C6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[C7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[FMA0:%.*]] = call float @llvm.fma.f32(float [[A0]], float [[B0]], float [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call float @llvm.fma.f32(float [[A1]], float [[B1]], float [[C1]]) -; NO-FMA-NEXT: [[FMA2:%.*]] = call float @llvm.fma.f32(float [[A2]], float [[B2]], float [[C2]]) -; NO-FMA-NEXT: [[FMA3:%.*]] = call float @llvm.fma.f32(float [[A3]], float [[B3]], float [[C3]]) -; NO-FMA-NEXT: [[FMA4:%.*]] = call float @llvm.fma.f32(float [[A4]], float [[B4]], float [[C4]]) -; NO-FMA-NEXT: [[FMA5:%.*]] = call float @llvm.fma.f32(float [[A5]], float [[B5]], float [[C5]]) -; NO-FMA-NEXT: [[FMA6:%.*]] = call float @llvm.fma.f32(float [[A6]], float [[B6]], float [[C6]]) -; NO-FMA-NEXT: [[FMA7:%.*]] = call float @llvm.fma.f32(float [[A7]], float [[B7]], float [[C7]]) -; NO-FMA-NEXT: store float [[FMA0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: store float [[FMA1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: store float [[FMA2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: store float [[FMA3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: store float [[FMA4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: store float [[FMA5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: store float [[FMA6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: store float [[FMA7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; NO-FMA-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 +; NO-FMA-NEXT: [[TMP9:%.*]] = extractelement 
<4 x float> [[TMP5]], i32 0 +; NO-FMA-NEXT: [[FMA0:%.*]] = call float @llvm.fma.f32(float [[TMP7]], float [[TMP8]], float [[TMP9]]) +; NO-FMA-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; NO-FMA-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 +; NO-FMA-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP5]], i32 1 +; NO-FMA-NEXT: [[FMA1:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]]) +; NO-FMA-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; NO-FMA-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 +; NO-FMA-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP5]], i32 2 +; NO-FMA-NEXT: [[FMA2:%.*]] = call float @llvm.fma.f32(float [[TMP13]], float [[TMP14]], float [[TMP15]]) +; NO-FMA-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; NO-FMA-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP3]], i32 3 +; NO-FMA-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP5]], i32 3 +; NO-FMA-NEXT: [[FMA3:%.*]] = call float @llvm.fma.f32(float [[TMP16]], float [[TMP17]], float [[TMP18]]) +; NO-FMA-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; NO-FMA-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP4]], i32 0 +; NO-FMA-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP6]], i32 0 +; NO-FMA-NEXT: [[FMA4:%.*]] = call float @llvm.fma.f32(float [[TMP19]], float [[TMP20]], float [[TMP21]]) +; NO-FMA-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; NO-FMA-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[TMP4]], i32 1 +; NO-FMA-NEXT: [[TMP24:%.*]] = extractelement <4 x float> [[TMP6]], i32 1 +; NO-FMA-NEXT: [[FMA5:%.*]] = call float @llvm.fma.f32(float [[TMP22]], float [[TMP23]], float [[TMP24]]) +; NO-FMA-NEXT: [[TMP25:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; NO-FMA-NEXT: [[TMP26:%.*]] = extractelement <4 x float> [[TMP4]], i32 2 +; NO-FMA-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[TMP6]], i32 2 +; NO-FMA-NEXT: [[FMA6:%.*]] = call float @llvm.fma.f32(float [[TMP25]], float [[TMP26]], float [[TMP27]]) +; NO-FMA-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; NO-FMA-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[TMP4]], i32 3 +; NO-FMA-NEXT: [[TMP30:%.*]] = extractelement <4 x float> [[TMP6]], i32 3 +; NO-FMA-NEXT: [[FMA7:%.*]] = call float @llvm.fma.f32(float [[TMP28]], float [[TMP29]], float [[TMP30]]) +; NO-FMA-NEXT: [[TMP31:%.*]] = insertelement <4 x float> poison, float [[FMA0]], i32 0 +; NO-FMA-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[FMA1]], i32 1 +; NO-FMA-NEXT: [[TMP33:%.*]] = insertelement <4 x float> [[TMP32]], float [[FMA2]], i32 2 +; NO-FMA-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[FMA3]], i32 3 +; NO-FMA-NEXT: store <4 x float> [[TMP34]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP35:%.*]] = insertelement <4 x float> poison, float [[FMA4]], i32 0 +; NO-FMA-NEXT: [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[FMA5]], i32 1 +; NO-FMA-NEXT: [[TMP37:%.*]] = insertelement <4 x float> [[TMP36]], float [[FMA6]], i32 2 +; NO-FMA-NEXT: [[TMP38:%.*]] = insertelement <4 x float> [[TMP37]], float [[FMA7]], i32 3 +; NO-FMA-NEXT: store <4 x float> [[TMP38]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; NO-FMA-NEXT: ret void ; ; FMA-LABEL: @fma_8f32( @@ -374,86 +414,102 @@ define void 
@fma_16f32() #0 { ; NO-FMA-LABEL: @fma_16f32( -; NO-FMA-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[A8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8), align 4 -; NO-FMA-NEXT: [[A9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 9), align 4 -; NO-FMA-NEXT: [[A10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 10), align 4 -; NO-FMA-NEXT: [[A11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 11), align 4 -; NO-FMA-NEXT: [[A12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12), align 4 -; NO-FMA-NEXT: [[A13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 13), align 4 -; NO-FMA-NEXT: [[A14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 14), align 4 -; NO-FMA-NEXT: [[A15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 15), align 4 -; NO-FMA-NEXT: [[B0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[B1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[B2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[B3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[B4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[B5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[B6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[B7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[B8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8), align 4 -; NO-FMA-NEXT: [[B9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 9), align 4 -; NO-FMA-NEXT: [[B10:%.*]] = load float, float* getelementptr 
inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 10), align 4 -; NO-FMA-NEXT: [[B11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 11), align 4 -; NO-FMA-NEXT: [[B12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12), align 4 -; NO-FMA-NEXT: [[B13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 13), align 4 -; NO-FMA-NEXT: [[B14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 14), align 4 -; NO-FMA-NEXT: [[B15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 15), align 4 -; NO-FMA-NEXT: [[C0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[C1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[C2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[C3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[C4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[C5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[C6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[C7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[C8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8), align 4 -; NO-FMA-NEXT: [[C9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 9), align 4 -; NO-FMA-NEXT: [[C10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 10), align 4 -; NO-FMA-NEXT: [[C11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 11), align 4 -; NO-FMA-NEXT: [[C12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 12), align 4 -; NO-FMA-NEXT: [[C13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 13), align 4 -; NO-FMA-NEXT: [[C14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 14), align 4 -; NO-FMA-NEXT: [[C15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 15), align 4 -; NO-FMA-NEXT: [[FMA0:%.*]] = call float @llvm.fma.f32(float [[A0]], float [[B0]], float [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call float @llvm.fma.f32(float [[A1]], float [[B1]], float [[C1]]) -; NO-FMA-NEXT: [[FMA2:%.*]] = call float @llvm.fma.f32(float [[A2]], float [[B2]], float [[C2]]) -; NO-FMA-NEXT: [[FMA3:%.*]] = call float @llvm.fma.f32(float [[A3]], float [[B3]], float [[C3]]) -; NO-FMA-NEXT: [[FMA4:%.*]] = call float @llvm.fma.f32(float [[A4]], float [[B4]], float [[C4]]) -; NO-FMA-NEXT: [[FMA5:%.*]] = call float @llvm.fma.f32(float [[A5]], float [[B5]], float [[C5]]) -; NO-FMA-NEXT: [[FMA6:%.*]] = call float @llvm.fma.f32(float [[A6]], float [[B6]], float [[C6]]) -; 
NO-FMA-NEXT: [[FMA7:%.*]] = call float @llvm.fma.f32(float [[A7]], float [[B7]], float [[C7]]) -; NO-FMA-NEXT: [[FMA8:%.*]] = call float @llvm.fma.f32(float [[A8]], float [[B8]], float [[C8]]) -; NO-FMA-NEXT: [[FMA9:%.*]] = call float @llvm.fma.f32(float [[A9]], float [[B9]], float [[C9]]) -; NO-FMA-NEXT: [[FMA10:%.*]] = call float @llvm.fma.f32(float [[A10]], float [[B10]], float [[C10]]) -; NO-FMA-NEXT: [[FMA11:%.*]] = call float @llvm.fma.f32(float [[A11]], float [[B11]], float [[C11]]) -; NO-FMA-NEXT: [[FMA12:%.*]] = call float @llvm.fma.f32(float [[A12]], float [[B12]], float [[C12]]) -; NO-FMA-NEXT: [[FMA13:%.*]] = call float @llvm.fma.f32(float [[A13]], float [[B13]], float [[C13]]) -; NO-FMA-NEXT: [[FMA14:%.*]] = call float @llvm.fma.f32(float [[A14]], float [[B14]], float [[C14]]) -; NO-FMA-NEXT: [[FMA15:%.*]] = call float @llvm.fma.f32(float [[A15]], float [[B15]], float [[C15]]) -; NO-FMA-NEXT: store float [[FMA0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: store float [[FMA1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: store float [[FMA2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: store float [[FMA3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: store float [[FMA4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: store float [[FMA5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: store float [[FMA6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: store float [[FMA7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: store float [[FMA8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; NO-FMA-NEXT: store float [[FMA9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; NO-FMA-NEXT: store float [[FMA10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; NO-FMA-NEXT: store float [[FMA11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; NO-FMA-NEXT: store float [[FMA12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; NO-FMA-NEXT: store float [[FMA13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; NO-FMA-NEXT: store float [[FMA14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; NO-FMA-NEXT: store float [[FMA15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x 
float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP9:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP10:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP11:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP12:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 12) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; NO-FMA-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP5]], i32 0 +; NO-FMA-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP9]], i32 0 +; NO-FMA-NEXT: [[FMA0:%.*]] = call float @llvm.fma.f32(float [[TMP13]], float [[TMP14]], float [[TMP15]]) +; NO-FMA-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; NO-FMA-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP5]], i32 1 +; NO-FMA-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP9]], i32 1 +; NO-FMA-NEXT: [[FMA1:%.*]] = call float @llvm.fma.f32(float [[TMP16]], float [[TMP17]], float [[TMP18]]) +; NO-FMA-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; NO-FMA-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP5]], i32 2 +; NO-FMA-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP9]], i32 2 +; NO-FMA-NEXT: [[FMA2:%.*]] = call float @llvm.fma.f32(float [[TMP19]], float [[TMP20]], float [[TMP21]]) +; NO-FMA-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; NO-FMA-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[TMP5]], i32 3 +; NO-FMA-NEXT: [[TMP24:%.*]] = extractelement <4 x float> [[TMP9]], i32 3 +; NO-FMA-NEXT: [[FMA3:%.*]] = call float @llvm.fma.f32(float [[TMP22]], float [[TMP23]], float [[TMP24]]) +; NO-FMA-NEXT: [[TMP25:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; NO-FMA-NEXT: [[TMP26:%.*]] = extractelement <4 x float> [[TMP6]], i32 0 +; NO-FMA-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[TMP10]], i32 0 +; NO-FMA-NEXT: [[FMA4:%.*]] = call float @llvm.fma.f32(float [[TMP25]], float [[TMP26]], float [[TMP27]]) +; NO-FMA-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; NO-FMA-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[TMP6]], i32 1 +; NO-FMA-NEXT: [[TMP30:%.*]] = extractelement <4 x float> [[TMP10]], i32 1 +; NO-FMA-NEXT: [[FMA5:%.*]] = call float @llvm.fma.f32(float [[TMP28]], float [[TMP29]], float [[TMP30]]) +; NO-FMA-NEXT: [[TMP31:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; NO-FMA-NEXT: [[TMP32:%.*]] = extractelement <4 x float> [[TMP6]], i32 2 +; NO-FMA-NEXT: 
[[TMP33:%.*]] = extractelement <4 x float> [[TMP10]], i32 2 +; NO-FMA-NEXT: [[FMA6:%.*]] = call float @llvm.fma.f32(float [[TMP31]], float [[TMP32]], float [[TMP33]]) +; NO-FMA-NEXT: [[TMP34:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; NO-FMA-NEXT: [[TMP35:%.*]] = extractelement <4 x float> [[TMP6]], i32 3 +; NO-FMA-NEXT: [[TMP36:%.*]] = extractelement <4 x float> [[TMP10]], i32 3 +; NO-FMA-NEXT: [[FMA7:%.*]] = call float @llvm.fma.f32(float [[TMP34]], float [[TMP35]], float [[TMP36]]) +; NO-FMA-NEXT: [[TMP37:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 +; NO-FMA-NEXT: [[TMP38:%.*]] = extractelement <4 x float> [[TMP7]], i32 0 +; NO-FMA-NEXT: [[TMP39:%.*]] = extractelement <4 x float> [[TMP11]], i32 0 +; NO-FMA-NEXT: [[FMA8:%.*]] = call float @llvm.fma.f32(float [[TMP37]], float [[TMP38]], float [[TMP39]]) +; NO-FMA-NEXT: [[TMP40:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 +; NO-FMA-NEXT: [[TMP41:%.*]] = extractelement <4 x float> [[TMP7]], i32 1 +; NO-FMA-NEXT: [[TMP42:%.*]] = extractelement <4 x float> [[TMP11]], i32 1 +; NO-FMA-NEXT: [[FMA9:%.*]] = call float @llvm.fma.f32(float [[TMP40]], float [[TMP41]], float [[TMP42]]) +; NO-FMA-NEXT: [[TMP43:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 +; NO-FMA-NEXT: [[TMP44:%.*]] = extractelement <4 x float> [[TMP7]], i32 2 +; NO-FMA-NEXT: [[TMP45:%.*]] = extractelement <4 x float> [[TMP11]], i32 2 +; NO-FMA-NEXT: [[FMA10:%.*]] = call float @llvm.fma.f32(float [[TMP43]], float [[TMP44]], float [[TMP45]]) +; NO-FMA-NEXT: [[TMP46:%.*]] = extractelement <4 x float> [[TMP3]], i32 3 +; NO-FMA-NEXT: [[TMP47:%.*]] = extractelement <4 x float> [[TMP7]], i32 3 +; NO-FMA-NEXT: [[TMP48:%.*]] = extractelement <4 x float> [[TMP11]], i32 3 +; NO-FMA-NEXT: [[FMA11:%.*]] = call float @llvm.fma.f32(float [[TMP46]], float [[TMP47]], float [[TMP48]]) +; NO-FMA-NEXT: [[TMP49:%.*]] = extractelement <4 x float> [[TMP4]], i32 0 +; NO-FMA-NEXT: [[TMP50:%.*]] = extractelement <4 x float> [[TMP8]], i32 0 +; NO-FMA-NEXT: [[TMP51:%.*]] = extractelement <4 x float> [[TMP12]], i32 0 +; NO-FMA-NEXT: [[FMA12:%.*]] = call float @llvm.fma.f32(float [[TMP49]], float [[TMP50]], float [[TMP51]]) +; NO-FMA-NEXT: [[TMP52:%.*]] = extractelement <4 x float> [[TMP4]], i32 1 +; NO-FMA-NEXT: [[TMP53:%.*]] = extractelement <4 x float> [[TMP8]], i32 1 +; NO-FMA-NEXT: [[TMP54:%.*]] = extractelement <4 x float> [[TMP12]], i32 1 +; NO-FMA-NEXT: [[FMA13:%.*]] = call float @llvm.fma.f32(float [[TMP52]], float [[TMP53]], float [[TMP54]]) +; NO-FMA-NEXT: [[TMP55:%.*]] = extractelement <4 x float> [[TMP4]], i32 2 +; NO-FMA-NEXT: [[TMP56:%.*]] = extractelement <4 x float> [[TMP8]], i32 2 +; NO-FMA-NEXT: [[TMP57:%.*]] = extractelement <4 x float> [[TMP12]], i32 2 +; NO-FMA-NEXT: [[FMA14:%.*]] = call float @llvm.fma.f32(float [[TMP55]], float [[TMP56]], float [[TMP57]]) +; NO-FMA-NEXT: [[TMP58:%.*]] = extractelement <4 x float> [[TMP4]], i32 3 +; NO-FMA-NEXT: [[TMP59:%.*]] = extractelement <4 x float> [[TMP8]], i32 3 +; NO-FMA-NEXT: [[TMP60:%.*]] = extractelement <4 x float> [[TMP12]], i32 3 +; NO-FMA-NEXT: [[FMA15:%.*]] = call float @llvm.fma.f32(float [[TMP58]], float [[TMP59]], float [[TMP60]]) +; NO-FMA-NEXT: [[TMP61:%.*]] = insertelement <4 x float> poison, float [[FMA0]], i32 0 +; NO-FMA-NEXT: [[TMP62:%.*]] = insertelement <4 x float> [[TMP61]], float [[FMA1]], i32 1 +; NO-FMA-NEXT: [[TMP63:%.*]] = insertelement <4 x float> [[TMP62]], float [[FMA2]], i32 2 +; NO-FMA-NEXT: [[TMP64:%.*]] = insertelement <4 x float> [[TMP63]], float [[FMA3]], i32 3 +; NO-FMA-NEXT: store 
<4 x float> [[TMP64]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP65:%.*]] = insertelement <4 x float> poison, float [[FMA4]], i32 0 +; NO-FMA-NEXT: [[TMP66:%.*]] = insertelement <4 x float> [[TMP65]], float [[FMA5]], i32 1 +; NO-FMA-NEXT: [[TMP67:%.*]] = insertelement <4 x float> [[TMP66]], float [[FMA6]], i32 2 +; NO-FMA-NEXT: [[TMP68:%.*]] = insertelement <4 x float> [[TMP67]], float [[FMA7]], i32 3 +; NO-FMA-NEXT: store <4 x float> [[TMP68]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP69:%.*]] = insertelement <4 x float> poison, float [[FMA8]], i32 0 +; NO-FMA-NEXT: [[TMP70:%.*]] = insertelement <4 x float> [[TMP69]], float [[FMA9]], i32 1 +; NO-FMA-NEXT: [[TMP71:%.*]] = insertelement <4 x float> [[TMP70]], float [[FMA10]], i32 2 +; NO-FMA-NEXT: [[TMP72:%.*]] = insertelement <4 x float> [[TMP71]], float [[FMA11]], i32 3 +; NO-FMA-NEXT: store <4 x float> [[TMP72]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP73:%.*]] = insertelement <4 x float> poison, float [[FMA12]], i32 0 +; NO-FMA-NEXT: [[TMP74:%.*]] = insertelement <4 x float> [[TMP73]], float [[FMA13]], i32 1 +; NO-FMA-NEXT: [[TMP75:%.*]] = insertelement <4 x float> [[TMP74]], float [[FMA14]], i32 2 +; NO-FMA-NEXT: [[TMP76:%.*]] = insertelement <4 x float> [[TMP75]], float [[FMA15]], i32 3 +; NO-FMA-NEXT: store <4 x float> [[TMP76]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; NO-FMA-NEXT: ret void ; ; FMA256-LABEL: @fma_16f32( Index: llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll +++ llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll @@ -21,59 +21,40 @@ define void @fptosi_8f64_8i64() #0 { ; SSE-LABEL: @fptosi_8f64_8i64( -; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE-NEXT: [[CVT0:%.*]] = fptosi double [[A0]] to i64 -; SSE-NEXT: [[CVT1:%.*]] = fptosi double [[A1]] to i64 -; SSE-NEXT: [[CVT2:%.*]] = fptosi double [[A2]] to i64 -; SSE-NEXT: [[CVT3:%.*]] = fptosi double [[A3]] to i64 -; SSE-NEXT: [[CVT4:%.*]] = fptosi double [[A4]] to i64 -; SSE-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i64 -; SSE-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i64 -; 
SSE-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i64 -; SSE-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = fptosi double [[TMP5]] to i64 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = fptosi double [[TMP6]] to i64 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; SSE-NEXT: [[CVT2:%.*]] = fptosi double [[TMP7]] to i64 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; SSE-NEXT: [[CVT3:%.*]] = fptosi double [[TMP8]] to i64 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; SSE-NEXT: [[CVT4:%.*]] = fptosi double [[TMP9]] to i64 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; SSE-NEXT: [[CVT5:%.*]] = fptosi double [[TMP10]] to i64 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 +; SSE-NEXT: [[CVT6:%.*]] = fptosi double [[TMP11]] to i64 +; SSE-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; SSE-NEXT: [[CVT7:%.*]] = fptosi double [[TMP12]] to i64 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> poison, i64 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> [[TMP13]], i64 [[CVT1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP14]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> poison, i64 [[CVT2]], i32 0 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP15]], i64 [[CVT3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP16]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP17:%.*]] = insertelement <2 x i64> poison, i64 [[CVT4]], i32 0 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <2 x i64> [[TMP17]], i64 [[CVT5]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP18]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, 
i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[CVT6]], i32 0 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[CVT7]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP20]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; -; AVX256NODQ-LABEL: @fptosi_8f64_8i64( -; AVX256NODQ-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptosi double [[A0]] to i64 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptosi double [[A1]] to i64 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptosi double [[A2]] to i64 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptosi double [[A3]] to i64 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptosi double [[A4]] to i64 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i64 -; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i64 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i64 -; AVX256NODQ-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: ret void -; ; AVX512-LABEL: @fptosi_8f64_8i64( ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 ; AVX512-NEXT: [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i64> @@ -254,57 +235,69 @@ define void @fptosi_8f32_8i64() #0 { ; SSE-LABEL: @fptosi_8f32_8i64( -; SSE-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], 
[16 x float]* @src32, i32 0, i64 1), align 4 -; SSE-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE-NEXT: [[CVT0:%.*]] = fptosi float [[A0]] to i64 -; SSE-NEXT: [[CVT1:%.*]] = fptosi float [[A1]] to i64 -; SSE-NEXT: [[CVT2:%.*]] = fptosi float [[A2]] to i64 -; SSE-NEXT: [[CVT3:%.*]] = fptosi float [[A3]] to i64 -; SSE-NEXT: [[CVT4:%.*]] = fptosi float [[A4]] to i64 -; SSE-NEXT: [[CVT5:%.*]] = fptosi float [[A5]] to i64 -; SSE-NEXT: [[CVT6:%.*]] = fptosi float [[A6]] to i64 -; SSE-NEXT: [[CVT7:%.*]] = fptosi float [[A7]] to i64 -; SSE-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([16 x float]* @src32 to <2 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2) to <2 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <2 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6) to <2 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = fptosi float [[TMP5]] to i64 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = fptosi float [[TMP6]] to i64 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; SSE-NEXT: [[CVT2:%.*]] = fptosi float [[TMP7]] to i64 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; SSE-NEXT: [[CVT3:%.*]] = fptosi float [[TMP8]] to i64 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; SSE-NEXT: [[CVT4:%.*]] = fptosi float [[TMP9]] to i64 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; SSE-NEXT: [[CVT5:%.*]] = 
fptosi float [[TMP10]] to i64 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; SSE-NEXT: [[CVT6:%.*]] = fptosi float [[TMP11]] to i64 +; SSE-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; SSE-NEXT: [[CVT7:%.*]] = fptosi float [[TMP12]] to i64 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> poison, i64 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> [[TMP13]], i64 [[CVT1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP14]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> poison, i64 [[CVT2]], i32 0 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP15]], i64 [[CVT3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP16]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP17:%.*]] = insertelement <2 x i64> poison, i64 [[CVT4]], i32 0 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <2 x i64> [[TMP17]], i64 [[CVT5]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP18]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[CVT6]], i32 0 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[CVT7]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP20]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; AVX256NODQ-LABEL: @fptosi_8f32_8i64( -; AVX256NODQ-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; AVX256NODQ-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; AVX256NODQ-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; AVX256NODQ-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; AVX256NODQ-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptosi float [[A0]] to i64 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptosi float [[A1]] to i64 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptosi float [[A2]] to i64 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptosi float [[A3]] to i64 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptosi float [[A4]] to i64 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptosi float [[A5]] to i64 -; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptosi float [[A6]] to i64 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptosi float [[A7]] to i64 -; AVX256NODQ-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store 
i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptosi float [[TMP3]] to i64 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptosi float [[TMP4]] to i64 +; AVX256NODQ-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptosi float [[TMP5]] to i64 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptosi float [[TMP6]] to i64 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptosi float [[TMP7]] to i64 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptosi float [[TMP8]] to i64 +; AVX256NODQ-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptosi float [[TMP9]] to i64 +; AVX256NODQ-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptosi float [[TMP10]] to i64 +; AVX256NODQ-NEXT: [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP12:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[CVT1]], i32 1 +; AVX256NODQ-NEXT: [[TMP13:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[CVT2]], i32 2 +; AVX256NODQ-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[CVT3]], i32 3 +; AVX256NODQ-NEXT: store <4 x i64> [[TMP14]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256NODQ-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[CVT4]], i32 0 +; AVX256NODQ-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[CVT5]], i32 1 +; AVX256NODQ-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[CVT6]], i32 2 +; AVX256NODQ-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[CVT7]], i32 3 +; AVX256NODQ-NEXT: store <4 x i64> [[TMP18]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256NODQ-NEXT: ret void ; ; AVX512-LABEL: @fptosi_8f32_8i64(
Index: llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll +++ llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll @@ -21,59 +21,40 @@ define void @fptoui_8f64_8i64() #0 { ; SSE-LABEL: @fptoui_8f64_8i64( -; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i64 -; SSE-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i64 -; SSE-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i64 -; SSE-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i64 -; SSE-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i64 -; SSE-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i64 -; SSE-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i64 -; SSE-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i64 -; SSE-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +;
SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = fptoui double [[TMP5]] to i64 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = fptoui double [[TMP6]] to i64 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; SSE-NEXT: [[CVT2:%.*]] = fptoui double [[TMP7]] to i64 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; SSE-NEXT: [[CVT3:%.*]] = fptoui double [[TMP8]] to i64 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; SSE-NEXT: [[CVT4:%.*]] = fptoui double [[TMP9]] to i64 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; SSE-NEXT: [[CVT5:%.*]] = fptoui double [[TMP10]] to i64 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 +; SSE-NEXT: [[CVT6:%.*]] = fptoui double [[TMP11]] to i64 +; SSE-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; SSE-NEXT: [[CVT7:%.*]] = fptoui double [[TMP12]] to i64 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> poison, i64 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> [[TMP13]], i64 [[CVT1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP14]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> poison, i64 [[CVT2]], i32 0 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP15]], i64 [[CVT3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP16]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP17:%.*]] = insertelement <2 x i64> poison, i64 [[CVT4]], i32 0 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <2 x i64> [[TMP17]], i64 [[CVT5]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP18]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[CVT6]], i32 0 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[CVT7]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP20]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; -; AVX256NODQ-LABEL: @fptoui_8f64_8i64( -; AVX256NODQ-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; 
AVX256NODQ-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i64 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i64 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i64 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i64 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i64 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i64 -; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i64 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i64 -; AVX256NODQ-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: ret void -; ; AVX512-LABEL: @fptoui_8f64_8i64( ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 ; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i64> @@ -118,57 +99,63 @@ define void @fptoui_8f64_8i32() #0 { ; SSE-LABEL: @fptoui_8f64_8i32( -; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i32 -; SSE-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i32 -; SSE-NEXT: [[CVT2:%.*]] = 
fptoui double [[A2]] to i32 -; SSE-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i32 -; SSE-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i32 -; SSE-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i32 -; SSE-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i32 -; SSE-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i32 -; SSE-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4 -; SSE-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4 -; SSE-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4 -; SSE-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4 -; SSE-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = fptoui double [[TMP3]] to i32 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = fptoui double [[TMP4]] to i32 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 2 +; SSE-NEXT: [[CVT2:%.*]] = fptoui double [[TMP5]] to i32 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[TMP1]], i32 3 +; SSE-NEXT: [[CVT3:%.*]] = fptoui double [[TMP6]] to i32 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <4 x double> [[TMP2]], i32 0 +; SSE-NEXT: [[CVT4:%.*]] = fptoui double [[TMP7]] to i32 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <4 x double> [[TMP2]], i32 1 +; SSE-NEXT: [[CVT5:%.*]] = fptoui double [[TMP8]] to i32 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x double> [[TMP2]], i32 2 +; SSE-NEXT: [[CVT6:%.*]] = fptoui double [[TMP9]] to i32 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <4 x double> [[TMP2]], i32 3 +; SSE-NEXT: [[CVT7:%.*]] = fptoui double [[TMP10]] to i32 +; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CVT1]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CVT2]], i32 2 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[CVT3]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> poison, i32 [[CVT4]], i32 0 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[CVT5]], i32 1 +; SSE-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[CVT6]], i32 2 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[CVT7]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP18]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; AVX256NODQ-LABEL: 
@fptoui_8f64_8i32( -; AVX256NODQ-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i32 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i32 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i32 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i32 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i32 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i32 -; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i32 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i32 -; AVX256NODQ-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0 +; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui double [[TMP2]] to i32 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1 +; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui double [[TMP3]] to i32 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2 +; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui double [[TMP4]] to i32 +; AVX256NODQ-NEXT: [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3 +; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui double [[TMP5]] to i32 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4 +; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui double [[TMP6]] to i32 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5 +; AVX256NODQ-NEXT: 
[[CVT5:%.*]] = fptoui double [[TMP7]] to i32 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6 +; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui double [[TMP8]] to i32 +; AVX256NODQ-NEXT: [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7 +; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui double [[TMP9]] to i32 +; AVX256NODQ-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> poison, i32 [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[CVT1]], i32 1 +; AVX256NODQ-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[CVT2]], i32 2 +; AVX256NODQ-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[CVT3]], i32 3 +; AVX256NODQ-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[CVT4]], i32 4 +; AVX256NODQ-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[CVT5]], i32 5 +; AVX256NODQ-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[CVT6]], i32 6 +; AVX256NODQ-NEXT: [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[CVT7]], i32 7 +; AVX256NODQ-NEXT: store <8 x i32> [[TMP17]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4 ; AVX256NODQ-NEXT: ret void ; ; AVX-LABEL: @fptoui_8f64_8i32( @@ -299,57 +286,69 @@ define void @fptoui_8f32_8i64() #0 { ; SSE-LABEL: @fptoui_8f32_8i64( -; SSE-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i64 -; SSE-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i64 -; SSE-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i64 -; SSE-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i64 -; SSE-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i64 -; SSE-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i64 -; SSE-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i64 -; SSE-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i64 -; SSE-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x 
i64]* @dst64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([16 x float]* @src32 to <2 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2) to <2 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <2 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6) to <2 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = fptoui float [[TMP5]] to i64 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = fptoui float [[TMP6]] to i64 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; SSE-NEXT: [[CVT2:%.*]] = fptoui float [[TMP7]] to i64 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; SSE-NEXT: [[CVT3:%.*]] = fptoui float [[TMP8]] to i64 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; SSE-NEXT: [[CVT4:%.*]] = fptoui float [[TMP9]] to i64 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; SSE-NEXT: [[CVT5:%.*]] = fptoui float [[TMP10]] to i64 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; SSE-NEXT: [[CVT6:%.*]] = fptoui float [[TMP11]] to i64 +; SSE-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; SSE-NEXT: [[CVT7:%.*]] = fptoui float [[TMP12]] to i64 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> poison, i64 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> [[TMP13]], i64 [[CVT1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP14]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> poison, i64 [[CVT2]], i32 0 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP15]], i64 [[CVT3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP16]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP17:%.*]] = insertelement <2 x i64> poison, i64 [[CVT4]], i32 0 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <2 x i64> [[TMP17]], i64 [[CVT5]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP18]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[CVT6]], i32 0 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[CVT7]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP20]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; AVX256NODQ-LABEL: @fptoui_8f32_8i64( -; AVX256NODQ-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; AVX256NODQ-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; AVX256NODQ-NEXT: [[A3:%.*]] = 
load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; AVX256NODQ-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; AVX256NODQ-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i64 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i64 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i64 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i64 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i64 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i64 -; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i64 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i64 -; AVX256NODQ-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui float [[TMP3]] to i64 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui float [[TMP4]] to i64 +; AVX256NODQ-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui float [[TMP5]] to i64 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui float [[TMP6]] to i64 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui float [[TMP7]] to i64 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui float [[TMP8]] to i64 +; AVX256NODQ-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui float [[TMP9]] to i64 +; AVX256NODQ-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui float [[TMP10]] to 
i64 +; AVX256NODQ-NEXT: [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP12:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[CVT1]], i32 1 +; AVX256NODQ-NEXT: [[TMP13:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[CVT2]], i32 2 +; AVX256NODQ-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[CVT3]], i32 3 +; AVX256NODQ-NEXT: store <4 x i64> [[TMP14]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256NODQ-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[CVT4]], i32 0 +; AVX256NODQ-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[CVT5]], i32 1 +; AVX256NODQ-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[CVT6]], i32 2 +; AVX256NODQ-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[CVT7]], i32 3 +; AVX256NODQ-NEXT: store <4 x i64> [[TMP18]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256NODQ-NEXT: ret void ; ; AVX512-LABEL: @fptoui_8f32_8i64( @@ -396,57 +395,63 @@ define void @fptoui_8f32_8i32() #0 { ; SSE-LABEL: @fptoui_8f32_8i32( -; SSE-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i32 -; SSE-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i32 -; SSE-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i32 -; SSE-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i32 -; SSE-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i32 -; SSE-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i32 -; SSE-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i32 -; SSE-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i32 -; SSE-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4 -; SSE-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4 -; SSE-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4 -; SSE-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4 -; SSE-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4 
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = fptoui float [[TMP3]] to i32 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = fptoui float [[TMP4]] to i32 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE-NEXT: [[CVT2:%.*]] = fptoui float [[TMP5]] to i32 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE-NEXT: [[CVT3:%.*]] = fptoui float [[TMP6]] to i32 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; SSE-NEXT: [[CVT4:%.*]] = fptoui float [[TMP7]] to i32 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; SSE-NEXT: [[CVT5:%.*]] = fptoui float [[TMP8]] to i32 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; SSE-NEXT: [[CVT6:%.*]] = fptoui float [[TMP9]] to i32 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; SSE-NEXT: [[CVT7:%.*]] = fptoui float [[TMP10]] to i32 +; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CVT1]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CVT2]], i32 2 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[CVT3]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> poison, i32 [[CVT4]], i32 0 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[CVT5]], i32 1 +; SSE-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[CVT6]], i32 2 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[CVT7]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP18]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; AVX256NODQ-LABEL: @fptoui_8f32_8i32( -; AVX256NODQ-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; AVX256NODQ-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; AVX256NODQ-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; AVX256NODQ-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; AVX256NODQ-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i32 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i32 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui 
float [[A2]] to i32 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i32 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i32 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i32 -; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i32 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i32 -; AVX256NODQ-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0 +; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui float [[TMP2]] to i32 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1 +; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui float [[TMP3]] to i32 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2 +; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui float [[TMP4]] to i32 +; AVX256NODQ-NEXT: [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3 +; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui float [[TMP5]] to i32 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4 +; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui float [[TMP6]] to i32 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5 +; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui float [[TMP7]] to i32 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6 +; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui float [[TMP8]] to i32 +; AVX256NODQ-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7 +; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui float [[TMP9]] to i32 +; AVX256NODQ-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> poison, i32 [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[CVT1]], i32 1 +; AVX256NODQ-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[CVT2]], i32 2 +; AVX256NODQ-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[CVT3]], i32 3 +; AVX256NODQ-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[CVT4]], i32 4 +; AVX256NODQ-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[CVT5]], i32 5 +; AVX256NODQ-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[CVT6]], i32 6 +; AVX256NODQ-NEXT: [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[CVT7]], i32 7 +; AVX256NODQ-NEXT: store <8 x i32> [[TMP17]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4 ; AVX256NODQ-NEXT: ret void ; ; AVX-LABEL: @fptoui_8f32_8i32( Index: llvm/test/Transforms/SLPVectorizer/X86/fround.ll 
=================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/fround.ll +++ llvm/test/Transforms/SLPVectorizer/X86/fround.ll @@ -27,12 +27,14 @@ define void @ceil_2f64() #0 { ; SSE2-LABEL: @ceil_2f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[LD0]]) -; SSE2-NEXT: [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[LD1]]) -; SSE2-NEXT: store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[TMP2]]) +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[CEIL0]], i32 0 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[CEIL1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @ceil_2f64( @@ -58,18 +60,22 @@ define void @ceil_4f64() #0 { ; SSE2-LABEL: @ceil_4f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[LD0]]) -; SSE2-NEXT: [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[LD1]]) -; SSE2-NEXT: [[CEIL2:%.*]] = call double @llvm.ceil.f64(double [[LD2]]) -; SSE2-NEXT: [[CEIL3:%.*]] = call double @llvm.ceil.f64(double [[LD3]]) -; SSE2-NEXT: store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[CEIL2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[CEIL3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[TMP3]]) +; SSE2-NEXT: 
[[TMP4:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; SSE2-NEXT: [[CEIL2:%.*]] = call double @llvm.ceil.f64(double [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; SSE2-NEXT: [[CEIL3:%.*]] = call double @llvm.ceil.f64(double [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[CEIL0]], i32 0 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[CEIL1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <2 x double> poison, double [[CEIL2]], i32 0 +; SSE2-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[CEIL3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @ceil_4f64( @@ -104,30 +110,38 @@ define void @ceil_8f64() #0 { ; SSE2-LABEL: @ceil_8f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE2-NEXT: [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE2-NEXT: [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE2-NEXT: [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE2-NEXT: [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[LD0]]) -; SSE2-NEXT: [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[LD1]]) -; SSE2-NEXT: [[CEIL2:%.*]] = call double @llvm.ceil.f64(double [[LD2]]) -; SSE2-NEXT: [[CEIL3:%.*]] = call double @llvm.ceil.f64(double [[LD3]]) -; SSE2-NEXT: [[CEIL4:%.*]] = call double @llvm.ceil.f64(double [[LD4]]) -; SSE2-NEXT: [[CEIL5:%.*]] = call double @llvm.ceil.f64(double [[LD5]]) -; SSE2-NEXT: [[CEIL6:%.*]] = call double @llvm.ceil.f64(double [[LD6]]) -; SSE2-NEXT: [[CEIL7:%.*]] = call double @llvm.ceil.f64(double [[LD7]]) -; SSE2-NEXT: store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[CEIL2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[CEIL3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: store double [[CEIL4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 -; SSE2-NEXT: store double [[CEIL5]], double* getelementptr 
inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE2-NEXT: store double [[CEIL6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 -; SSE2-NEXT: store double [[CEIL7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; SSE2-NEXT: [[CEIL2:%.*]] = call double @llvm.ceil.f64(double [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; SSE2-NEXT: [[CEIL3:%.*]] = call double @llvm.ceil.f64(double [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; SSE2-NEXT: [[CEIL4:%.*]] = call double @llvm.ceil.f64(double [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; SSE2-NEXT: [[CEIL5:%.*]] = call double @llvm.ceil.f64(double [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 +; SSE2-NEXT: [[CEIL6:%.*]] = call double @llvm.ceil.f64(double [[TMP11]]) +; SSE2-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; SSE2-NEXT: [[CEIL7:%.*]] = call double @llvm.ceil.f64(double [[TMP12]]) +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[CEIL0]], i32 0 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[CEIL1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP14]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <2 x double> poison, double [[CEIL2]], i32 0 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <2 x double> [[TMP15]], double [[CEIL3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP16]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP17:%.*]] = insertelement <2 x double> poison, double [[CEIL4]], i32 0 +; SSE2-NEXT: [[TMP18:%.*]] = insertelement <2 x double> [[TMP17]], double [[CEIL5]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP18]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP19:%.*]] = insertelement <2 x double> poison, double [[CEIL6]], i32 0 +; SSE2-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[CEIL7]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP20]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE2-NEXT: ret 
void ; ; SSE41-LABEL: @ceil_8f64( @@ -198,12 +212,14 @@ define void @floor_2f64() #0 { ; SSE2-LABEL: @floor_2f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[LD0]]) -; SSE2-NEXT: [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[LD1]]) -; SSE2-NEXT: store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[TMP2]]) +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[FLOOR0]], i32 0 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[FLOOR1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @floor_2f64( @@ -229,18 +245,22 @@ define void @floor_4f64() #0 { ; SSE2-LABEL: @floor_4f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[LD0]]) -; SSE2-NEXT: [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[LD1]]) -; SSE2-NEXT: [[FLOOR2:%.*]] = call double @llvm.floor.f64(double [[LD2]]) -; SSE2-NEXT: [[FLOOR3:%.*]] = call double @llvm.floor.f64(double [[LD3]]) -; SSE2-NEXT: store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[FLOOR2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[FLOOR3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[FLOOR1:%.*]] = call double 
@llvm.floor.f64(double [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; SSE2-NEXT: [[FLOOR2:%.*]] = call double @llvm.floor.f64(double [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; SSE2-NEXT: [[FLOOR3:%.*]] = call double @llvm.floor.f64(double [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[FLOOR0]], i32 0 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[FLOOR1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <2 x double> poison, double [[FLOOR2]], i32 0 +; SSE2-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[FLOOR3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @floor_4f64( @@ -275,30 +295,38 @@ define void @floor_8f64() #0 { ; SSE2-LABEL: @floor_8f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE2-NEXT: [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE2-NEXT: [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE2-NEXT: [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE2-NEXT: [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[LD0]]) -; SSE2-NEXT: [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[LD1]]) -; SSE2-NEXT: [[FLOOR2:%.*]] = call double @llvm.floor.f64(double [[LD2]]) -; SSE2-NEXT: [[FLOOR3:%.*]] = call double @llvm.floor.f64(double [[LD3]]) -; SSE2-NEXT: [[FLOOR4:%.*]] = call double @llvm.floor.f64(double [[LD4]]) -; SSE2-NEXT: [[FLOOR5:%.*]] = call double @llvm.floor.f64(double [[LD5]]) -; SSE2-NEXT: [[FLOOR6:%.*]] = call double @llvm.floor.f64(double [[LD6]]) -; SSE2-NEXT: [[FLOOR7:%.*]] = call double @llvm.floor.f64(double [[LD7]]) -; SSE2-NEXT: store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[FLOOR2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[FLOOR3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: store double [[FLOOR4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 -; SSE2-NEXT: store double [[FLOOR5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; 
SSE2-NEXT: store double [[FLOOR6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 -; SSE2-NEXT: store double [[FLOOR7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; SSE2-NEXT: [[FLOOR2:%.*]] = call double @llvm.floor.f64(double [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; SSE2-NEXT: [[FLOOR3:%.*]] = call double @llvm.floor.f64(double [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; SSE2-NEXT: [[FLOOR4:%.*]] = call double @llvm.floor.f64(double [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; SSE2-NEXT: [[FLOOR5:%.*]] = call double @llvm.floor.f64(double [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 +; SSE2-NEXT: [[FLOOR6:%.*]] = call double @llvm.floor.f64(double [[TMP11]]) +; SSE2-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; SSE2-NEXT: [[FLOOR7:%.*]] = call double @llvm.floor.f64(double [[TMP12]]) +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[FLOOR0]], i32 0 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[FLOOR1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP14]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <2 x double> poison, double [[FLOOR2]], i32 0 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <2 x double> [[TMP15]], double [[FLOOR3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP16]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP17:%.*]] = insertelement <2 x double> poison, double [[FLOOR4]], i32 0 +; SSE2-NEXT: [[TMP18:%.*]] = insertelement <2 x double> [[TMP17]], double [[FLOOR5]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP18]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP19:%.*]] = insertelement <2 x double> poison, double [[FLOOR6]], i32 0 +; SSE2-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[FLOOR7]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP20]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @floor_8f64( @@ -369,12 
+397,14 @@ define void @nearbyint_2f64() #0 { ; SSE2-LABEL: @nearbyint_2f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[LD0]]) -; SSE2-NEXT: [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[LD1]]) -; SSE2-NEXT: store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[TMP2]]) +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[NEARBYINT0]], i32 0 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[NEARBYINT1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @nearbyint_2f64( @@ -400,18 +430,22 @@ define void @nearbyint_4f64() #0 { ; SSE2-LABEL: @nearbyint_4f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[LD0]]) -; SSE2-NEXT: [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[LD1]]) -; SSE2-NEXT: [[NEARBYINT2:%.*]] = call double @llvm.nearbyint.f64(double [[LD2]]) -; SSE2-NEXT: [[NEARBYINT3:%.*]] = call double @llvm.nearbyint.f64(double [[LD3]]) -; SSE2-NEXT: store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[NEARBYINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[NEARBYINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x 
double> [[TMP1]], i32 1 +; SSE2-NEXT: [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; SSE2-NEXT: [[NEARBYINT2:%.*]] = call double @llvm.nearbyint.f64(double [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; SSE2-NEXT: [[NEARBYINT3:%.*]] = call double @llvm.nearbyint.f64(double [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[NEARBYINT0]], i32 0 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[NEARBYINT1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <2 x double> poison, double [[NEARBYINT2]], i32 0 +; SSE2-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[NEARBYINT3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @nearbyint_4f64( @@ -446,30 +480,38 @@ define void @nearbyint_8f64() #0 { ; SSE2-LABEL: @nearbyint_8f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE2-NEXT: [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE2-NEXT: [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE2-NEXT: [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE2-NEXT: [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[LD0]]) -; SSE2-NEXT: [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[LD1]]) -; SSE2-NEXT: [[NEARBYINT2:%.*]] = call double @llvm.nearbyint.f64(double [[LD2]]) -; SSE2-NEXT: [[NEARBYINT3:%.*]] = call double @llvm.nearbyint.f64(double [[LD3]]) -; SSE2-NEXT: [[NEARBYINT4:%.*]] = call double @llvm.nearbyint.f64(double [[LD4]]) -; SSE2-NEXT: [[NEARBYINT5:%.*]] = call double @llvm.nearbyint.f64(double [[LD5]]) -; SSE2-NEXT: [[NEARBYINT6:%.*]] = call double @llvm.nearbyint.f64(double [[LD6]]) -; SSE2-NEXT: [[NEARBYINT7:%.*]] = call double @llvm.nearbyint.f64(double [[LD7]]) -; SSE2-NEXT: store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[NEARBYINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[NEARBYINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: store double [[NEARBYINT4]], double* getelementptr 
inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 -; SSE2-NEXT: store double [[NEARBYINT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE2-NEXT: store double [[NEARBYINT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 -; SSE2-NEXT: store double [[NEARBYINT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; SSE2-NEXT: [[NEARBYINT2:%.*]] = call double @llvm.nearbyint.f64(double [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; SSE2-NEXT: [[NEARBYINT3:%.*]] = call double @llvm.nearbyint.f64(double [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; SSE2-NEXT: [[NEARBYINT4:%.*]] = call double @llvm.nearbyint.f64(double [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; SSE2-NEXT: [[NEARBYINT5:%.*]] = call double @llvm.nearbyint.f64(double [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 +; SSE2-NEXT: [[NEARBYINT6:%.*]] = call double @llvm.nearbyint.f64(double [[TMP11]]) +; SSE2-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; SSE2-NEXT: [[NEARBYINT7:%.*]] = call double @llvm.nearbyint.f64(double [[TMP12]]) +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[NEARBYINT0]], i32 0 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[NEARBYINT1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP14]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <2 x double> poison, double [[NEARBYINT2]], i32 0 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <2 x double> [[TMP15]], double [[NEARBYINT3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP16]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP17:%.*]] = insertelement <2 x double> poison, double [[NEARBYINT4]], i32 0 +; SSE2-NEXT: [[TMP18:%.*]] = insertelement <2 x double> [[TMP17]], double [[NEARBYINT5]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP18]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP19:%.*]] = insertelement <2 x double> poison, double [[NEARBYINT6]], i32 0 +; SSE2-NEXT: [[TMP20:%.*]] 
= insertelement <2 x double> [[TMP19]], double [[NEARBYINT7]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP20]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @nearbyint_8f64( @@ -540,12 +582,14 @@ define void @rint_2f64() #0 { ; SSE2-LABEL: @rint_2f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[RINT0:%.*]] = call double @llvm.rint.f64(double [[LD0]]) -; SSE2-NEXT: [[RINT1:%.*]] = call double @llvm.rint.f64(double [[LD1]]) -; SSE2-NEXT: store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[RINT0:%.*]] = call double @llvm.rint.f64(double [[TMP2]]) +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[RINT1:%.*]] = call double @llvm.rint.f64(double [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[RINT0]], i32 0 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[RINT1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @rint_2f64( @@ -571,18 +615,22 @@ define void @rint_4f64() #0 { ; SSE2-LABEL: @rint_4f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[RINT0:%.*]] = call double @llvm.rint.f64(double [[LD0]]) -; SSE2-NEXT: [[RINT1:%.*]] = call double @llvm.rint.f64(double [[LD1]]) -; SSE2-NEXT: [[RINT2:%.*]] = call double @llvm.rint.f64(double [[LD2]]) -; SSE2-NEXT: [[RINT3:%.*]] = call double @llvm.rint.f64(double [[LD3]]) -; SSE2-NEXT: store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[RINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[RINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = 
extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[RINT0:%.*]] = call double @llvm.rint.f64(double [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[RINT1:%.*]] = call double @llvm.rint.f64(double [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; SSE2-NEXT: [[RINT2:%.*]] = call double @llvm.rint.f64(double [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; SSE2-NEXT: [[RINT3:%.*]] = call double @llvm.rint.f64(double [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[RINT0]], i32 0 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[RINT1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <2 x double> poison, double [[RINT2]], i32 0 +; SSE2-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[RINT3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @rint_4f64( @@ -617,30 +665,38 @@ define void @rint_8f64() #0 { ; SSE2-LABEL: @rint_8f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE2-NEXT: [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE2-NEXT: [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE2-NEXT: [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE2-NEXT: [[RINT0:%.*]] = call double @llvm.rint.f64(double [[LD0]]) -; SSE2-NEXT: [[RINT1:%.*]] = call double @llvm.rint.f64(double [[LD1]]) -; SSE2-NEXT: [[RINT2:%.*]] = call double @llvm.rint.f64(double [[LD2]]) -; SSE2-NEXT: [[RINT3:%.*]] = call double @llvm.rint.f64(double [[LD3]]) -; SSE2-NEXT: [[RINT4:%.*]] = call double @llvm.rint.f64(double [[LD4]]) -; SSE2-NEXT: [[RINT5:%.*]] = call double @llvm.rint.f64(double [[LD5]]) -; SSE2-NEXT: [[RINT6:%.*]] = call double @llvm.rint.f64(double [[LD6]]) -; SSE2-NEXT: [[RINT7:%.*]] = call double @llvm.rint.f64(double [[LD7]]) -; SSE2-NEXT: store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[RINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[RINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: store double [[RINT4]], double* getelementptr 
inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 -; SSE2-NEXT: store double [[RINT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE2-NEXT: store double [[RINT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 -; SSE2-NEXT: store double [[RINT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[RINT0:%.*]] = call double @llvm.rint.f64(double [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[RINT1:%.*]] = call double @llvm.rint.f64(double [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; SSE2-NEXT: [[RINT2:%.*]] = call double @llvm.rint.f64(double [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; SSE2-NEXT: [[RINT3:%.*]] = call double @llvm.rint.f64(double [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; SSE2-NEXT: [[RINT4:%.*]] = call double @llvm.rint.f64(double [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; SSE2-NEXT: [[RINT5:%.*]] = call double @llvm.rint.f64(double [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 +; SSE2-NEXT: [[RINT6:%.*]] = call double @llvm.rint.f64(double [[TMP11]]) +; SSE2-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; SSE2-NEXT: [[RINT7:%.*]] = call double @llvm.rint.f64(double [[TMP12]]) +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[RINT0]], i32 0 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[RINT1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP14]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <2 x double> poison, double [[RINT2]], i32 0 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <2 x double> [[TMP15]], double [[RINT3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP16]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP17:%.*]] = insertelement <2 x double> poison, double [[RINT4]], i32 0 +; SSE2-NEXT: [[TMP18:%.*]] = insertelement <2 x double> [[TMP17]], double [[RINT5]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP18]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP19:%.*]] = insertelement <2 x double> poison, double [[RINT6]], i32 0 +; SSE2-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[RINT7]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP20]], <2 x double>* bitcast 
(double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @rint_8f64( @@ -711,12 +767,14 @@ define void @trunc_2f64() #0 { ; SSE2-LABEL: @trunc_2f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[LD0]]) -; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[LD1]]) -; SSE2-NEXT: store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[TMP2]]) +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TRUNC0]], i32 0 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[TRUNC1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @trunc_2f64( @@ -742,18 +800,22 @@ define void @trunc_4f64() #0 { ; SSE2-LABEL: @trunc_4f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[LD0]]) -; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[LD1]]) -; SSE2-NEXT: [[TRUNC2:%.*]] = call double @llvm.trunc.f64(double [[LD2]]) -; SSE2-NEXT: [[TRUNC3:%.*]] = call double @llvm.trunc.f64(double [[LD3]]) -; SSE2-NEXT: store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[TRUNC2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[TRUNC3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[TMP3]]) 
+; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; SSE2-NEXT: [[TRUNC2:%.*]] = call double @llvm.trunc.f64(double [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; SSE2-NEXT: [[TRUNC3:%.*]] = call double @llvm.trunc.f64(double [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TRUNC0]], i32 0 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TRUNC1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <2 x double> poison, double [[TRUNC2]], i32 0 +; SSE2-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[TRUNC3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @trunc_4f64( @@ -788,30 +850,38 @@ define void @trunc_8f64() #0 { ; SSE2-LABEL: @trunc_8f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE2-NEXT: [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE2-NEXT: [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE2-NEXT: [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE2-NEXT: [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[LD0]]) -; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[LD1]]) -; SSE2-NEXT: [[TRUNC2:%.*]] = call double @llvm.trunc.f64(double [[LD2]]) -; SSE2-NEXT: [[TRUNC3:%.*]] = call double @llvm.trunc.f64(double [[LD3]]) -; SSE2-NEXT: [[TRUNC4:%.*]] = call double @llvm.trunc.f64(double [[LD4]]) -; SSE2-NEXT: [[TRUNC5:%.*]] = call double @llvm.trunc.f64(double [[LD5]]) -; SSE2-NEXT: [[TRUNC6:%.*]] = call double @llvm.trunc.f64(double [[LD6]]) -; SSE2-NEXT: [[TRUNC7:%.*]] = call double @llvm.trunc.f64(double [[LD7]]) -; SSE2-NEXT: store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[TRUNC2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[TRUNC3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: store double [[TRUNC4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 -; SSE2-NEXT: 
store double [[TRUNC5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE2-NEXT: store double [[TRUNC6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 -; SSE2-NEXT: store double [[TRUNC7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; SSE2-NEXT: [[TRUNC2:%.*]] = call double @llvm.trunc.f64(double [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; SSE2-NEXT: [[TRUNC3:%.*]] = call double @llvm.trunc.f64(double [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; SSE2-NEXT: [[TRUNC4:%.*]] = call double @llvm.trunc.f64(double [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; SSE2-NEXT: [[TRUNC5:%.*]] = call double @llvm.trunc.f64(double [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 +; SSE2-NEXT: [[TRUNC6:%.*]] = call double @llvm.trunc.f64(double [[TMP11]]) +; SSE2-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; SSE2-NEXT: [[TRUNC7:%.*]] = call double @llvm.trunc.f64(double [[TMP12]]) +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[TRUNC0]], i32 0 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[TRUNC1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP14]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <2 x double> poison, double [[TRUNC2]], i32 0 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <2 x double> [[TMP15]], double [[TRUNC3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP16]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP17:%.*]] = insertelement <2 x double> poison, double [[TRUNC4]], i32 0 +; SSE2-NEXT: [[TMP18:%.*]] = insertelement <2 x double> [[TMP17]], double [[TRUNC5]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP18]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP19:%.*]] = insertelement <2 x double> poison, double [[TRUNC6]], i32 0 +; SSE2-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[TRUNC7]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP20]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x 
double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @trunc_8f64( @@ -882,18 +952,20 @@ define void @ceil_4f32() #0 { ; SSE2-LABEL: @ceil_4f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[LD0]]) -; SSE2-NEXT: [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[LD1]]) -; SSE2-NEXT: [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[LD2]]) -; SSE2-NEXT: [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[LD3]]) -; SSE2-NEXT: store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[CEIL2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[TMP2]]) +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> poison, float [[CEIL0]], i32 0 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[CEIL1]], i32 1 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[CEIL2]], i32 2 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[CEIL3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @ceil_4f32( @@ -925,30 +997,34 @@ define void @ceil_8f32() #0 { ; SSE2-LABEL: @ceil_8f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), 
align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[LD0]]) -; SSE2-NEXT: [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[LD1]]) -; SSE2-NEXT: [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[LD2]]) -; SSE2-NEXT: [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[LD3]]) -; SSE2-NEXT: [[CEIL4:%.*]] = call float @llvm.ceil.f32(float [[LD4]]) -; SSE2-NEXT: [[CEIL5:%.*]] = call float @llvm.ceil.f32(float [[LD5]]) -; SSE2-NEXT: [[CEIL6:%.*]] = call float @llvm.ceil.f32(float [[LD6]]) -; SSE2-NEXT: [[CEIL7:%.*]] = call float @llvm.ceil.f32(float [[LD7]]) -; SSE2-NEXT: store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[CEIL2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[CEIL4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[CEIL5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[CEIL6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[CEIL7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; SSE2-NEXT: [[CEIL4:%.*]] = call float @llvm.ceil.f32(float [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; SSE2-NEXT: [[CEIL5:%.*]] = call float @llvm.ceil.f32(float [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; SSE2-NEXT: [[CEIL6:%.*]] = call float @llvm.ceil.f32(float [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; SSE2-NEXT: [[CEIL7:%.*]] = call float @llvm.ceil.f32(float [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[CEIL0]], i32 0 +; SSE2-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[CEIL1]], i32 1 +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float 
[[CEIL2]], i32 2 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[CEIL3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP14]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <4 x float> poison, float [[CEIL4]], i32 0 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[CEIL5]], i32 1 +; SSE2-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[CEIL6]], i32 2 +; SSE2-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[CEIL7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP18]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @ceil_8f32( @@ -995,54 +1071,62 @@ define void @ceil_16f32() #0 { ; SSE2-LABEL: @ceil_16f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4 -; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4 -; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 -; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 -; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 -; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 -; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 -; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 -; SSE2-NEXT: [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[LD0]]) -; SSE2-NEXT: [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[LD1]]) -; SSE2-NEXT: [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[LD2]]) -; SSE2-NEXT: [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[LD3]]) -; SSE2-NEXT: [[CEIL4:%.*]] = call float @llvm.ceil.f32(float [[LD4]]) -; SSE2-NEXT: [[CEIL5:%.*]] = call float @llvm.ceil.f32(float [[LD5]]) -; SSE2-NEXT: [[CEIL6:%.*]] = call float @llvm.ceil.f32(float [[LD6]]) -; SSE2-NEXT: [[CEIL7:%.*]] = call float @llvm.ceil.f32(float [[LD7]]) -; SSE2-NEXT: 
[[CEIL8:%.*]] = call float @llvm.ceil.f32(float [[LD8]]) -; SSE2-NEXT: [[CEIL9:%.*]] = call float @llvm.ceil.f32(float [[LD9]]) -; SSE2-NEXT: [[CEIL10:%.*]] = call float @llvm.ceil.f32(float [[LD10]]) -; SSE2-NEXT: [[CEIL11:%.*]] = call float @llvm.ceil.f32(float [[LD11]]) -; SSE2-NEXT: [[CEIL12:%.*]] = call float @llvm.ceil.f32(float [[LD12]]) -; SSE2-NEXT: [[CEIL13:%.*]] = call float @llvm.ceil.f32(float [[LD13]]) -; SSE2-NEXT: [[CEIL14:%.*]] = call float @llvm.ceil.f32(float [[LD14]]) -; SSE2-NEXT: [[CEIL15:%.*]] = call float @llvm.ceil.f32(float [[LD15]]) -; SSE2-NEXT: store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[CEIL2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[CEIL4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[CEIL5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[CEIL6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[CEIL7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: store float [[CEIL8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; SSE2-NEXT: store float [[CEIL9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE2-NEXT: store float [[CEIL10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; SSE2-NEXT: store float [[CEIL11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE2-NEXT: store float [[CEIL12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; SSE2-NEXT: store float [[CEIL13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE2-NEXT: store float [[CEIL14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; SSE2-NEXT: store float [[CEIL15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[TMP6]]) +; 
SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; SSE2-NEXT: [[CEIL4:%.*]] = call float @llvm.ceil.f32(float [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; SSE2-NEXT: [[CEIL5:%.*]] = call float @llvm.ceil.f32(float [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; SSE2-NEXT: [[CEIL6:%.*]] = call float @llvm.ceil.f32(float [[TMP11]]) +; SSE2-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; SSE2-NEXT: [[CEIL7:%.*]] = call float @llvm.ceil.f32(float [[TMP12]]) +; SSE2-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 +; SSE2-NEXT: [[CEIL8:%.*]] = call float @llvm.ceil.f32(float [[TMP13]]) +; SSE2-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 +; SSE2-NEXT: [[CEIL9:%.*]] = call float @llvm.ceil.f32(float [[TMP14]]) +; SSE2-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 +; SSE2-NEXT: [[CEIL10:%.*]] = call float @llvm.ceil.f32(float [[TMP15]]) +; SSE2-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP3]], i32 3 +; SSE2-NEXT: [[CEIL11:%.*]] = call float @llvm.ceil.f32(float [[TMP16]]) +; SSE2-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP4]], i32 0 +; SSE2-NEXT: [[CEIL12:%.*]] = call float @llvm.ceil.f32(float [[TMP17]]) +; SSE2-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP4]], i32 1 +; SSE2-NEXT: [[CEIL13:%.*]] = call float @llvm.ceil.f32(float [[TMP18]]) +; SSE2-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP4]], i32 2 +; SSE2-NEXT: [[CEIL14:%.*]] = call float @llvm.ceil.f32(float [[TMP19]]) +; SSE2-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP4]], i32 3 +; SSE2-NEXT: [[CEIL15:%.*]] = call float @llvm.ceil.f32(float [[TMP20]]) +; SSE2-NEXT: [[TMP21:%.*]] = insertelement <4 x float> poison, float [[CEIL0]], i32 0 +; SSE2-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[CEIL1]], i32 1 +; SSE2-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[CEIL2]], i32 2 +; SSE2-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[CEIL3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP24]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP25:%.*]] = insertelement <4 x float> poison, float [[CEIL4]], i32 0 +; SSE2-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[CEIL5]], i32 1 +; SSE2-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[CEIL6]], i32 2 +; SSE2-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[TMP27]], float [[CEIL7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP28]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP29:%.*]] = insertelement <4 x float> poison, float [[CEIL8]], i32 0 +; SSE2-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[CEIL9]], i32 1 +; SSE2-NEXT: [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[CEIL10]], i32 2 +; SSE2-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[CEIL11]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP32]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x 
float>*), align 4 +; SSE2-NEXT: [[TMP33:%.*]] = insertelement <4 x float> poison, float [[CEIL12]], i32 0 +; SSE2-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[CEIL13]], i32 1 +; SSE2-NEXT: [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[CEIL14]], i32 2 +; SSE2-NEXT: [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[CEIL15]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP36]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @ceil_16f32( @@ -1137,18 +1221,20 @@ define void @floor_4f32() #0 { ; SSE2-LABEL: @floor_4f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[LD0]]) -; SSE2-NEXT: [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[LD1]]) -; SSE2-NEXT: [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[LD2]]) -; SSE2-NEXT: [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[LD3]]) -; SSE2-NEXT: store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[TMP2]]) +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> poison, float [[FLOOR0]], i32 0 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[FLOOR1]], i32 1 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[FLOOR2]], i32 2 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[FLOOR3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @floor_4f32( @@ -1180,30 +1266,34 @@ define void @floor_8f32() #0 { ; SSE2-LABEL: @floor_8f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, 
i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[LD0]]) -; SSE2-NEXT: [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[LD1]]) -; SSE2-NEXT: [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[LD2]]) -; SSE2-NEXT: [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[LD3]]) -; SSE2-NEXT: [[FLOOR4:%.*]] = call float @llvm.floor.f32(float [[LD4]]) -; SSE2-NEXT: [[FLOOR5:%.*]] = call float @llvm.floor.f32(float [[LD5]]) -; SSE2-NEXT: [[FLOOR6:%.*]] = call float @llvm.floor.f32(float [[LD6]]) -; SSE2-NEXT: [[FLOOR7:%.*]] = call float @llvm.floor.f32(float [[LD7]]) -; SSE2-NEXT: store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[FLOOR4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[FLOOR5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[FLOOR6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[FLOOR7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; SSE2-NEXT: [[FLOOR4:%.*]] = call float @llvm.floor.f32(float [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; SSE2-NEXT: [[FLOOR5:%.*]] = call float 
@llvm.floor.f32(float [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; SSE2-NEXT: [[FLOOR6:%.*]] = call float @llvm.floor.f32(float [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; SSE2-NEXT: [[FLOOR7:%.*]] = call float @llvm.floor.f32(float [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[FLOOR0]], i32 0 +; SSE2-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[FLOOR1]], i32 1 +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[FLOOR2]], i32 2 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[FLOOR3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP14]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <4 x float> poison, float [[FLOOR4]], i32 0 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[FLOOR5]], i32 1 +; SSE2-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[FLOOR6]], i32 2 +; SSE2-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[FLOOR7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP18]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @floor_8f32( @@ -1250,54 +1340,62 @@ define void @floor_16f32() #0 { ; SSE2-LABEL: @floor_16f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4 -; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4 -; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 -; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 -; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 -; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 -; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 -; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), 
align 4 -; SSE2-NEXT: [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[LD0]]) -; SSE2-NEXT: [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[LD1]]) -; SSE2-NEXT: [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[LD2]]) -; SSE2-NEXT: [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[LD3]]) -; SSE2-NEXT: [[FLOOR4:%.*]] = call float @llvm.floor.f32(float [[LD4]]) -; SSE2-NEXT: [[FLOOR5:%.*]] = call float @llvm.floor.f32(float [[LD5]]) -; SSE2-NEXT: [[FLOOR6:%.*]] = call float @llvm.floor.f32(float [[LD6]]) -; SSE2-NEXT: [[FLOOR7:%.*]] = call float @llvm.floor.f32(float [[LD7]]) -; SSE2-NEXT: [[FLOOR8:%.*]] = call float @llvm.floor.f32(float [[LD8]]) -; SSE2-NEXT: [[FLOOR9:%.*]] = call float @llvm.floor.f32(float [[LD9]]) -; SSE2-NEXT: [[FLOOR10:%.*]] = call float @llvm.floor.f32(float [[LD10]]) -; SSE2-NEXT: [[FLOOR11:%.*]] = call float @llvm.floor.f32(float [[LD11]]) -; SSE2-NEXT: [[FLOOR12:%.*]] = call float @llvm.floor.f32(float [[LD12]]) -; SSE2-NEXT: [[FLOOR13:%.*]] = call float @llvm.floor.f32(float [[LD13]]) -; SSE2-NEXT: [[FLOOR14:%.*]] = call float @llvm.floor.f32(float [[LD14]]) -; SSE2-NEXT: [[FLOOR15:%.*]] = call float @llvm.floor.f32(float [[LD15]]) -; SSE2-NEXT: store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[FLOOR4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[FLOOR5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[FLOOR6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[FLOOR7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: store float [[FLOOR8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; SSE2-NEXT: store float [[FLOOR9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE2-NEXT: store float [[FLOOR10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; SSE2-NEXT: store float [[FLOOR11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE2-NEXT: store float [[FLOOR12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; SSE2-NEXT: store float [[FLOOR13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE2-NEXT: store float [[FLOOR14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; SSE2-NEXT: store float [[FLOOR15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: 
[[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; SSE2-NEXT: [[FLOOR4:%.*]] = call float @llvm.floor.f32(float [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; SSE2-NEXT: [[FLOOR5:%.*]] = call float @llvm.floor.f32(float [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; SSE2-NEXT: [[FLOOR6:%.*]] = call float @llvm.floor.f32(float [[TMP11]]) +; SSE2-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; SSE2-NEXT: [[FLOOR7:%.*]] = call float @llvm.floor.f32(float [[TMP12]]) +; SSE2-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 +; SSE2-NEXT: [[FLOOR8:%.*]] = call float @llvm.floor.f32(float [[TMP13]]) +; SSE2-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 +; SSE2-NEXT: [[FLOOR9:%.*]] = call float @llvm.floor.f32(float [[TMP14]]) +; SSE2-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 +; SSE2-NEXT: [[FLOOR10:%.*]] = call float @llvm.floor.f32(float [[TMP15]]) +; SSE2-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP3]], i32 3 +; SSE2-NEXT: [[FLOOR11:%.*]] = call float @llvm.floor.f32(float [[TMP16]]) +; SSE2-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP4]], i32 0 +; SSE2-NEXT: [[FLOOR12:%.*]] = call float @llvm.floor.f32(float [[TMP17]]) +; SSE2-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP4]], i32 1 +; SSE2-NEXT: [[FLOOR13:%.*]] = call float @llvm.floor.f32(float [[TMP18]]) +; SSE2-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP4]], i32 2 +; SSE2-NEXT: [[FLOOR14:%.*]] = call float @llvm.floor.f32(float [[TMP19]]) +; SSE2-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP4]], i32 3 +; SSE2-NEXT: [[FLOOR15:%.*]] = call float @llvm.floor.f32(float [[TMP20]]) +; SSE2-NEXT: [[TMP21:%.*]] = insertelement <4 x float> poison, float [[FLOOR0]], i32 0 +; SSE2-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[FLOOR1]], i32 1 +; SSE2-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[FLOOR2]], i32 2 +; SSE2-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[FLOOR3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP24]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP25:%.*]] = insertelement <4 x float> poison, float [[FLOOR4]], i32 0 +; SSE2-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[FLOOR5]], i32 1 +; SSE2-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[FLOOR6]], i32 2 +; SSE2-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[TMP27]], float [[FLOOR7]], i32 3 +; SSE2-NEXT: 
store <4 x float> [[TMP28]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP29:%.*]] = insertelement <4 x float> poison, float [[FLOOR8]], i32 0 +; SSE2-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[FLOOR9]], i32 1 +; SSE2-NEXT: [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[FLOOR10]], i32 2 +; SSE2-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[FLOOR11]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP32]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP33:%.*]] = insertelement <4 x float> poison, float [[FLOOR12]], i32 0 +; SSE2-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[FLOOR13]], i32 1 +; SSE2-NEXT: [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[FLOOR14]], i32 2 +; SSE2-NEXT: [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[FLOOR15]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP36]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @floor_16f32( @@ -1392,18 +1490,20 @@ define void @nearbyint_4f32() #0 { ; SSE2-LABEL: @nearbyint_4f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[LD0]]) -; SSE2-NEXT: [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[LD1]]) -; SSE2-NEXT: [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[LD2]]) -; SSE2-NEXT: [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[LD3]]) -; SSE2-NEXT: store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[TMP2]]) +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> poison, float 
[[NEARBYINT0]], i32 0 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[NEARBYINT1]], i32 1 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[NEARBYINT2]], i32 2 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[NEARBYINT3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @nearbyint_4f32( @@ -1435,30 +1535,34 @@ define void @nearbyint_8f32() #0 { ; SSE2-LABEL: @nearbyint_8f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[LD0]]) -; SSE2-NEXT: [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[LD1]]) -; SSE2-NEXT: [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[LD2]]) -; SSE2-NEXT: [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[LD3]]) -; SSE2-NEXT: [[NEARBYINT4:%.*]] = call float @llvm.nearbyint.f32(float [[LD4]]) -; SSE2-NEXT: [[NEARBYINT5:%.*]] = call float @llvm.nearbyint.f32(float [[LD5]]) -; SSE2-NEXT: [[NEARBYINT6:%.*]] = call float @llvm.nearbyint.f32(float [[LD6]]) -; SSE2-NEXT: [[NEARBYINT7:%.*]] = call float @llvm.nearbyint.f32(float [[LD7]]) -; SSE2-NEXT: store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[NEARBYINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[NEARBYINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[NEARBYINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[NEARBYINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, 
i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; SSE2-NEXT: [[NEARBYINT4:%.*]] = call float @llvm.nearbyint.f32(float [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; SSE2-NEXT: [[NEARBYINT5:%.*]] = call float @llvm.nearbyint.f32(float [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; SSE2-NEXT: [[NEARBYINT6:%.*]] = call float @llvm.nearbyint.f32(float [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; SSE2-NEXT: [[NEARBYINT7:%.*]] = call float @llvm.nearbyint.f32(float [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[NEARBYINT0]], i32 0 +; SSE2-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[NEARBYINT1]], i32 1 +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[NEARBYINT2]], i32 2 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[NEARBYINT3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP14]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <4 x float> poison, float [[NEARBYINT4]], i32 0 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[NEARBYINT5]], i32 1 +; SSE2-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[NEARBYINT6]], i32 2 +; SSE2-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[NEARBYINT7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP18]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @nearbyint_8f32( @@ -1505,54 +1609,62 @@ define void @nearbyint_16f32() #0 { ; SSE2-LABEL: @nearbyint_16f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[LD8:%.*]] = load float, float* 
getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4 -; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4 -; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 -; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 -; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 -; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 -; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 -; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 -; SSE2-NEXT: [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[LD0]]) -; SSE2-NEXT: [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[LD1]]) -; SSE2-NEXT: [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[LD2]]) -; SSE2-NEXT: [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[LD3]]) -; SSE2-NEXT: [[NEARBYINT4:%.*]] = call float @llvm.nearbyint.f32(float [[LD4]]) -; SSE2-NEXT: [[NEARBYINT5:%.*]] = call float @llvm.nearbyint.f32(float [[LD5]]) -; SSE2-NEXT: [[NEARBYINT6:%.*]] = call float @llvm.nearbyint.f32(float [[LD6]]) -; SSE2-NEXT: [[NEARBYINT7:%.*]] = call float @llvm.nearbyint.f32(float [[LD7]]) -; SSE2-NEXT: [[NEARBYINT8:%.*]] = call float @llvm.nearbyint.f32(float [[LD8]]) -; SSE2-NEXT: [[NEARBYINT9:%.*]] = call float @llvm.nearbyint.f32(float [[LD9]]) -; SSE2-NEXT: [[NEARBYINT10:%.*]] = call float @llvm.nearbyint.f32(float [[LD10]]) -; SSE2-NEXT: [[NEARBYINT11:%.*]] = call float @llvm.nearbyint.f32(float [[LD11]]) -; SSE2-NEXT: [[NEARBYINT12:%.*]] = call float @llvm.nearbyint.f32(float [[LD12]]) -; SSE2-NEXT: [[NEARBYINT13:%.*]] = call float @llvm.nearbyint.f32(float [[LD13]]) -; SSE2-NEXT: [[NEARBYINT14:%.*]] = call float @llvm.nearbyint.f32(float [[LD14]]) -; SSE2-NEXT: [[NEARBYINT15:%.*]] = call float @llvm.nearbyint.f32(float [[LD15]]) -; SSE2-NEXT: store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[NEARBYINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[NEARBYINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[NEARBYINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[NEARBYINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: store float [[NEARBYINT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; SSE2-NEXT: store float [[NEARBYINT9]], float* 
getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE2-NEXT: store float [[NEARBYINT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; SSE2-NEXT: store float [[NEARBYINT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE2-NEXT: store float [[NEARBYINT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; SSE2-NEXT: store float [[NEARBYINT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE2-NEXT: store float [[NEARBYINT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; SSE2-NEXT: store float [[NEARBYINT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; SSE2-NEXT: [[NEARBYINT4:%.*]] = call float @llvm.nearbyint.f32(float [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; SSE2-NEXT: [[NEARBYINT5:%.*]] = call float @llvm.nearbyint.f32(float [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; SSE2-NEXT: [[NEARBYINT6:%.*]] = call float @llvm.nearbyint.f32(float [[TMP11]]) +; SSE2-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; SSE2-NEXT: [[NEARBYINT7:%.*]] = call float @llvm.nearbyint.f32(float [[TMP12]]) +; SSE2-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 +; SSE2-NEXT: [[NEARBYINT8:%.*]] = call float @llvm.nearbyint.f32(float [[TMP13]]) +; SSE2-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 +; SSE2-NEXT: [[NEARBYINT9:%.*]] = call float @llvm.nearbyint.f32(float [[TMP14]]) +; SSE2-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 +; SSE2-NEXT: [[NEARBYINT10:%.*]] = call float @llvm.nearbyint.f32(float [[TMP15]]) +; SSE2-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP3]], i32 3 +; SSE2-NEXT: [[NEARBYINT11:%.*]] = call float @llvm.nearbyint.f32(float [[TMP16]]) +; SSE2-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP4]], i32 0 +; SSE2-NEXT: [[NEARBYINT12:%.*]] = call float @llvm.nearbyint.f32(float [[TMP17]]) 
+; SSE2-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP4]], i32 1 +; SSE2-NEXT: [[NEARBYINT13:%.*]] = call float @llvm.nearbyint.f32(float [[TMP18]]) +; SSE2-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP4]], i32 2 +; SSE2-NEXT: [[NEARBYINT14:%.*]] = call float @llvm.nearbyint.f32(float [[TMP19]]) +; SSE2-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP4]], i32 3 +; SSE2-NEXT: [[NEARBYINT15:%.*]] = call float @llvm.nearbyint.f32(float [[TMP20]]) +; SSE2-NEXT: [[TMP21:%.*]] = insertelement <4 x float> poison, float [[NEARBYINT0]], i32 0 +; SSE2-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[NEARBYINT1]], i32 1 +; SSE2-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[NEARBYINT2]], i32 2 +; SSE2-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[NEARBYINT3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP24]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP25:%.*]] = insertelement <4 x float> poison, float [[NEARBYINT4]], i32 0 +; SSE2-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[NEARBYINT5]], i32 1 +; SSE2-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[NEARBYINT6]], i32 2 +; SSE2-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[TMP27]], float [[NEARBYINT7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP28]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP29:%.*]] = insertelement <4 x float> poison, float [[NEARBYINT8]], i32 0 +; SSE2-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[NEARBYINT9]], i32 1 +; SSE2-NEXT: [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[NEARBYINT10]], i32 2 +; SSE2-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[NEARBYINT11]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP32]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP33:%.*]] = insertelement <4 x float> poison, float [[NEARBYINT12]], i32 0 +; SSE2-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[NEARBYINT13]], i32 1 +; SSE2-NEXT: [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[NEARBYINT14]], i32 2 +; SSE2-NEXT: [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[NEARBYINT15]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP36]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @nearbyint_16f32( @@ -1647,18 +1759,20 @@ define void @rint_4f32() #0 { ; SSE2-LABEL: @rint_4f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[RINT0:%.*]] = call float @llvm.rint.f32(float [[LD0]]) -; SSE2-NEXT: [[RINT1:%.*]] = call float @llvm.rint.f32(float [[LD1]]) -; SSE2-NEXT: [[RINT2:%.*]] = call float @llvm.rint.f32(float [[LD2]]) -; SSE2-NEXT: [[RINT3:%.*]] = call float @llvm.rint.f32(float [[LD3]]) 
-; SSE2-NEXT: store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[RINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[RINT0:%.*]] = call float @llvm.rint.f32(float [[TMP2]]) +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[RINT1:%.*]] = call float @llvm.rint.f32(float [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[RINT2:%.*]] = call float @llvm.rint.f32(float [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[RINT3:%.*]] = call float @llvm.rint.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> poison, float [[RINT0]], i32 0 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[RINT1]], i32 1 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[RINT2]], i32 2 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[RINT3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @rint_4f32( @@ -1690,30 +1804,34 @@ define void @rint_8f32() #0 { ; SSE2-LABEL: @rint_8f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[RINT0:%.*]] = call float @llvm.rint.f32(float [[LD0]]) -; SSE2-NEXT: [[RINT1:%.*]] = call float @llvm.rint.f32(float [[LD1]]) -; SSE2-NEXT: [[RINT2:%.*]] = call float @llvm.rint.f32(float [[LD2]]) -; SSE2-NEXT: [[RINT3:%.*]] = call float @llvm.rint.f32(float [[LD3]]) -; SSE2-NEXT: [[RINT4:%.*]] = call float @llvm.rint.f32(float [[LD4]]) -; SSE2-NEXT: [[RINT5:%.*]] = call float @llvm.rint.f32(float [[LD5]]) -; SSE2-NEXT: [[RINT6:%.*]] = call float @llvm.rint.f32(float [[LD6]]) -; SSE2-NEXT: [[RINT7:%.*]] = call float @llvm.rint.f32(float [[LD7]]) -; SSE2-NEXT: store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float 
[[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[RINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[RINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[RINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[RINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[RINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[RINT0:%.*]] = call float @llvm.rint.f32(float [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[RINT1:%.*]] = call float @llvm.rint.f32(float [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[RINT2:%.*]] = call float @llvm.rint.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[RINT3:%.*]] = call float @llvm.rint.f32(float [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; SSE2-NEXT: [[RINT4:%.*]] = call float @llvm.rint.f32(float [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; SSE2-NEXT: [[RINT5:%.*]] = call float @llvm.rint.f32(float [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; SSE2-NEXT: [[RINT6:%.*]] = call float @llvm.rint.f32(float [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; SSE2-NEXT: [[RINT7:%.*]] = call float @llvm.rint.f32(float [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[RINT0]], i32 0 +; SSE2-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[RINT1]], i32 1 +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[RINT2]], i32 2 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[RINT3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP14]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <4 x float> poison, float [[RINT4]], i32 0 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[RINT5]], i32 1 +; SSE2-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[RINT6]], i32 2 +; SSE2-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[RINT7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP18]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @rint_8f32( @@ -1760,54 +1878,62 @@ define void @rint_16f32() #0 { ; SSE2-LABEL: @rint_16f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, 
i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4 -; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4 -; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 -; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 -; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 -; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 -; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 -; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 -; SSE2-NEXT: [[RINT0:%.*]] = call float @llvm.rint.f32(float [[LD0]]) -; SSE2-NEXT: [[RINT1:%.*]] = call float @llvm.rint.f32(float [[LD1]]) -; SSE2-NEXT: [[RINT2:%.*]] = call float @llvm.rint.f32(float [[LD2]]) -; SSE2-NEXT: [[RINT3:%.*]] = call float @llvm.rint.f32(float [[LD3]]) -; SSE2-NEXT: [[RINT4:%.*]] = call float @llvm.rint.f32(float [[LD4]]) -; SSE2-NEXT: [[RINT5:%.*]] = call float @llvm.rint.f32(float [[LD5]]) -; SSE2-NEXT: [[RINT6:%.*]] = call float @llvm.rint.f32(float [[LD6]]) -; SSE2-NEXT: [[RINT7:%.*]] = call float @llvm.rint.f32(float [[LD7]]) -; SSE2-NEXT: [[RINT8:%.*]] = call float @llvm.rint.f32(float [[LD8]]) -; SSE2-NEXT: [[RINT9:%.*]] = call float @llvm.rint.f32(float [[LD9]]) -; SSE2-NEXT: [[RINT10:%.*]] = call float @llvm.rint.f32(float [[LD10]]) -; SSE2-NEXT: [[RINT11:%.*]] = call float @llvm.rint.f32(float [[LD11]]) -; SSE2-NEXT: [[RINT12:%.*]] = call float @llvm.rint.f32(float [[LD12]]) -; SSE2-NEXT: [[RINT13:%.*]] = call float @llvm.rint.f32(float [[LD13]]) -; SSE2-NEXT: [[RINT14:%.*]] = call float @llvm.rint.f32(float [[LD14]]) -; SSE2-NEXT: [[RINT15:%.*]] = call float @llvm.rint.f32(float [[LD15]]) -; SSE2-NEXT: store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[RINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[RINT3]], float* 
getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[RINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[RINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[RINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[RINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: store float [[RINT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; SSE2-NEXT: store float [[RINT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE2-NEXT: store float [[RINT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; SSE2-NEXT: store float [[RINT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE2-NEXT: store float [[RINT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; SSE2-NEXT: store float [[RINT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE2-NEXT: store float [[RINT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; SSE2-NEXT: store float [[RINT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[RINT0:%.*]] = call float @llvm.rint.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[RINT1:%.*]] = call float @llvm.rint.f32(float [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[RINT2:%.*]] = call float @llvm.rint.f32(float [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[RINT3:%.*]] = call float @llvm.rint.f32(float [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; SSE2-NEXT: [[RINT4:%.*]] = call float @llvm.rint.f32(float [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; SSE2-NEXT: [[RINT5:%.*]] = call float @llvm.rint.f32(float [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; SSE2-NEXT: [[RINT6:%.*]] = call float @llvm.rint.f32(float [[TMP11]]) +; SSE2-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; SSE2-NEXT: [[RINT7:%.*]] = call float @llvm.rint.f32(float [[TMP12]]) +; SSE2-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 +; SSE2-NEXT: [[RINT8:%.*]] = call float 
@llvm.rint.f32(float [[TMP13]]) +; SSE2-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 +; SSE2-NEXT: [[RINT9:%.*]] = call float @llvm.rint.f32(float [[TMP14]]) +; SSE2-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 +; SSE2-NEXT: [[RINT10:%.*]] = call float @llvm.rint.f32(float [[TMP15]]) +; SSE2-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP3]], i32 3 +; SSE2-NEXT: [[RINT11:%.*]] = call float @llvm.rint.f32(float [[TMP16]]) +; SSE2-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP4]], i32 0 +; SSE2-NEXT: [[RINT12:%.*]] = call float @llvm.rint.f32(float [[TMP17]]) +; SSE2-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP4]], i32 1 +; SSE2-NEXT: [[RINT13:%.*]] = call float @llvm.rint.f32(float [[TMP18]]) +; SSE2-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP4]], i32 2 +; SSE2-NEXT: [[RINT14:%.*]] = call float @llvm.rint.f32(float [[TMP19]]) +; SSE2-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP4]], i32 3 +; SSE2-NEXT: [[RINT15:%.*]] = call float @llvm.rint.f32(float [[TMP20]]) +; SSE2-NEXT: [[TMP21:%.*]] = insertelement <4 x float> poison, float [[RINT0]], i32 0 +; SSE2-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[RINT1]], i32 1 +; SSE2-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[RINT2]], i32 2 +; SSE2-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[RINT3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP24]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP25:%.*]] = insertelement <4 x float> poison, float [[RINT4]], i32 0 +; SSE2-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[RINT5]], i32 1 +; SSE2-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[RINT6]], i32 2 +; SSE2-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[TMP27]], float [[RINT7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP28]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP29:%.*]] = insertelement <4 x float> poison, float [[RINT8]], i32 0 +; SSE2-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[RINT9]], i32 1 +; SSE2-NEXT: [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[RINT10]], i32 2 +; SSE2-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[RINT11]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP32]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP33:%.*]] = insertelement <4 x float> poison, float [[RINT12]], i32 0 +; SSE2-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[RINT13]], i32 1 +; SSE2-NEXT: [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[RINT14]], i32 2 +; SSE2-NEXT: [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[RINT15]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP36]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @rint_16f32( @@ -1902,18 +2028,20 @@ define void @trunc_4f32() #0 { ; SSE2-LABEL: @trunc_4f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load 
float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[LD0]]) -; SSE2-NEXT: [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[LD1]]) -; SSE2-NEXT: [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[LD2]]) -; SSE2-NEXT: [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[LD3]]) -; SSE2-NEXT: store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[TMP2]]) +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> poison, float [[TRUNC0]], i32 0 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TRUNC1]], i32 1 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[TRUNC2]], i32 2 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TRUNC3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @trunc_4f32( @@ -1945,30 +2073,34 @@ define void @trunc_8f32() #0 { ; SSE2-LABEL: @trunc_8f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[LD0]]) -; SSE2-NEXT: [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[LD1]]) -; SSE2-NEXT: [[TRUNC2:%.*]] = call 
float @llvm.trunc.f32(float [[LD2]]) -; SSE2-NEXT: [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[LD3]]) -; SSE2-NEXT: [[TRUNC4:%.*]] = call float @llvm.trunc.f32(float [[LD4]]) -; SSE2-NEXT: [[TRUNC5:%.*]] = call float @llvm.trunc.f32(float [[LD5]]) -; SSE2-NEXT: [[TRUNC6:%.*]] = call float @llvm.trunc.f32(float [[LD6]]) -; SSE2-NEXT: [[TRUNC7:%.*]] = call float @llvm.trunc.f32(float [[LD7]]) -; SSE2-NEXT: store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[TRUNC4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[TRUNC5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[TRUNC6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[TRUNC7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; SSE2-NEXT: [[TRUNC4:%.*]] = call float @llvm.trunc.f32(float [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; SSE2-NEXT: [[TRUNC5:%.*]] = call float @llvm.trunc.f32(float [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; SSE2-NEXT: [[TRUNC6:%.*]] = call float @llvm.trunc.f32(float [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; SSE2-NEXT: [[TRUNC7:%.*]] = call float @llvm.trunc.f32(float [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[TRUNC0]], i32 0 +; SSE2-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[TRUNC1]], i32 1 +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[TRUNC2]], i32 2 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[TRUNC3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP14]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <4 x float> poison, float [[TRUNC4]], i32 0 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TRUNC5]], i32 1 +; SSE2-NEXT: 
[[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[TRUNC6]], i32 2 +; SSE2-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[TRUNC7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP18]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @trunc_8f32( @@ -2015,54 +2147,62 @@ define void @trunc_16f32() #0 { ; SSE2-LABEL: @trunc_16f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4 -; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4 -; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 -; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 -; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 -; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 -; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 -; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 -; SSE2-NEXT: [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[LD0]]) -; SSE2-NEXT: [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[LD1]]) -; SSE2-NEXT: [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[LD2]]) -; SSE2-NEXT: [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[LD3]]) -; SSE2-NEXT: [[TRUNC4:%.*]] = call float @llvm.trunc.f32(float [[LD4]]) -; SSE2-NEXT: [[TRUNC5:%.*]] = call float @llvm.trunc.f32(float [[LD5]]) -; SSE2-NEXT: [[TRUNC6:%.*]] = call float @llvm.trunc.f32(float [[LD6]]) -; SSE2-NEXT: [[TRUNC7:%.*]] = call float @llvm.trunc.f32(float [[LD7]]) -; SSE2-NEXT: [[TRUNC8:%.*]] = call float @llvm.trunc.f32(float [[LD8]]) -; SSE2-NEXT: [[TRUNC9:%.*]] = call float @llvm.trunc.f32(float [[LD9]]) -; SSE2-NEXT: [[TRUNC10:%.*]] = call float @llvm.trunc.f32(float [[LD10]]) -; SSE2-NEXT: [[TRUNC11:%.*]] = call float @llvm.trunc.f32(float [[LD11]]) -; SSE2-NEXT: [[TRUNC12:%.*]] = call float @llvm.trunc.f32(float [[LD12]]) -; SSE2-NEXT: [[TRUNC13:%.*]] = 
call float @llvm.trunc.f32(float [[LD13]]) -; SSE2-NEXT: [[TRUNC14:%.*]] = call float @llvm.trunc.f32(float [[LD14]]) -; SSE2-NEXT: [[TRUNC15:%.*]] = call float @llvm.trunc.f32(float [[LD15]]) -; SSE2-NEXT: store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[TRUNC4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[TRUNC5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[TRUNC6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[TRUNC7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: store float [[TRUNC8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; SSE2-NEXT: store float [[TRUNC9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE2-NEXT: store float [[TRUNC10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; SSE2-NEXT: store float [[TRUNC11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE2-NEXT: store float [[TRUNC12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; SSE2-NEXT: store float [[TRUNC13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE2-NEXT: store float [[TRUNC14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; SSE2-NEXT: store float [[TRUNC15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 
+; SSE2-NEXT: [[TRUNC4:%.*]] = call float @llvm.trunc.f32(float [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; SSE2-NEXT: [[TRUNC5:%.*]] = call float @llvm.trunc.f32(float [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; SSE2-NEXT: [[TRUNC6:%.*]] = call float @llvm.trunc.f32(float [[TMP11]]) +; SSE2-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; SSE2-NEXT: [[TRUNC7:%.*]] = call float @llvm.trunc.f32(float [[TMP12]]) +; SSE2-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 +; SSE2-NEXT: [[TRUNC8:%.*]] = call float @llvm.trunc.f32(float [[TMP13]]) +; SSE2-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 +; SSE2-NEXT: [[TRUNC9:%.*]] = call float @llvm.trunc.f32(float [[TMP14]]) +; SSE2-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 +; SSE2-NEXT: [[TRUNC10:%.*]] = call float @llvm.trunc.f32(float [[TMP15]]) +; SSE2-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP3]], i32 3 +; SSE2-NEXT: [[TRUNC11:%.*]] = call float @llvm.trunc.f32(float [[TMP16]]) +; SSE2-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP4]], i32 0 +; SSE2-NEXT: [[TRUNC12:%.*]] = call float @llvm.trunc.f32(float [[TMP17]]) +; SSE2-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP4]], i32 1 +; SSE2-NEXT: [[TRUNC13:%.*]] = call float @llvm.trunc.f32(float [[TMP18]]) +; SSE2-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP4]], i32 2 +; SSE2-NEXT: [[TRUNC14:%.*]] = call float @llvm.trunc.f32(float [[TMP19]]) +; SSE2-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP4]], i32 3 +; SSE2-NEXT: [[TRUNC15:%.*]] = call float @llvm.trunc.f32(float [[TMP20]]) +; SSE2-NEXT: [[TMP21:%.*]] = insertelement <4 x float> poison, float [[TRUNC0]], i32 0 +; SSE2-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TRUNC1]], i32 1 +; SSE2-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TRUNC2]], i32 2 +; SSE2-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TRUNC3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP24]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP25:%.*]] = insertelement <4 x float> poison, float [[TRUNC4]], i32 0 +; SSE2-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TRUNC5]], i32 1 +; SSE2-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[TRUNC6]], i32 2 +; SSE2-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[TMP27]], float [[TRUNC7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP28]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP29:%.*]] = insertelement <4 x float> poison, float [[TRUNC8]], i32 0 +; SSE2-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[TRUNC9]], i32 1 +; SSE2-NEXT: [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[TRUNC10]], i32 2 +; SSE2-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[TRUNC11]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP32]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP33:%.*]] = insertelement <4 x float> poison, float [[TRUNC12]], i32 0 +; SSE2-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[TRUNC13]], i32 1 +; SSE2-NEXT: [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[TRUNC14]], i32 2 +; SSE2-NEXT: [[TMP36:%.*]] = 
insertelement <4 x float> [[TMP35]], float [[TRUNC15]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP36]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @trunc_16f32( Index: llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll +++ llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll @@ -112,12 +112,15 @@ ; CHECK-NEXT: [[T3:%.*]] = bitcast float* [[T2]] to i64* ; CHECK-NEXT: [[T4:%.*]] = load i64, i64* [[T3]], align 8 ; CHECK-NEXT: [[T5:%.*]] = trunc i64 [[T1]] to i32 -; CHECK-NEXT: [[T6:%.*]] = bitcast i32 [[T5]] to float -; CHECK-NEXT: [[T7:%.*]] = insertelement <4 x float> poison, float [[T6]], i32 0 ; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32 ; CHECK-NEXT: [[T9:%.*]] = trunc i64 [[T8]] to i32 -; CHECK-NEXT: [[T10:%.*]] = bitcast i32 [[T9]] to float -; CHECK-NEXT: [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T5]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T9]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <2 x float> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; CHECK-NEXT: [[T7:%.*]] = insertelement <4 x float> poison, float [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; CHECK-NEXT: [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[TMP5]], i32 1 ; CHECK-NEXT: [[T12:%.*]] = trunc i64 [[T4]] to i32 ; CHECK-NEXT: [[T13:%.*]] = bitcast i32 [[T12]] to float ; CHECK-NEXT: [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2 Index: llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll +++ llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll @@ -112,12 +112,15 @@ ; CHECK-NEXT: [[T3:%.*]] = bitcast float* [[T2]] to i64* ; CHECK-NEXT: [[T4:%.*]] = load i64, i64* [[T3]], align 8 ; CHECK-NEXT: [[T5:%.*]] = trunc i64 [[T1]] to i32 -; CHECK-NEXT: [[T6:%.*]] = bitcast i32 [[T5]] to float -; CHECK-NEXT: [[T7:%.*]] = insertelement <4 x float> undef, float [[T6]], i32 0 ; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32 ; CHECK-NEXT: [[T9:%.*]] = trunc i64 [[T8]] to i32 -; CHECK-NEXT: [[T10:%.*]] = bitcast i32 [[T9]] to float -; CHECK-NEXT: [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T5]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T9]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <2 x float> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; CHECK-NEXT: [[T7:%.*]] = insertelement <4 x float> undef, float [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; CHECK-NEXT: [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[TMP5]], i32 1 ; CHECK-NEXT: [[T12:%.*]] = trunc i64 [[T4]] to i32 ; CHECK-NEXT: [[T13:%.*]] = bitcast i32 [[T12]] to float ; CHECK-NEXT: [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2 Index: llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll =================================================================== 
--- llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll +++ llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+sse2 -S | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx2 -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx2 -S | FileCheck %s --check-prefixes=CHECK,AVX2 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -58,12 +58,18 @@ define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, i8* %ptr) { ; SSE-LABEL: @PR31243_sext( ; SSE-NEXT: entry: -; SSE-NEXT: [[TMP0:%.*]] = or i8 [[V0:%.*]], 1 -; SSE-NEXT: [[TMP1:%.*]] = or i8 [[V1:%.*]], 1 -; SSE-NEXT: [[TMP2:%.*]] = sext i8 [[TMP0]] to i64 -; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP2]] -; SSE-NEXT: [[TMP3:%.*]] = sext i8 [[TMP1]] to i64 -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP3]] +; SSE-NEXT: [[TMP0:%.*]] = sext i8 [[V0:%.*]] to i32 +; SSE-NEXT: [[TMP1:%.*]] = sext i8 [[V1:%.*]] to i32 +; SSE-NEXT: [[TMP_0:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 +; SSE-NEXT: [[TMP_1:%.*]] = insertelement <2 x i32> [[TMP_0]], i32 [[TMP1]], i32 1 +; SSE-NEXT: [[TMP2:%.*]] = trunc <2 x i32> [[TMP_1]] to <2 x i16> +; SSE-NEXT: [[TMP3:%.*]] = or <2 x i16> [[TMP2]], +; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = sext i16 [[TMP4]] to i64 +; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP5]] +; SSE-NEXT: [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP7:%.*]] = sext i16 [[TMP6]] to i64 +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP7]] ; SSE-NEXT: [[TMP6:%.*]] = load i8, i8* [[TMP4]], align 1 ; SSE-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1 ; SSE-NEXT: [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]] @@ -86,6 +92,23 @@ ; AVX-NEXT: [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]] ; AVX-NEXT: ret i8 [[TMP8]] ; +; AVX2-LABEL: @PR31243_sext( +; AVX2-NEXT: entry: +; AVX2-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i32 0 +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1 +; AVX2-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], +; AVX2-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16> +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i32 0 +; AVX2-NEXT: [[TMP5:%.*]] = sext i16 [[TMP4]] to i64 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP5]] +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i32 1 +; AVX2-NEXT: [[TMP7:%.*]] = sext i16 [[TMP6]] to i64 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP7]] +; AVX2-NEXT: [[TMP6:%.*]] = load i8, i8* [[TMP4]], align 1 +; AVX2-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1 +; AVX2-NEXT: [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]] +; AVX2-NEXT: ret i8 [[TMP8]] +; entry: %tmp0 = sext i8 %v0 to i32 %tmp1 = sext i8 %v1 to i32 Index: llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll =================================================================== --- 
llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll +++ llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll @@ -60,35 +60,34 @@ define void @powof2div_nonuniform(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c){ ; AVX1-LABEL: @powof2div_nonuniform( ; AVX1-NEXT: entry: -; AVX1-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4 -; AVX1-NEXT: [[TMP1:%.*]] = load i32, i32* [[C:%.*]], align 4 -; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] -; AVX1-NEXT: [[DIV:%.*]] = sdiv i32 [[ADD]], 2 -; AVX1-NEXT: store i32 [[DIV]], i32* [[A:%.*]], align 4 -; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1 -; AVX1-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 -; AVX1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 1 -; AVX1-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4 -; AVX1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP3]], [[TMP2]] -; AVX1-NEXT: [[DIV6:%.*]] = sdiv i32 [[ADD5]], 4 -; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1 -; AVX1-NEXT: store i32 [[DIV6]], i32* [[ARRAYIDX7]], align 4 +; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 +; AVX1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 1 ; AVX1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 -; AVX1-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX8]], align 4 ; AVX1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2 -; AVX1-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4 -; AVX1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP5]], [[TMP4]] -; AVX1-NEXT: [[DIV11:%.*]] = sdiv i32 [[ADD10]], 8 -; AVX1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 -; AVX1-NEXT: store i32 [[DIV11]], i32* [[ARRAYIDX12]], align 4 ; AVX1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 -; AVX1-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4 +; AVX1-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>* +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 ; AVX1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 -; AVX1-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX14]], align 4 -; AVX1-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP7]], [[TMP6]] -; AVX1-NEXT: [[DIV16:%.*]] = sdiv i32 [[ADD15]], 16 +; AVX1-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>* +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 +; AVX1-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]] +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 +; AVX1-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP5]], 2 +; AVX1-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1 +; AVX1-NEXT: [[DIV6:%.*]] = sdiv i32 [[TMP6]], 4 +; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1 +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 +; AVX1-NEXT: [[DIV11:%.*]] = sdiv i32 [[TMP7]], 8 +; AVX1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 +; AVX1-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; AVX1-NEXT: [[DIV16:%.*]] = sdiv i32 [[TMP8]], 16 ; AVX1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 -; AVX1-NEXT: store i32 [[DIV16]], i32* [[ARRAYIDX17]], align 4 +; AVX1-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> poison, i32 [[DIV]], i32 0 +; AVX1-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> 
[[TMP9]], i32 [[DIV6]], i32 1 +; AVX1-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[DIV11]], i32 2 +; AVX1-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[DIV16]], i32 3 +; AVX1-NEXT: [[TMP13:%.*]] = bitcast i32* [[A]] to <4 x i32>* +; AVX1-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @powof2div_nonuniform( Index: llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll +++ llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -70,31 +70,28 @@ ; SSE-NEXT: entry: ; SSE-NEXT: [[TMP0:%.*]] = load i64, i64* undef, align 1 ; SSE-NEXT: [[AND:%.*]] = shl i64 [[TMP0]], 2 -; SSE-NEXT: [[SHL:%.*]] = and i64 [[AND]], 20 ; SSE-NEXT: [[ADD:%.*]] = add i64 undef, undef ; SSE-NEXT: store i64 [[ADD]], i64* undef, align 1 ; SSE-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5 ; SSE-NEXT: [[AND_1:%.*]] = shl i64 undef, 2 -; SSE-NEXT: [[SHL_1:%.*]] = and i64 [[AND_1]], 20 -; SSE-NEXT: [[SHR_1:%.*]] = lshr i64 undef, 6 -; SSE-NEXT: [[ADD_1:%.*]] = add nuw nsw i64 [[SHL]], [[SHR_1]] +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[AND_1]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[AND]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], ; SSE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4 -; SSE-NEXT: [[SHR_2:%.*]] = lshr i64 undef, 6 -; SSE-NEXT: [[ADD_2:%.*]] = add nuw nsw i64 [[SHL_1]], [[SHR_2]] -; SSE-NEXT: [[AND_4:%.*]] = shl i64 [[ADD]], 2 -; SSE-NEXT: [[SHL_4:%.*]] = and i64 [[AND_4]], 20 +; SSE-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer ; SSE-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1 -; SSE-NEXT: store i64 [[ADD_1]], i64* [[ARRAYIDX2_5]], align 1 -; SSE-NEXT: [[AND_5:%.*]] = shl nuw nsw i64 [[ADD_1]], 2 -; SSE-NEXT: [[SHL_5:%.*]] = and i64 [[AND_5]], 20 -; SSE-NEXT: [[SHR_5:%.*]] = lshr i64 [[ADD_1]], 6 -; SSE-NEXT: [[ADD_5:%.*]] = add nuw nsw i64 [[SHL_4]], [[SHR_5]] -; SSE-NEXT: store i64 [[ADD_5]], i64* [[ARRAYIDX2_1]], align 1 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1 +; SSE-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], +; SSE-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], ; SSE-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0 -; SSE-NEXT: store i64 [[ADD_2]], i64* [[ARRAYIDX2_6]], align 1 -; SSE-NEXT: [[SHR_6:%.*]] = lshr i64 [[ADD_2]], 6 -; SSE-NEXT: [[ADD_6:%.*]] = add nuw nsw i64 [[SHL_5]], [[SHR_6]] -; SSE-NEXT: store i64 [[ADD_6]], i64* [[ARRAYIDX2_2]], align 1 +; SSE-NEXT: [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>* +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1 +; SSE-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], +; SSE-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP11]] +; SSE-NEXT: [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>* +; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* [[TMP13]], align 1 ; SSE-NEXT: ret void ; ; AVX-LABEL: @pr35497( Index: llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll =================================================================== --- 
llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll +++ llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2 +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE42 ; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX1 ; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX2 +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512 define void @store_i32(i32* nocapture %0, i32 %1, i32 %2) { ; CHECK-LABEL: @store_i32( ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, [[TBAA0:!tbaa !.*]] +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !tbaa [[TBAA0:![0-9]+]] ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1:%.*]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i32> [[TMP5]], [[TMP7]] @@ -16,7 +16,7 @@ ; CHECK-NEXT: [[TMP10:%.*]] = icmp ult <4 x i32> [[TMP9]], ; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP9]], <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4, [[TBAA0]] +; CHECK-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] ; CHECK-NEXT: ret void ; %4 = load i32, i32* %0, align 4, !tbaa !2 @@ -52,7 +52,7 @@ define void @store_i8(i8* nocapture %0, i32 %1, i32 %2) { ; CHECK-LABEL: @store_i8( ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP0:%.*]] to <4 x i8>* -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1, [[TBAA4:!tbaa !.*]] +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1, !tbaa [[TBAA4:![0-9]+]] ; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1:%.*]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <4 x i32> zeroinitializer @@ -62,7 +62,7 @@ ; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP10]], <4 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = trunc <4 x i32> [[TMP12]] to <4 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>* -; CHECK-NEXT: store <4 x i8> [[TMP13]], <4 x i8>* [[TMP14]], align 1, [[TBAA4]] +; CHECK-NEXT: store <4 x i8> [[TMP13]], <4 x i8>* [[TMP14]], align 1, !tbaa [[TBAA4]] ; CHECK-NEXT: ret void ; %4 = load i8, i8* %0, align 1, !tbaa !6 @@ -104,88 +104,109 @@ } define void @store_i64(i64* 
nocapture %0, i32 %1, i32 %2) { -; SSE-LABEL: @store_i64( -; SSE-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 -; SSE-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP0:%.*]], align 8, [[TBAA5:!tbaa !.*]] -; SSE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = lshr i64 [[TMP6]], 15 -; SSE-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 -; SSE-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255 -; SSE-NEXT: [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295 -; SSE-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255 -; SSE-NEXT: store i64 [[TMP11]], i64* [[TMP0]], align 8, [[TBAA5]] -; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 1 -; SSE-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8, [[TBAA5]] -; SSE-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]] -; SSE-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 15 -; SSE-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 -; SSE-NEXT: [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255 -; SSE-NEXT: [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295 -; SSE-NEXT: [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255 -; SSE-NEXT: store i64 [[TMP19]], i64* [[TMP12]], align 8, [[TBAA5]] -; SSE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 2 -; SSE-NEXT: [[TMP21:%.*]] = load i64, i64* [[TMP20]], align 8, [[TBAA5]] -; SSE-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]] -; SSE-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 15 -; SSE-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 -; SSE-NEXT: [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255 -; SSE-NEXT: [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295 -; SSE-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255 -; SSE-NEXT: store i64 [[TMP27]], i64* [[TMP20]], align 8, [[TBAA5]] -; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 3 -; SSE-NEXT: [[TMP29:%.*]] = load i64, i64* [[TMP28]], align 8, [[TBAA5]] -; SSE-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]] -; SSE-NEXT: [[TMP31:%.*]] = lshr i64 [[TMP30]], 15 -; SSE-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32 -; SSE-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255 -; SSE-NEXT: [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295 -; SSE-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255 -; SSE-NEXT: store i64 [[TMP35]], i64* [[TMP28]], align 8, [[TBAA5]] -; SSE-NEXT: ret void +; SSE2-LABEL: @store_i64( +; SSE2-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 +; SSE2-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]] +; SSE2-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]] +; SSE2-NEXT: [[TMP7:%.*]] = lshr i64 [[TMP6]], 15 +; SSE2-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 +; SSE2-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255 +; SSE2-NEXT: [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295 +; SSE2-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255 +; SSE2-NEXT: store i64 [[TMP11]], i64* [[TMP0]], align 8, !tbaa [[TBAA5]] +; SSE2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 1 +; SSE2-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8, !tbaa [[TBAA5]] +; SSE2-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]] +; SSE2-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 15 +; SSE2-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 +; SSE2-NEXT: [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255 +; SSE2-NEXT: [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295 +; SSE2-NEXT: [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255 +; SSE2-NEXT: store i64 [[TMP19]], i64* [[TMP12]], align 8, !tbaa 
[[TBAA5]] +; SSE2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 2 +; SSE2-NEXT: [[TMP21:%.*]] = load i64, i64* [[TMP20]], align 8, !tbaa [[TBAA5]] +; SSE2-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]] +; SSE2-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 15 +; SSE2-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 +; SSE2-NEXT: [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255 +; SSE2-NEXT: [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295 +; SSE2-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255 +; SSE2-NEXT: store i64 [[TMP27]], i64* [[TMP20]], align 8, !tbaa [[TBAA5]] +; SSE2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 3 +; SSE2-NEXT: [[TMP29:%.*]] = load i64, i64* [[TMP28]], align 8, !tbaa [[TBAA5]] +; SSE2-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]] +; SSE2-NEXT: [[TMP31:%.*]] = lshr i64 [[TMP30]], 15 +; SSE2-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32 +; SSE2-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255 +; SSE2-NEXT: [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295 +; SSE2-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255 +; SSE2-NEXT: store i64 [[TMP35]], i64* [[TMP28]], align 8, !tbaa [[TBAA5]] +; SSE2-NEXT: ret void +; +; SSE42-LABEL: @store_i64( +; SSE42-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 +; SSE42-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP0:%.*]] to <2 x i64>* +; SSE42-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 8, !tbaa [[TBAA5:![0-9]+]] +; SSE42-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; SSE42-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], [[TMP4]] +; SSE42-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; SSE42-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], [[TMP4]] +; SSE42-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 +; SSE42-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1 +; SSE42-NEXT: [[TMP13:%.*]] = lshr <2 x i64> [[TMP12]], +; SSE42-NEXT: [[TMP14:%.*]] = trunc <2 x i64> [[TMP13]] to <2 x i32> +; SSE42-NEXT: [[TMP15:%.*]] = icmp ult <2 x i32> [[TMP14]], +; SSE42-NEXT: [[TMP16:%.*]] = and <2 x i64> [[TMP13]], +; SSE42-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP15]], <2 x i64> [[TMP16]], <2 x i64> +; SSE42-NEXT: [[TMP18:%.*]] = bitcast i64* [[TMP0]] to <2 x i64>* +; SSE42-NEXT: store <2 x i64> [[TMP17]], <2 x i64>* [[TMP18]], align 8, !tbaa [[TBAA5]] +; SSE42-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 2 +; SSE42-NEXT: [[TMP20:%.*]] = bitcast i64* [[TMP19]] to <2 x i64>* +; SSE42-NEXT: [[TMP21:%.*]] = load <2 x i64>, <2 x i64>* [[TMP20]], align 8, !tbaa [[TBAA5]] +; SSE42-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP21]], i32 0 +; SSE42-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], [[TMP4]] +; SSE42-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP21]], i32 1 +; SSE42-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], [[TMP4]] +; SSE42-NEXT: [[TMP26:%.*]] = insertelement <2 x i64> poison, i64 [[TMP23]], i32 0 +; SSE42-NEXT: [[TMP27:%.*]] = insertelement <2 x i64> [[TMP26]], i64 [[TMP25]], i32 1 +; SSE42-NEXT: [[TMP28:%.*]] = lshr <2 x i64> [[TMP27]], +; SSE42-NEXT: [[TMP29:%.*]] = trunc <2 x i64> [[TMP28]] to <2 x i32> +; SSE42-NEXT: [[TMP30:%.*]] = icmp ult <2 x i32> [[TMP29]], +; SSE42-NEXT: [[TMP31:%.*]] = and <2 x i64> [[TMP28]], +; SSE42-NEXT: [[TMP32:%.*]] = select <2 x i1> [[TMP30]], <2 x i64> [[TMP31]], <2 x i64> +; SSE42-NEXT: [[TMP33:%.*]] = bitcast i64* [[TMP19]] to <2 x i64>* +; SSE42-NEXT: store <2 x i64> [[TMP32]], <2 x i64>* [[TMP33]], align 8, 
!tbaa [[TBAA5]] +; SSE42-NEXT: ret void ; ; AVX1-LABEL: @store_i64( ; AVX1-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 -; AVX1-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP0:%.*]], align 8, [[TBAA5:!tbaa !.*]] -; AVX1-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]] -; AVX1-NEXT: [[TMP7:%.*]] = lshr i64 [[TMP6]], 15 -; AVX1-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 -; AVX1-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255 -; AVX1-NEXT: [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295 -; AVX1-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255 -; AVX1-NEXT: store i64 [[TMP11]], i64* [[TMP0]], align 8, [[TBAA5]] -; AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 1 -; AVX1-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8, [[TBAA5]] -; AVX1-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]] -; AVX1-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 15 -; AVX1-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 -; AVX1-NEXT: [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255 -; AVX1-NEXT: [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295 -; AVX1-NEXT: [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255 -; AVX1-NEXT: store i64 [[TMP19]], i64* [[TMP12]], align 8, [[TBAA5]] -; AVX1-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 2 -; AVX1-NEXT: [[TMP21:%.*]] = load i64, i64* [[TMP20]], align 8, [[TBAA5]] -; AVX1-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]] -; AVX1-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 15 -; AVX1-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 -; AVX1-NEXT: [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255 -; AVX1-NEXT: [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295 -; AVX1-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255 -; AVX1-NEXT: store i64 [[TMP27]], i64* [[TMP20]], align 8, [[TBAA5]] -; AVX1-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 3 -; AVX1-NEXT: [[TMP29:%.*]] = load i64, i64* [[TMP28]], align 8, [[TBAA5]] -; AVX1-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]] -; AVX1-NEXT: [[TMP31:%.*]] = lshr i64 [[TMP30]], 15 -; AVX1-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32 -; AVX1-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255 -; AVX1-NEXT: [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295 -; AVX1-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255 -; AVX1-NEXT: store i64 [[TMP35]], i64* [[TMP28]], align 8, [[TBAA5]] +; AVX1-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP0:%.*]] to <4 x i64>* +; AVX1-NEXT: [[TMP6:%.*]] = load <4 x i64>, <4 x i64>* [[TMP5]], align 8, !tbaa [[TBAA5:![0-9]+]] +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0 +; AVX1-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], [[TMP4]] +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 +; AVX1-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], [[TMP4]] +; AVX1-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> undef, <2 x i32> +; AVX1-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> undef, <2 x i32> zeroinitializer +; AVX1-NEXT: [[TMP14:%.*]] = mul <2 x i64> [[TMP11]], [[TMP13]] +; AVX1-NEXT: [[TMP15:%.*]] = shufflevector <2 x i64> [[TMP14]], <2 x i64> undef, <4 x i32> +; AVX1-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP8]], i32 0 +; AVX1-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP10]], i32 1 +; AVX1-NEXT: [[TMP18:%.*]] = shufflevector <4 x i64> [[TMP17]], <4 x i64> [[TMP15]], <4 x i32> +; AVX1-NEXT: [[TMP19:%.*]] = lshr <4 x i64> 
[[TMP18]], +; AVX1-NEXT: [[TMP20:%.*]] = trunc <4 x i64> [[TMP19]] to <4 x i32> +; AVX1-NEXT: [[TMP21:%.*]] = icmp ult <4 x i32> [[TMP20]], +; AVX1-NEXT: [[TMP22:%.*]] = and <4 x i64> [[TMP19]], +; AVX1-NEXT: [[TMP23:%.*]] = select <4 x i1> [[TMP21]], <4 x i64> [[TMP22]], <4 x i64> +; AVX1-NEXT: [[TMP24:%.*]] = bitcast i64* [[TMP0]] to <4 x i64>* +; AVX1-NEXT: store <4 x i64> [[TMP23]], <4 x i64>* [[TMP24]], align 8, !tbaa [[TBAA5]] ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @store_i64( ; AVX2-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 ; AVX2-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP0:%.*]] to <4 x i64>* -; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i64>, <4 x i64>* [[TMP5]], align 8, [[TBAA5:!tbaa !.*]] +; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i64>, <4 x i64>* [[TMP5]], align 8, !tbaa [[TBAA5:![0-9]+]] ; AVX2-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 ; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> undef, <4 x i32> zeroinitializer ; AVX2-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[TMP6]], [[TMP8]] @@ -195,8 +216,24 @@ ; AVX2-NEXT: [[TMP13:%.*]] = and <4 x i64> [[TMP10]], ; AVX2-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP12]], <4 x i64> [[TMP13]], <4 x i64> ; AVX2-NEXT: [[TMP15:%.*]] = bitcast i64* [[TMP0]] to <4 x i64>* -; AVX2-NEXT: store <4 x i64> [[TMP14]], <4 x i64>* [[TMP15]], align 8, [[TBAA5]] +; AVX2-NEXT: store <4 x i64> [[TMP14]], <4 x i64>* [[TMP15]], align 8, !tbaa [[TBAA5]] ; AVX2-NEXT: ret void +; +; AVX512-LABEL: @store_i64( +; AVX512-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 +; AVX512-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP0:%.*]] to <4 x i64>* +; AVX512-NEXT: [[TMP6:%.*]] = load <4 x i64>, <4 x i64>* [[TMP5]], align 8, !tbaa [[TBAA5:![0-9]+]] +; AVX512-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; AVX512-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> undef, <4 x i32> zeroinitializer +; AVX512-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[TMP6]], [[TMP8]] +; AVX512-NEXT: [[TMP10:%.*]] = lshr <4 x i64> [[TMP9]], +; AVX512-NEXT: [[TMP11:%.*]] = trunc <4 x i64> [[TMP10]] to <4 x i32> +; AVX512-NEXT: [[TMP12:%.*]] = icmp ult <4 x i32> [[TMP11]], +; AVX512-NEXT: [[TMP13:%.*]] = and <4 x i64> [[TMP10]], +; AVX512-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP12]], <4 x i64> [[TMP13]], <4 x i64> +; AVX512-NEXT: [[TMP15:%.*]] = bitcast i64* [[TMP0]] to <4 x i64>* +; AVX512-NEXT: store <4 x i64> [[TMP14]], <4 x i64>* [[TMP15]], align 8, !tbaa [[TBAA5]] +; AVX512-NEXT: ret void ; %4 = zext i32 %1 to i64 %5 = load i64, i64* %0, align 8, !tbaa !7 Index: llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -8,19 +8,19 @@ define void @gather_load(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) { ; CHECK-LABEL: @gather_load( ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, [[TBAA0:!tbaa !.*]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, [[TBAA0]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; 
CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]] -; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0]] +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3 ; CHECK-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, [[TBAA0]] +; CHECK-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] ; CHECK-NEXT: ret void ; %3 = getelementptr inbounds i32, i32* %1, i64 1 @@ -46,66 +46,66 @@ define void @gather_load_2(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_2( ; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; SSE-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0:!tbaa !.*]] +; SSE-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1 ; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; SSE-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, [[TBAA0]] +; SSE-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; SSE-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]] +; SSE-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2 ; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; SSE-NEXT: store i32 [[TMP9]], i32* [[TMP6]], align 4, [[TBAA0]] +; SSE-NEXT: store i32 [[TMP9]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; SSE-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, [[TBAA0]] +; SSE-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 ; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; SSE-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, [[TBAA0]] +; SSE-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; SSE-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, [[TBAA0]] +; SSE-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 -; SSE-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, [[TBAA0]] +; SSE-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_2( ; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0:!tbaa !.*]] +; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1 ; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX-NEXT: store i32 [[TMP5]], 
i32* [[TMP0]], align 4, [[TBAA0]] +; AVX-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2 ; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; AVX-NEXT: store i32 [[TMP9]], i32* [[TMP6]], align 4, [[TBAA0]] +; AVX-NEXT: store i32 [[TMP9]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, [[TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 ; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; AVX-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, [[TBAA0]] +; AVX-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, [[TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 -; AVX-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, [[TBAA0]] +; AVX-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_2( ; AVX2-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i32 0 ; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> undef, <4 x i32> zeroinitializer ; AVX2-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> -; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0:!tbaa !.*]] +; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], ; AVX2-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; AVX2-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, [[TBAA0]] +; AVX2-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512-LABEL: @gather_load_2( ; AVX512-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i32 0 ; AVX512-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> undef, <4 x i32> zeroinitializer ; AVX512-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> -; AVX512-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0:!tbaa !.*]] +; AVX512-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] ; AVX512-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], ; AVX512-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; AVX512-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, [[TBAA0]] +; AVX512-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]] ; AVX512-NEXT: ret void ; %3 = getelementptr inbounds i32, i32* %1, i64 1 @@ -133,144 +133,87 @@ define void @gather_load_3(i32* noalias nocapture %0, i32* 
noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_3( -; SSE-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; SSE-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; SSE-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2 -; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; SSE-NEXT: store i32 [[TMP8]], i32* [[TMP5]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; SSE-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 -; SSE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; SSE-NEXT: store i32 [[TMP12]], i32* [[TMP9]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; SSE-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4 -; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4 -; SSE-NEXT: store i32 [[TMP16]], i32* [[TMP13]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; SSE-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1 -; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; SSE-NEXT: store i32 [[TMP20]], i32* [[TMP17]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; SSE-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2 -; SSE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; SSE-NEXT: store i32 [[TMP24]], i32* [[TMP21]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; SSE-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3 -; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; SSE-NEXT: store i32 [[TMP28]], i32* [[TMP25]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; SSE-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4 -; SSE-NEXT: store i32 [[TMP32]], i32* [[TMP29]], align 4, [[TBAA0]] +; SSE-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; SSE-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; SSE-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; SSE-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP5]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP7]], i32 2 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> 
[[TMP12]], i32 [[TMP9]], i32 3 +; SSE-NEXT: [[TMP14:%.*]] = add <4 x i32> [[TMP13]], +; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4 +; SSE-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; SSE-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; SSE-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +; SSE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 +; SSE-NEXT: store i32 [[TMP19]], i32* [[TMP15]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; SSE-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 2 +; SSE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 +; SSE-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; SSE-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP27:%.*]] = add i32 [[TMP26]], 3 +; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 +; SSE-NEXT: store i32 [[TMP27]], i32* [[TMP24]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; SSE-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP31:%.*]] = add i32 [[TMP30]], 4 +; SSE-NEXT: store i32 [[TMP31]], i32* [[TMP28]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_3( -; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2 -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; AVX-NEXT: store i32 [[TMP8]], i32* [[TMP5]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 -; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; AVX-NEXT: store i32 [[TMP12]], i32* [[TMP9]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4 -; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4 -; AVX-NEXT: store i32 [[TMP16]], i32* [[TMP13]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1 -; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX-NEXT: store i32 [[TMP20]], i32* [[TMP17]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, [[TBAA0]] 
-; AVX-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2 -; AVX-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX-NEXT: store i32 [[TMP24]], i32* [[TMP21]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3 -; AVX-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX-NEXT: store i32 [[TMP28]], i32* [[TMP25]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4 -; AVX-NEXT: store i32 [[TMP32]], i32* [[TMP29]], align 4, [[TBAA0]] +; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 +; AVX-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32*> [[TMP10]], <4 x i32*> undef, <4 x i32> zeroinitializer +; AVX-NEXT: [[TMP12:%.*]] = getelementptr i32, <4 x i32*> [[TMP11]], <4 x i64> +; AVX-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP12]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> undef, <8 x i32> +; AVX-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0 +; AVX-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[TMP5]], i32 1 +; AVX-NEXT: [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP7]], i32 2 +; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> [[TMP17]], i32 [[TMP9]], i32 3 +; AVX-NEXT: [[TMP19:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> [[TMP14]], <8 x i32> +; AVX-NEXT: [[TMP20:%.*]] = add <8 x i32> [[TMP19]], +; AVX-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX-NEXT: store <8 x i32> [[TMP20]], <8 x i32>* [[TMP21]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( -; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX2-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]] +; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 ; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> -; AVX2-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = add <4 x i32> 
[[TMP9]], -; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX2-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 2 -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX2-NEXT: store i32 [[TMP15]], i32* [[TMP11]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX2-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 3 -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX2-NEXT: store i32 [[TMP19]], i32* [[TMP16]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX2-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 4 -; AVX2-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> +; AVX2-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> undef, <8 x i32> +; AVX2-NEXT: [[TMP11:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i32 0 +; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32*> [[TMP11]], <2 x i32*> undef, <2 x i32> zeroinitializer +; AVX2-NEXT: [[TMP13:%.*]] = getelementptr i32, <2 x i32*> [[TMP12]], <2 x i64> +; AVX2-NEXT: [[TMP14:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP13]], i32 4, <2 x i1> , <2 x i32> undef), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP14]], <2 x i32> undef, <8 x i32> +; AVX2-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0 +; AVX2-NEXT: [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP5]], i32 1 +; AVX2-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP17]], <8 x i32> [[TMP10]], <8 x i32> +; AVX2-NEXT: [[TMP19:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> [[TMP15]], <8 x i32> +; AVX2-NEXT: [[TMP20:%.*]] = add <8 x i32> [[TMP19]], +; AVX2-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX2-NEXT: store <8 x i32> [[TMP20]], <8 x i32>* [[TMP21]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void -; -; AVX512-LABEL: @gather_load_3( -; AVX512-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX512-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 -; AVX512-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> -; AVX512-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0]] -; AVX512-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], -; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX512-NEXT: 
[[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX512-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX512-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 2 -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX512-NEXT: store i32 [[TMP15]], i32* [[TMP11]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX512-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 3 -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX512-NEXT: store i32 [[TMP19]], i32* [[TMP16]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX512-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 4 -; AVX512-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, [[TBAA0]] -; AVX512-NEXT: ret void ; %3 = load i32, i32* %1, align 4, !tbaa !2 %4 = add i32 %3, 1 @@ -315,13 +258,10 @@ define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture readonly %t1) { ; SSE-LABEL: @gather_load_4( -; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; SSE-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 -; SSE-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2 ; SSE-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 -; SSE-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3 ; SSE-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 -; SSE-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4 +; SSE-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4 ; SSE-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 ; SSE-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 ; SSE-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 @@ -329,130 +269,76 @@ ; SSE-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 ; SSE-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 ; SSE-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; SSE-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]] -; SSE-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, [[TBAA0]] -; SSE-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, [[TBAA0]] -; SSE-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, [[TBAA0]] -; SSE-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, [[TBAA0]] -; SSE-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]] -; SSE-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]] -; SSE-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]] -; SSE-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; SSE-NEXT: [[T8:%.*]] = add i32 [[T7]], 2 -; SSE-NEXT: [[T12:%.*]] = add i32 [[T11]], 3 -; SSE-NEXT: [[T16:%.*]] = add i32 [[T15]], 4 +; SSE-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa 
[[TBAA0]] +; SSE-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i32 3 +; SSE-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], ; SSE-NEXT: [[T20:%.*]] = add i32 [[T19]], 1 ; SSE-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 ; SSE-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 ; SSE-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; SSE-NEXT: store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T8]], i32* [[T5]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T12]], i32* [[T9]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T16]], i32* [[T13]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T20]], i32* [[T17]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]] +; SSE-NEXT: [[TMP6:%.*]] = bitcast i32* [[T0]] to <4 x i32>* +; SSE-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[T20]], i32* [[T17]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_4( -; AVX-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 -; AVX-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2 ; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 -; AVX-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3 ; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 -; AVX-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4 -; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 -; AVX-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 -; AVX-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]] -; AVX-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, [[TBAA0]] -; AVX-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, [[TBAA0]] -; AVX-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, [[TBAA0]] -; AVX-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, [[TBAA0]] -; AVX-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]] -; AVX-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]] -; AVX-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]] -; AVX-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX-NEXT: [[T8:%.*]] = add i32 [[T7]], 2 -; AVX-NEXT: [[T12:%.*]] = add i32 [[T11]], 3 -; AVX-NEXT: [[T16:%.*]] = add i32 
[[T15]], 4 -; AVX-NEXT: [[T20:%.*]] = add i32 [[T19]], 1 -; AVX-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX-NEXT: store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T8]], i32* [[T5]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T12]], i32* [[T9]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T16]], i32* [[T13]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T20]], i32* [[T17]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]] +; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer +; AVX-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> +; AVX-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <8 x i32> +; AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0 +; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T7]], i32 1 +; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T11]], i32 2 +; AVX-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[T15]], i32 3 +; AVX-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> [[TMP5]], <8 x i32> +; AVX-NEXT: [[TMP11:%.*]] = add <8 x i32> [[TMP10]], +; AVX-NEXT: [[TMP12:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX-NEXT: store <8 x i32> [[TMP11]], <8 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_4( -; AVX2-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 -; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0 +; AVX2-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i32 0 ; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> -; AVX2-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX2-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX2-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 -; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX2-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0]] -; AVX2-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]] -; AVX2-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]] -; AVX2-NEXT: [[T31:%.*]] = load i32, i32* 
[[T30]], align 4, [[TBAA0]] -; AVX2-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX2-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], -; AVX2-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX2-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX2-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX2-NEXT: store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>* -; AVX2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, [[TBAA0]] -; AVX2-NEXT: store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]] -; AVX2-NEXT: store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]] -; AVX2-NEXT: store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]] +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> +; AVX2-NEXT: [[TMP4:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i32 0 +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32*> [[TMP4]], <2 x i32*> undef, <2 x i32> zeroinitializer +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr i32, <2 x i32*> [[TMP5]], <2 x i64> +; AVX2-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <8 x i32> +; AVX2-NEXT: [[TMP9:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP6]], i32 4, <2 x i1> , <2 x i32> undef), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> undef, <8 x i32> +; AVX2-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0 +; AVX2-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[T7]], i32 1 +; AVX2-NEXT: [[TMP13:%.*]] = shufflevector <8 x i32> [[TMP12]], <8 x i32> [[TMP8]], <8 x i32> +; AVX2-NEXT: [[TMP14:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP10]], <8 x i32> +; AVX2-NEXT: [[TMP15:%.*]] = add <8 x i32> [[TMP14]], +; AVX2-NEXT: [[TMP16:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX2-NEXT: store <8 x i32> [[TMP15]], <8 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void -; -; AVX512-LABEL: @gather_load_4( -; AVX512-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 -; AVX512-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0 -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX512-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> -; AVX512-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX512-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX512-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX512-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX512-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 -; AVX512-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX512-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0]] -; AVX512-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]] -; AVX512-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]] -; AVX512-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]] -; AVX512-NEXT: 
[[T4:%.*]] = add i32 [[T3]], 1 -; AVX512-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], -; AVX512-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX512-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX512-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX512-NEXT: store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>* -; AVX512-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, [[TBAA0]] -; AVX512-NEXT: store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]] -; AVX512-NEXT: store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]] -; AVX512-NEXT: store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]] -; AVX512-NEXT: ret void ; %t5 = getelementptr inbounds i32, i32* %t0, i64 1 %t6 = getelementptr inbounds i32, i32* %t1, i64 11 @@ -509,21 +395,21 @@ ; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x float*> [[TMP6]], float* [[TMP3]], i32 1 ; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x float*> [[TMP7]], float* [[TMP4]], i32 2 ; SSE-NEXT: [[TMP9:%.*]] = insertelement <4 x float*> [[TMP8]], float* [[TMP5]], i32 3 -; SSE-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP9]], i32 4, <4 x i1> , <4 x float> undef), [[TBAA0]] +; SSE-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP9]], i32 4, <4 x i1> , <4 x float> undef), !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> undef, <4 x i32> zeroinitializer ; SSE-NEXT: [[TMP12:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> -; SSE-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP12]], i32 4, <4 x i1> , <4 x float> undef), [[TBAA0]] +; SSE-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP12]], i32 4, <4 x i1> , <4 x float> undef), !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP14:%.*]] = fdiv <4 x float> [[TMP10]], [[TMP13]] ; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4 ; SSE-NEXT: [[TMP16:%.*]] = bitcast float* [[TMP0]] to <4 x float>* -; SSE-NEXT: store <4 x float> [[TMP14]], <4 x float>* [[TMP16]], align 4, [[TBAA0]] +; SSE-NEXT: store <4 x float> [[TMP14]], <4 x float>* [[TMP16]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP17:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> -; SSE-NEXT: [[TMP18:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP17]], i32 4, <4 x i1> , <4 x float> undef), [[TBAA0]] +; SSE-NEXT: [[TMP18:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP17]], i32 4, <4 x i1> , <4 x float> undef), !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP19:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> -; SSE-NEXT: [[TMP20:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP19]], i32 4, <4 x i1> , <4 x float> undef), [[TBAA0]] +; SSE-NEXT: [[TMP20:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP19]], i32 4, <4 x i1> , <4 x float> undef), !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP21:%.*]] = fdiv <4 x float> [[TMP18]], [[TMP20]] ; SSE-NEXT: [[TMP22:%.*]] = bitcast float* [[TMP15]] to <4 x float>* -; SSE-NEXT: store <4 x float> [[TMP21]], <4 x float>* [[TMP22]], align 4, [[TBAA0]] +; SSE-NEXT: store <4 x float> [[TMP21]], <4 x float>* [[TMP22]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_div( @@ -542,13 +428,13 @@ ; AVX-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5 ; AVX-NEXT: 
[[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6 ; AVX-NEXT: [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7 -; AVX-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> , <8 x float> undef), [[TBAA0]] +; AVX-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer ; AVX-NEXT: [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> -; AVX-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> , <8 x float> undef), [[TBAA0]] +; AVX-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]] ; AVX-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]] +; AVX-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_div( @@ -567,13 +453,13 @@ ; AVX2-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5 ; AVX2-NEXT: [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6 ; AVX2-NEXT: [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7 -; AVX2-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> , <8 x float> undef), [[TBAA0]] +; AVX2-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer ; AVX2-NEXT: [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> -; AVX2-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> , <8 x float> undef), [[TBAA0]] +; AVX2-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]] ; AVX2-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX2-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]] +; AVX2-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512-LABEL: @gather_load_div( @@ -592,13 +478,13 @@ ; AVX512-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5 ; AVX512-NEXT: [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6 ; AVX512-NEXT: [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7 -; AVX512-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> , <8 x float> undef), [[TBAA0]] +; AVX512-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX512-NEXT: [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> 
zeroinitializer ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> -; AVX512-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> , <8 x float> undef), [[TBAA0]] +; AVX512-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX512-NEXT: [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]] ; AVX512-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX512-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]] +; AVX512-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]] ; AVX512-NEXT: ret void ; %3 = load float, float* %1, align 4, !tbaa !2 Index: llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -8,19 +8,19 @@ define void @gather_load(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) { ; CHECK-LABEL: @gather_load( ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, [[TBAA0:!tbaa !.*]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, [[TBAA0]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]] -; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0]] +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3 ; CHECK-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, [[TBAA0]] +; CHECK-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] ; CHECK-NEXT: ret void ; %3 = getelementptr inbounds i32, i32* %1, i64 1 @@ -46,66 +46,66 @@ define void @gather_load_2(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_2( ; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; SSE-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0:!tbaa !.*]] +; SSE-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1 ; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; SSE-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, [[TBAA0]] +; SSE-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; SSE-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], 
align 4, [[TBAA0]] +; SSE-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2 ; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; SSE-NEXT: store i32 [[TMP9]], i32* [[TMP6]], align 4, [[TBAA0]] +; SSE-NEXT: store i32 [[TMP9]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; SSE-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, [[TBAA0]] +; SSE-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 ; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; SSE-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, [[TBAA0]] +; SSE-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; SSE-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, [[TBAA0]] +; SSE-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 -; SSE-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, [[TBAA0]] +; SSE-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_2( ; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0:!tbaa !.*]] +; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1 ; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, [[TBAA0]] +; AVX-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2 ; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; AVX-NEXT: store i32 [[TMP9]], i32* [[TMP6]], align 4, [[TBAA0]] +; AVX-NEXT: store i32 [[TMP9]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, [[TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 ; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; AVX-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, [[TBAA0]] +; AVX-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, [[TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 -; AVX-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, [[TBAA0]] +; AVX-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_2( ; AVX2-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i32 0 ; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> undef, <4 x i32> zeroinitializer ; AVX2-NEXT: 
[[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> -; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0:!tbaa !.*]] +; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], ; AVX2-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; AVX2-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, [[TBAA0]] +; AVX2-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512-LABEL: @gather_load_2( ; AVX512-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i32 0 ; AVX512-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> undef, <4 x i32> zeroinitializer ; AVX512-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> -; AVX512-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0:!tbaa !.*]] +; AVX512-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] ; AVX512-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], ; AVX512-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; AVX512-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, [[TBAA0]] +; AVX512-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]] ; AVX512-NEXT: ret void ; %3 = getelementptr inbounds i32, i32* %1, i64 1 @@ -133,144 +133,87 @@ define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_3( -; SSE-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; SSE-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; SSE-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2 -; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; SSE-NEXT: store i32 [[TMP8]], i32* [[TMP5]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; SSE-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 -; SSE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; SSE-NEXT: store i32 [[TMP12]], i32* [[TMP9]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; SSE-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4 -; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4 -; SSE-NEXT: store i32 [[TMP16]], i32* [[TMP13]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; SSE-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1 -; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; SSE-NEXT: store i32 [[TMP20]], i32* [[TMP17]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], 
i64 9 -; SSE-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2 -; SSE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; SSE-NEXT: store i32 [[TMP24]], i32* [[TMP21]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; SSE-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3 -; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; SSE-NEXT: store i32 [[TMP28]], i32* [[TMP25]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; SSE-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4 -; SSE-NEXT: store i32 [[TMP32]], i32* [[TMP29]], align 4, [[TBAA0]] +; SSE-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; SSE-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; SSE-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; SSE-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP5]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP7]], i32 2 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3 +; SSE-NEXT: [[TMP14:%.*]] = add <4 x i32> [[TMP13]], +; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4 +; SSE-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; SSE-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; SSE-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +; SSE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 +; SSE-NEXT: store i32 [[TMP19]], i32* [[TMP15]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; SSE-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 2 +; SSE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 +; SSE-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; SSE-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP27:%.*]] = add i32 [[TMP26]], 3 +; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 +; SSE-NEXT: store i32 [[TMP27]], i32* [[TMP24]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; SSE-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP31:%.*]] = add i32 [[TMP30]], 4 +; SSE-NEXT: store i32 [[TMP31]], i32* [[TMP28]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_3( -; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, 
[[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2 -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; AVX-NEXT: store i32 [[TMP8]], i32* [[TMP5]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 -; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; AVX-NEXT: store i32 [[TMP12]], i32* [[TMP9]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4 -; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4 -; AVX-NEXT: store i32 [[TMP16]], i32* [[TMP13]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1 -; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX-NEXT: store i32 [[TMP20]], i32* [[TMP17]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2 -; AVX-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX-NEXT: store i32 [[TMP24]], i32* [[TMP21]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3 -; AVX-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX-NEXT: store i32 [[TMP28]], i32* [[TMP25]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4 -; AVX-NEXT: store i32 [[TMP32]], i32* [[TMP29]], align 4, [[TBAA0]] +; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 +; AVX-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32*> [[TMP10]], <4 x i32*> undef, <4 x i32> zeroinitializer +; AVX-NEXT: [[TMP12:%.*]] = getelementptr i32, <4 x i32*> [[TMP11]], <4 x i64> +; AVX-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP12]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX-NEXT: 
[[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> undef, <8 x i32> +; AVX-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0 +; AVX-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[TMP5]], i32 1 +; AVX-NEXT: [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP7]], i32 2 +; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> [[TMP17]], i32 [[TMP9]], i32 3 +; AVX-NEXT: [[TMP19:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> [[TMP14]], <8 x i32> +; AVX-NEXT: [[TMP20:%.*]] = add <8 x i32> [[TMP19]], +; AVX-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX-NEXT: store <8 x i32> [[TMP20]], <8 x i32>* [[TMP21]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( -; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX2-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]] +; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 ; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> -; AVX2-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], -; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX2-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 2 -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX2-NEXT: store i32 [[TMP15]], i32* [[TMP11]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX2-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 3 -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX2-NEXT: store i32 [[TMP19]], i32* [[TMP16]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX2-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 4 -; AVX2-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> +; AVX2-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> undef, <8 x i32> +; AVX2-NEXT: [[TMP11:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i32 0 +; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32*> [[TMP11]], <2 x i32*> undef, <2 x i32> zeroinitializer +; AVX2-NEXT: [[TMP13:%.*]] = getelementptr i32, <2 x i32*> [[TMP12]], <2 x 
i64> +; AVX2-NEXT: [[TMP14:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP13]], i32 4, <2 x i1> , <2 x i32> undef), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP14]], <2 x i32> undef, <8 x i32> +; AVX2-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0 +; AVX2-NEXT: [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP5]], i32 1 +; AVX2-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP17]], <8 x i32> [[TMP10]], <8 x i32> +; AVX2-NEXT: [[TMP19:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> [[TMP15]], <8 x i32> +; AVX2-NEXT: [[TMP20:%.*]] = add <8 x i32> [[TMP19]], +; AVX2-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX2-NEXT: store <8 x i32> [[TMP20]], <8 x i32>* [[TMP21]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void -; -; AVX512-LABEL: @gather_load_3( -; AVX512-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX512-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 -; AVX512-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> -; AVX512-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0]] -; AVX512-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], -; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX512-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX512-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX512-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 2 -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX512-NEXT: store i32 [[TMP15]], i32* [[TMP11]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX512-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 3 -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX512-NEXT: store i32 [[TMP19]], i32* [[TMP16]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX512-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 4 -; AVX512-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, [[TBAA0]] -; AVX512-NEXT: ret void ; %3 = load i32, i32* %1, align 4, !tbaa !2 %4 = add i32 %3, 1 @@ -315,13 +258,10 @@ define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture readonly %t1) { ; SSE-LABEL: @gather_load_4( -; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; SSE-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 -; SSE-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2 ; SSE-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 -; SSE-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3 ; SSE-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* 
[[T1]], i64 15 -; SSE-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4 +; SSE-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4 ; SSE-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 ; SSE-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 ; SSE-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 @@ -329,130 +269,76 @@ ; SSE-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 ; SSE-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 ; SSE-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; SSE-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]] -; SSE-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, [[TBAA0]] -; SSE-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, [[TBAA0]] -; SSE-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, [[TBAA0]] -; SSE-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, [[TBAA0]] -; SSE-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]] -; SSE-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]] -; SSE-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]] -; SSE-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; SSE-NEXT: [[T8:%.*]] = add i32 [[T7]], 2 -; SSE-NEXT: [[T12:%.*]] = add i32 [[T11]], 3 -; SSE-NEXT: [[T16:%.*]] = add i32 [[T15]], 4 +; SSE-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i32 3 +; SSE-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], ; SSE-NEXT: [[T20:%.*]] = add i32 [[T19]], 1 ; SSE-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 ; SSE-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 ; SSE-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; SSE-NEXT: store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T8]], i32* [[T5]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T12]], i32* [[T9]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T16]], i32* [[T13]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T20]], i32* [[T17]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]] +; SSE-NEXT: [[TMP6:%.*]] = bitcast i32* [[T0]] to <4 x i32>* +; SSE-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[T20]], i32* [[T17]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_4( -; AVX-NEXT: [[T5:%.*]] = 
getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 -; AVX-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2 ; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 -; AVX-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3 ; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 -; AVX-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4 -; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 -; AVX-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 -; AVX-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]] -; AVX-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, [[TBAA0]] -; AVX-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, [[TBAA0]] -; AVX-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, [[TBAA0]] -; AVX-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, [[TBAA0]] -; AVX-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]] -; AVX-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]] -; AVX-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]] -; AVX-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX-NEXT: [[T8:%.*]] = add i32 [[T7]], 2 -; AVX-NEXT: [[T12:%.*]] = add i32 [[T11]], 3 -; AVX-NEXT: [[T16:%.*]] = add i32 [[T15]], 4 -; AVX-NEXT: [[T20:%.*]] = add i32 [[T19]], 1 -; AVX-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX-NEXT: store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T8]], i32* [[T5]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T12]], i32* [[T9]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T16]], i32* [[T13]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T20]], i32* [[T17]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]] +; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer +; AVX-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> +; AVX-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <8 x i32> +; AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0 +; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T7]], i32 1 +; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T11]], i32 2 +; AVX-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> 
[[TMP8]], i32 [[T15]], i32 3 +; AVX-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> [[TMP5]], <8 x i32> +; AVX-NEXT: [[TMP11:%.*]] = add <8 x i32> [[TMP10]], +; AVX-NEXT: [[TMP12:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX-NEXT: store <8 x i32> [[TMP11]], <8 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_4( -; AVX2-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 -; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0 +; AVX2-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i32 0 ; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> -; AVX2-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX2-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX2-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 -; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX2-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0]] -; AVX2-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]] -; AVX2-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]] -; AVX2-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]] -; AVX2-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX2-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], -; AVX2-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX2-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX2-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX2-NEXT: store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>* -; AVX2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, [[TBAA0]] -; AVX2-NEXT: store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]] -; AVX2-NEXT: store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]] -; AVX2-NEXT: store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]] +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> +; AVX2-NEXT: [[TMP4:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i32 0 +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32*> [[TMP4]], <2 x i32*> undef, <2 x i32> zeroinitializer +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr i32, <2 x i32*> [[TMP5]], <2 x i64> +; AVX2-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <8 x i32> +; AVX2-NEXT: [[TMP9:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP6]], i32 4, <2 x i1> , <2 x i32> undef), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> undef, <8 x i32> +; AVX2-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0 +; AVX2-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[T7]], i32 
1 +; AVX2-NEXT: [[TMP13:%.*]] = shufflevector <8 x i32> [[TMP12]], <8 x i32> [[TMP8]], <8 x i32> +; AVX2-NEXT: [[TMP14:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP10]], <8 x i32> +; AVX2-NEXT: [[TMP15:%.*]] = add <8 x i32> [[TMP14]], +; AVX2-NEXT: [[TMP16:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX2-NEXT: store <8 x i32> [[TMP15]], <8 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void -; -; AVX512-LABEL: @gather_load_4( -; AVX512-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 -; AVX512-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0 -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX512-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> -; AVX512-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX512-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX512-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX512-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX512-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 -; AVX512-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX512-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0]] -; AVX512-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]] -; AVX512-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]] -; AVX512-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]] -; AVX512-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX512-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], -; AVX512-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX512-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX512-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX512-NEXT: store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>* -; AVX512-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, [[TBAA0]] -; AVX512-NEXT: store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]] -; AVX512-NEXT: store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]] -; AVX512-NEXT: store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]] -; AVX512-NEXT: ret void ; %t5 = getelementptr inbounds i32, i32* %t0, i64 1 %t6 = getelementptr inbounds i32, i32* %t1, i64 11 @@ -509,21 +395,21 @@ ; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x float*> [[TMP6]], float* [[TMP3]], i32 1 ; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x float*> [[TMP7]], float* [[TMP4]], i32 2 ; SSE-NEXT: [[TMP9:%.*]] = insertelement <4 x float*> [[TMP8]], float* [[TMP5]], i32 3 -; SSE-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP9]], i32 4, <4 x i1> , <4 x float> undef), [[TBAA0]] +; SSE-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP9]], i32 4, <4 x i1> , <4 x float> undef), !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> undef, <4 x i32> zeroinitializer ; SSE-NEXT: [[TMP12:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> -; SSE-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP12]], i32 4, <4 x i1> , <4 x float> undef), [[TBAA0]] +; SSE-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP12]], 
i32 4, <4 x i1> , <4 x float> undef), !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP14:%.*]] = fdiv <4 x float> [[TMP10]], [[TMP13]] ; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4 ; SSE-NEXT: [[TMP16:%.*]] = bitcast float* [[TMP0]] to <4 x float>* -; SSE-NEXT: store <4 x float> [[TMP14]], <4 x float>* [[TMP16]], align 4, [[TBAA0]] +; SSE-NEXT: store <4 x float> [[TMP14]], <4 x float>* [[TMP16]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP17:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> -; SSE-NEXT: [[TMP18:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP17]], i32 4, <4 x i1> , <4 x float> undef), [[TBAA0]] +; SSE-NEXT: [[TMP18:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP17]], i32 4, <4 x i1> , <4 x float> undef), !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP19:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> -; SSE-NEXT: [[TMP20:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP19]], i32 4, <4 x i1> , <4 x float> undef), [[TBAA0]] +; SSE-NEXT: [[TMP20:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP19]], i32 4, <4 x i1> , <4 x float> undef), !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP21:%.*]] = fdiv <4 x float> [[TMP18]], [[TMP20]] ; SSE-NEXT: [[TMP22:%.*]] = bitcast float* [[TMP15]] to <4 x float>* -; SSE-NEXT: store <4 x float> [[TMP21]], <4 x float>* [[TMP22]], align 4, [[TBAA0]] +; SSE-NEXT: store <4 x float> [[TMP21]], <4 x float>* [[TMP22]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_div( @@ -542,13 +428,13 @@ ; AVX-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5 ; AVX-NEXT: [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6 ; AVX-NEXT: [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7 -; AVX-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> , <8 x float> undef), [[TBAA0]] +; AVX-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer ; AVX-NEXT: [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> -; AVX-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> , <8 x float> undef), [[TBAA0]] +; AVX-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]] ; AVX-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]] +; AVX-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_div( @@ -567,13 +453,13 @@ ; AVX2-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5 ; AVX2-NEXT: [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6 ; AVX2-NEXT: [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7 -; AVX2-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> , <8 x float> undef), [[TBAA0]] +; AVX2-NEXT: [[TMP18:%.*]] = call 
<8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer ; AVX2-NEXT: [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> -; AVX2-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> , <8 x float> undef), [[TBAA0]] +; AVX2-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]] ; AVX2-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX2-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]] +; AVX2-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512-LABEL: @gather_load_div( @@ -592,13 +478,13 @@ ; AVX512-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5 ; AVX512-NEXT: [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6 ; AVX512-NEXT: [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7 -; AVX512-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> , <8 x float> undef), [[TBAA0]] +; AVX512-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX512-NEXT: [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> -; AVX512-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> , <8 x float> undef), [[TBAA0]] +; AVX512-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX512-NEXT: [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]] ; AVX512-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX512-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]] +; AVX512-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]] ; AVX512-NEXT: ret void ; %3 = load float, float* %1, align 4, !tbaa !2 Index: llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll +++ llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll @@ -340,146 +340,169 @@ define void @ashr_v32i16() { ; SSE-LABEL: @ashr_v32i16( -; SSE-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2 -; SSE-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2 -; SSE-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2 -; SSE-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2 -; SSE-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2 -; SSE-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, 
i64 5), align 2 -; SSE-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2 -; SSE-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2 -; SSE-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2 -; SSE-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2 -; SSE-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2 -; SSE-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2 -; SSE-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2 -; SSE-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2 -; SSE-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2 -; SSE-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2 -; SSE-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2 -; SSE-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2 -; SSE-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2 -; SSE-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2 -; SSE-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2 -; SSE-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2 -; SSE-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2 -; SSE-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2 -; SSE-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2 -; SSE-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2 -; SSE-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2 -; SSE-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2 -; SSE-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2 -; SSE-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2 -; SSE-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2 -; SSE-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2 -; SSE-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2 -; SSE-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2 -; SSE-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2 -; SSE-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x 
i16], [32 x i16]* @b16, i32 0, i64 3), align 2 -; SSE-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2 -; SSE-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2 -; SSE-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2 -; SSE-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2 -; SSE-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2 -; SSE-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2 -; SSE-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2 -; SSE-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2 -; SSE-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2 -; SSE-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2 -; SSE-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2 -; SSE-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2 -; SSE-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2 -; SSE-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2 -; SSE-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2 -; SSE-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2 -; SSE-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2 -; SSE-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2 -; SSE-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2 -; SSE-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2 -; SSE-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2 -; SSE-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2 -; SSE-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2 -; SSE-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2 -; SSE-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2 -; SSE-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2 -; SSE-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2 -; SSE-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2 -; SSE-NEXT: [[R0:%.*]] = ashr i16 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = ashr i16 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = ashr i16 [[A2]], [[B2]] -; 
SSE-NEXT: [[R3:%.*]] = ashr i16 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = ashr i16 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = ashr i16 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = ashr i16 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = ashr i16 [[A7]], [[B7]] -; SSE-NEXT: [[R8:%.*]] = ashr i16 [[A8]], [[B8]] -; SSE-NEXT: [[R9:%.*]] = ashr i16 [[A9]], [[B9]] -; SSE-NEXT: [[R10:%.*]] = ashr i16 [[A10]], [[B10]] -; SSE-NEXT: [[R11:%.*]] = ashr i16 [[A11]], [[B11]] -; SSE-NEXT: [[R12:%.*]] = ashr i16 [[A12]], [[B12]] -; SSE-NEXT: [[R13:%.*]] = ashr i16 [[A13]], [[B13]] -; SSE-NEXT: [[R14:%.*]] = ashr i16 [[A14]], [[B14]] -; SSE-NEXT: [[R15:%.*]] = ashr i16 [[A15]], [[B15]] -; SSE-NEXT: [[R16:%.*]] = ashr i16 [[A16]], [[B16]] -; SSE-NEXT: [[R17:%.*]] = ashr i16 [[A17]], [[B17]] -; SSE-NEXT: [[R18:%.*]] = ashr i16 [[A18]], [[B18]] -; SSE-NEXT: [[R19:%.*]] = ashr i16 [[A19]], [[B19]] -; SSE-NEXT: [[R20:%.*]] = ashr i16 [[A20]], [[B20]] -; SSE-NEXT: [[R21:%.*]] = ashr i16 [[A21]], [[B21]] -; SSE-NEXT: [[R22:%.*]] = ashr i16 [[A22]], [[B22]] -; SSE-NEXT: [[R23:%.*]] = ashr i16 [[A23]], [[B23]] -; SSE-NEXT: [[R24:%.*]] = ashr i16 [[A24]], [[B24]] -; SSE-NEXT: [[R25:%.*]] = ashr i16 [[A25]], [[B25]] -; SSE-NEXT: [[R26:%.*]] = ashr i16 [[A26]], [[B26]] -; SSE-NEXT: [[R27:%.*]] = ashr i16 [[A27]], [[B27]] -; SSE-NEXT: [[R28:%.*]] = ashr i16 [[A28]], [[B28]] -; SSE-NEXT: [[R29:%.*]] = ashr i16 [[A29]], [[B29]] -; SSE-NEXT: [[R30:%.*]] = ashr i16 [[A30]], [[B30]] -; SSE-NEXT: [[R31:%.*]] = ashr i16 [[A31]], [[B31]] -; SSE-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2 -; SSE-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2 -; SSE-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2 -; SSE-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2 -; SSE-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2 -; SSE-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2 -; SSE-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2 -; SSE-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2 -; SSE-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2 -; SSE-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2 -; SSE-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2 -; SSE-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2 -; SSE-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2 -; SSE-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2 -; SSE-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2 -; SSE-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2 -; SSE-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2 -; SSE-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), 
align 2 -; SSE-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2 -; SSE-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2 -; SSE-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2 -; SSE-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2 -; SSE-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2 -; SSE-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2 -; SSE-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2 -; SSE-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2 -; SSE-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2 -; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2 -; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2 -; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2 -; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2 -; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP5]], i32 0 +; SSE-NEXT: [[R0:%.*]] = ashr i16 [[TMP9]], [[TMP10]] +; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = extractelement <8 x i16> [[TMP5]], i32 1 +; SSE-NEXT: [[R1:%.*]] = ashr i16 [[TMP11]], [[TMP12]] +; SSE-NEXT: [[TMP13:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[TMP14:%.*]] = extractelement <8 x i16> [[TMP5]], i32 2 +; SSE-NEXT: [[R2:%.*]] = ashr i16 [[TMP13]], [[TMP14]] +; SSE-NEXT: [[TMP15:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[TMP5]], i32 3 +; SSE-NEXT: [[R3:%.*]] = ashr 
i16 [[TMP15]], [[TMP16]] +; SSE-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4 +; SSE-NEXT: [[TMP18:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4 +; SSE-NEXT: [[R4:%.*]] = ashr i16 [[TMP17]], [[TMP18]] +; SSE-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 +; SSE-NEXT: [[TMP20:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5 +; SSE-NEXT: [[R5:%.*]] = ashr i16 [[TMP19]], [[TMP20]] +; SSE-NEXT: [[TMP21:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 +; SSE-NEXT: [[TMP22:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6 +; SSE-NEXT: [[R6:%.*]] = ashr i16 [[TMP21]], [[TMP22]] +; SSE-NEXT: [[TMP23:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +; SSE-NEXT: [[TMP24:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7 +; SSE-NEXT: [[R7:%.*]] = ashr i16 [[TMP23]], [[TMP24]] +; SSE-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; SSE-NEXT: [[R8:%.*]] = ashr i16 [[TMP25]], [[TMP26]] +; SSE-NEXT: [[TMP27:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 1 +; SSE-NEXT: [[R9:%.*]] = ashr i16 [[TMP27]], [[TMP28]] +; SSE-NEXT: [[TMP29:%.*]] = extractelement <8 x i16> [[TMP2]], i32 2 +; SSE-NEXT: [[TMP30:%.*]] = extractelement <8 x i16> [[TMP6]], i32 2 +; SSE-NEXT: [[R10:%.*]] = ashr i16 [[TMP29]], [[TMP30]] +; SSE-NEXT: [[TMP31:%.*]] = extractelement <8 x i16> [[TMP2]], i32 3 +; SSE-NEXT: [[TMP32:%.*]] = extractelement <8 x i16> [[TMP6]], i32 3 +; SSE-NEXT: [[R11:%.*]] = ashr i16 [[TMP31]], [[TMP32]] +; SSE-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4 +; SSE-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4 +; SSE-NEXT: [[R12:%.*]] = ashr i16 [[TMP33]], [[TMP34]] +; SSE-NEXT: [[TMP35:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5 +; SSE-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5 +; SSE-NEXT: [[R13:%.*]] = ashr i16 [[TMP35]], [[TMP36]] +; SSE-NEXT: [[TMP37:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6 +; SSE-NEXT: [[TMP38:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6 +; SSE-NEXT: [[R14:%.*]] = ashr i16 [[TMP37]], [[TMP38]] +; SSE-NEXT: [[TMP39:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +; SSE-NEXT: [[TMP40:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7 +; SSE-NEXT: [[R15:%.*]] = ashr i16 [[TMP39]], [[TMP40]] +; SSE-NEXT: [[TMP41:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP42:%.*]] = extractelement <8 x i16> [[TMP7]], i32 0 +; SSE-NEXT: [[R16:%.*]] = ashr i16 [[TMP41]], [[TMP42]] +; SSE-NEXT: [[TMP43:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP44:%.*]] = extractelement <8 x i16> [[TMP7]], i32 1 +; SSE-NEXT: [[R17:%.*]] = ashr i16 [[TMP43]], [[TMP44]] +; SSE-NEXT: [[TMP45:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[TMP46:%.*]] = extractelement <8 x i16> [[TMP7]], i32 2 +; SSE-NEXT: [[R18:%.*]] = ashr i16 [[TMP45]], [[TMP46]] +; SSE-NEXT: [[TMP47:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[TMP48:%.*]] = extractelement <8 x i16> [[TMP7]], i32 3 +; SSE-NEXT: [[R19:%.*]] = ashr i16 [[TMP47]], [[TMP48]] +; SSE-NEXT: [[TMP49:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; SSE-NEXT: [[TMP50:%.*]] = extractelement <8 x i16> [[TMP7]], i32 4 +; SSE-NEXT: [[R20:%.*]] = ashr i16 [[TMP49]], [[TMP50]] +; SSE-NEXT: [[TMP51:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; SSE-NEXT: [[TMP52:%.*]] = extractelement <8 x i16> [[TMP7]], i32 5 +; SSE-NEXT: [[R21:%.*]] = ashr i16 [[TMP51]], 
[[TMP52]] +; SSE-NEXT: [[TMP53:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; SSE-NEXT: [[TMP54:%.*]] = extractelement <8 x i16> [[TMP7]], i32 6 +; SSE-NEXT: [[R22:%.*]] = ashr i16 [[TMP53]], [[TMP54]] +; SSE-NEXT: [[TMP55:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; SSE-NEXT: [[TMP56:%.*]] = extractelement <8 x i16> [[TMP7]], i32 7 +; SSE-NEXT: [[R23:%.*]] = ashr i16 [[TMP55]], [[TMP56]] +; SSE-NEXT: [[TMP57:%.*]] = extractelement <8 x i16> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP58:%.*]] = extractelement <8 x i16> [[TMP8]], i32 0 +; SSE-NEXT: [[R24:%.*]] = ashr i16 [[TMP57]], [[TMP58]] +; SSE-NEXT: [[TMP59:%.*]] = extractelement <8 x i16> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP60:%.*]] = extractelement <8 x i16> [[TMP8]], i32 1 +; SSE-NEXT: [[R25:%.*]] = ashr i16 [[TMP59]], [[TMP60]] +; SSE-NEXT: [[TMP61:%.*]] = extractelement <8 x i16> [[TMP4]], i32 2 +; SSE-NEXT: [[TMP62:%.*]] = extractelement <8 x i16> [[TMP8]], i32 2 +; SSE-NEXT: [[R26:%.*]] = ashr i16 [[TMP61]], [[TMP62]] +; SSE-NEXT: [[TMP63:%.*]] = extractelement <8 x i16> [[TMP4]], i32 3 +; SSE-NEXT: [[TMP64:%.*]] = extractelement <8 x i16> [[TMP8]], i32 3 +; SSE-NEXT: [[R27:%.*]] = ashr i16 [[TMP63]], [[TMP64]] +; SSE-NEXT: [[TMP65:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4 +; SSE-NEXT: [[TMP66:%.*]] = extractelement <8 x i16> [[TMP8]], i32 4 +; SSE-NEXT: [[R28:%.*]] = ashr i16 [[TMP65]], [[TMP66]] +; SSE-NEXT: [[TMP67:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5 +; SSE-NEXT: [[TMP68:%.*]] = extractelement <8 x i16> [[TMP8]], i32 5 +; SSE-NEXT: [[R29:%.*]] = ashr i16 [[TMP67]], [[TMP68]] +; SSE-NEXT: [[TMP69:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6 +; SSE-NEXT: [[TMP70:%.*]] = extractelement <8 x i16> [[TMP8]], i32 6 +; SSE-NEXT: [[R30:%.*]] = ashr i16 [[TMP69]], [[TMP70]] +; SSE-NEXT: [[TMP71:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7 +; SSE-NEXT: [[TMP72:%.*]] = extractelement <8 x i16> [[TMP8]], i32 7 +; SSE-NEXT: [[R31:%.*]] = ashr i16 [[TMP71]], [[TMP72]] +; SSE-NEXT: [[TMP73:%.*]] = insertelement <8 x i16> poison, i16 [[R0]], i32 0 +; SSE-NEXT: [[TMP74:%.*]] = insertelement <8 x i16> [[TMP73]], i16 [[R1]], i32 1 +; SSE-NEXT: [[TMP75:%.*]] = insertelement <8 x i16> [[TMP74]], i16 [[R2]], i32 2 +; SSE-NEXT: [[TMP76:%.*]] = insertelement <8 x i16> [[TMP75]], i16 [[R3]], i32 3 +; SSE-NEXT: [[TMP77:%.*]] = insertelement <8 x i16> [[TMP76]], i16 [[R4]], i32 4 +; SSE-NEXT: [[TMP78:%.*]] = insertelement <8 x i16> [[TMP77]], i16 [[R5]], i32 5 +; SSE-NEXT: [[TMP79:%.*]] = insertelement <8 x i16> [[TMP78]], i16 [[R6]], i32 6 +; SSE-NEXT: [[TMP80:%.*]] = insertelement <8 x i16> [[TMP79]], i16 [[R7]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP80]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP81:%.*]] = insertelement <8 x i16> poison, i16 [[R8]], i32 0 +; SSE-NEXT: [[TMP82:%.*]] = insertelement <8 x i16> [[TMP81]], i16 [[R9]], i32 1 +; SSE-NEXT: [[TMP83:%.*]] = insertelement <8 x i16> [[TMP82]], i16 [[R10]], i32 2 +; SSE-NEXT: [[TMP84:%.*]] = insertelement <8 x i16> [[TMP83]], i16 [[R11]], i32 3 +; SSE-NEXT: [[TMP85:%.*]] = insertelement <8 x i16> [[TMP84]], i16 [[R12]], i32 4 +; SSE-NEXT: [[TMP86:%.*]] = insertelement <8 x i16> [[TMP85]], i16 [[R13]], i32 5 +; SSE-NEXT: [[TMP87:%.*]] = insertelement <8 x i16> [[TMP86]], i16 [[R14]], i32 6 +; SSE-NEXT: [[TMP88:%.*]] = insertelement <8 x i16> [[TMP87]], i16 [[R15]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP88]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; 
SSE-NEXT: [[TMP89:%.*]] = insertelement <8 x i16> poison, i16 [[R16]], i32 0 +; SSE-NEXT: [[TMP90:%.*]] = insertelement <8 x i16> [[TMP89]], i16 [[R17]], i32 1 +; SSE-NEXT: [[TMP91:%.*]] = insertelement <8 x i16> [[TMP90]], i16 [[R18]], i32 2 +; SSE-NEXT: [[TMP92:%.*]] = insertelement <8 x i16> [[TMP91]], i16 [[R19]], i32 3 +; SSE-NEXT: [[TMP93:%.*]] = insertelement <8 x i16> [[TMP92]], i16 [[R20]], i32 4 +; SSE-NEXT: [[TMP94:%.*]] = insertelement <8 x i16> [[TMP93]], i16 [[R21]], i32 5 +; SSE-NEXT: [[TMP95:%.*]] = insertelement <8 x i16> [[TMP94]], i16 [[R22]], i32 6 +; SSE-NEXT: [[TMP96:%.*]] = insertelement <8 x i16> [[TMP95]], i16 [[R23]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP96]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP97:%.*]] = insertelement <8 x i16> poison, i16 [[R24]], i32 0 +; SSE-NEXT: [[TMP98:%.*]] = insertelement <8 x i16> [[TMP97]], i16 [[R25]], i32 1 +; SSE-NEXT: [[TMP99:%.*]] = insertelement <8 x i16> [[TMP98]], i16 [[R26]], i32 2 +; SSE-NEXT: [[TMP100:%.*]] = insertelement <8 x i16> [[TMP99]], i16 [[R27]], i32 3 +; SSE-NEXT: [[TMP101:%.*]] = insertelement <8 x i16> [[TMP100]], i16 [[R28]], i32 4 +; SSE-NEXT: [[TMP102:%.*]] = insertelement <8 x i16> [[TMP101]], i16 [[R29]], i32 5 +; SSE-NEXT: [[TMP103:%.*]] = insertelement <8 x i16> [[TMP102]], i16 [[R30]], i32 6 +; SSE-NEXT: [[TMP104:%.*]] = insertelement <8 x i16> [[TMP103]], i16 [[R31]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP104]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; -; AVX-LABEL: @ashr_v32i16( -; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 -; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: ret void +; AVX1-LABEL: @ashr_v32i16( +; AVX1-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 +; AVX1-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX1-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX1-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX1-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]] +; AVX1-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]] +; AVX1-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX1-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds 
([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @ashr_v32i16( +; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 +; AVX2-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX2-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX2-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX2-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]] +; AVX2-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]] +; AVX2-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX2-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX2-NEXT: ret void ; ; AVX512-LABEL: @ashr_v32i16( ; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2 @@ -499,6 +522,16 @@ ; XOP-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; XOP-NEXT: ret void ; +; AVX-LABEL: @ashr_v32i16( +; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]] +; AVX-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]] +; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: ret void %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2 %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2 %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2 @@ -650,16 +683,27 @@ ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; -; AVX-LABEL: @ashr_v64i8( -; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] 
= ashr <32 x i8> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 -; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: ret void +; AVX1-LABEL: @ashr_v64i8( +; AVX1-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 +; AVX1-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX1-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX1-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX1-NEXT: [[TMP5:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP3]] +; AVX1-NEXT: [[TMP6:%.*]] = ashr <32 x i8> [[TMP2]], [[TMP4]] +; AVX1-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX1-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @ashr_v64i8( +; AVX2-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 +; AVX2-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX2-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX2-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX2-NEXT: [[TMP5:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP3]] +; AVX2-NEXT: [[TMP6:%.*]] = ashr <32 x i8> [[TMP2]], [[TMP4]] +; AVX2-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX2-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX2-NEXT: ret void ; ; AVX512-LABEL: @ashr_v64i8( ; AVX512-NEXT: [[TMP1:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @a8 to <64 x i8>*), align 1 @@ -679,6 +723,16 @@ ; XOP-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; XOP-NEXT: ret void ; +; AVX-LABEL: @ashr_v64i8( +; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP3]] +; AVX-NEXT: [[TMP6:%.*]] = ashr <32 x i8> [[TMP2]], [[TMP4]] +; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: ret void %a0 = load i8, i8* getelementptr inbounds ([64 x 
i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1 %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1 %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1 Index: llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll +++ llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll @@ -289,134 +289,146 @@ define void @lshr_v32i16() { ; SSE-LABEL: @lshr_v32i16( -; SSE-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2 -; SSE-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2 -; SSE-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2 -; SSE-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2 -; SSE-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2 -; SSE-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2 -; SSE-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2 -; SSE-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2 -; SSE-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2 -; SSE-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2 -; SSE-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2 -; SSE-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2 -; SSE-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2 -; SSE-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2 -; SSE-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2 -; SSE-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2 -; SSE-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2 -; SSE-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2 -; SSE-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2 -; SSE-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2 -; SSE-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2 -; SSE-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2 -; SSE-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2 -; SSE-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2 -; SSE-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2 -; SSE-NEXT: 
[[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2 -; SSE-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2 -; SSE-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2 -; SSE-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2 -; SSE-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2 -; SSE-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2 -; SSE-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2 -; SSE-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2 -; SSE-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2 -; SSE-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2 -; SSE-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2 -; SSE-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2 -; SSE-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2 -; SSE-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2 -; SSE-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2 -; SSE-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2 -; SSE-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2 -; SSE-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2 -; SSE-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2 -; SSE-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2 -; SSE-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2 -; SSE-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2 -; SSE-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2 -; SSE-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2 -; SSE-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2 -; SSE-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2 -; SSE-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2 -; SSE-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2 -; SSE-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2 -; SSE-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 
22), align 2 -; SSE-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2 -; SSE-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2 -; SSE-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2 -; SSE-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2 -; SSE-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2 -; SSE-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2 -; SSE-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2 -; SSE-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2 -; SSE-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2 -; SSE-NEXT: [[R0:%.*]] = lshr i16 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = lshr i16 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = lshr i16 [[A2]], [[B2]] -; SSE-NEXT: [[R3:%.*]] = lshr i16 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = lshr i16 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = lshr i16 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = lshr i16 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = lshr i16 [[A7]], [[B7]] -; SSE-NEXT: [[R8:%.*]] = lshr i16 [[A8]], [[B8]] -; SSE-NEXT: [[R9:%.*]] = lshr i16 [[A9]], [[B9]] -; SSE-NEXT: [[R10:%.*]] = lshr i16 [[A10]], [[B10]] -; SSE-NEXT: [[R11:%.*]] = lshr i16 [[A11]], [[B11]] -; SSE-NEXT: [[R12:%.*]] = lshr i16 [[A12]], [[B12]] -; SSE-NEXT: [[R13:%.*]] = lshr i16 [[A13]], [[B13]] -; SSE-NEXT: [[R14:%.*]] = lshr i16 [[A14]], [[B14]] -; SSE-NEXT: [[R15:%.*]] = lshr i16 [[A15]], [[B15]] -; SSE-NEXT: [[R16:%.*]] = lshr i16 [[A16]], [[B16]] -; SSE-NEXT: [[R17:%.*]] = lshr i16 [[A17]], [[B17]] -; SSE-NEXT: [[R18:%.*]] = lshr i16 [[A18]], [[B18]] -; SSE-NEXT: [[R19:%.*]] = lshr i16 [[A19]], [[B19]] -; SSE-NEXT: [[R20:%.*]] = lshr i16 [[A20]], [[B20]] -; SSE-NEXT: [[R21:%.*]] = lshr i16 [[A21]], [[B21]] -; SSE-NEXT: [[R22:%.*]] = lshr i16 [[A22]], [[B22]] -; SSE-NEXT: [[R23:%.*]] = lshr i16 [[A23]], [[B23]] -; SSE-NEXT: [[R24:%.*]] = lshr i16 [[A24]], [[B24]] -; SSE-NEXT: [[R25:%.*]] = lshr i16 [[A25]], [[B25]] -; SSE-NEXT: [[R26:%.*]] = lshr i16 [[A26]], [[B26]] -; SSE-NEXT: [[R27:%.*]] = lshr i16 [[A27]], [[B27]] -; SSE-NEXT: [[R28:%.*]] = lshr i16 [[A28]], [[B28]] -; SSE-NEXT: [[R29:%.*]] = lshr i16 [[A29]], [[B29]] -; SSE-NEXT: [[R30:%.*]] = lshr i16 [[A30]], [[B30]] -; SSE-NEXT: [[R31:%.*]] = lshr i16 [[A31]], [[B31]] -; SSE-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2 -; SSE-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2 -; SSE-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2 -; SSE-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2 -; SSE-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2 -; SSE-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2 -; SSE-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2 -; 
SSE-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2 -; SSE-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2 -; SSE-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2 -; SSE-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2 -; SSE-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2 -; SSE-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2 -; SSE-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2 -; SSE-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2 -; SSE-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2 -; SSE-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2 -; SSE-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2 -; SSE-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2 -; SSE-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2 -; SSE-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2 -; SSE-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2 -; SSE-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2 -; SSE-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2 -; SSE-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2 -; SSE-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2 -; SSE-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2 -; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2 -; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2 -; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2 -; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2 -; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), 
align 2 +; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP5]], i32 0 +; SSE-NEXT: [[R0:%.*]] = lshr i16 [[TMP9]], [[TMP10]] +; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = extractelement <8 x i16> [[TMP5]], i32 1 +; SSE-NEXT: [[R1:%.*]] = lshr i16 [[TMP11]], [[TMP12]] +; SSE-NEXT: [[TMP13:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[TMP14:%.*]] = extractelement <8 x i16> [[TMP5]], i32 2 +; SSE-NEXT: [[R2:%.*]] = lshr i16 [[TMP13]], [[TMP14]] +; SSE-NEXT: [[TMP15:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[TMP5]], i32 3 +; SSE-NEXT: [[R3:%.*]] = lshr i16 [[TMP15]], [[TMP16]] +; SSE-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4 +; SSE-NEXT: [[TMP18:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4 +; SSE-NEXT: [[R4:%.*]] = lshr i16 [[TMP17]], [[TMP18]] +; SSE-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 +; SSE-NEXT: [[TMP20:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5 +; SSE-NEXT: [[R5:%.*]] = lshr i16 [[TMP19]], [[TMP20]] +; SSE-NEXT: [[TMP21:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 +; SSE-NEXT: [[TMP22:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6 +; SSE-NEXT: [[R6:%.*]] = lshr i16 [[TMP21]], [[TMP22]] +; SSE-NEXT: [[TMP23:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +; SSE-NEXT: [[TMP24:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7 +; SSE-NEXT: [[R7:%.*]] = lshr i16 [[TMP23]], [[TMP24]] +; SSE-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; SSE-NEXT: [[R8:%.*]] = lshr i16 [[TMP25]], [[TMP26]] +; SSE-NEXT: [[TMP27:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 1 +; SSE-NEXT: [[R9:%.*]] = lshr i16 [[TMP27]], [[TMP28]] +; SSE-NEXT: [[TMP29:%.*]] = extractelement <8 x i16> [[TMP2]], i32 2 +; SSE-NEXT: [[TMP30:%.*]] = extractelement <8 x i16> [[TMP6]], i32 2 +; SSE-NEXT: [[R10:%.*]] = lshr i16 [[TMP29]], [[TMP30]] +; SSE-NEXT: [[TMP31:%.*]] = extractelement <8 x i16> [[TMP2]], i32 3 +; SSE-NEXT: [[TMP32:%.*]] = extractelement <8 x i16> [[TMP6]], i32 3 +; SSE-NEXT: [[R11:%.*]] = lshr i16 [[TMP31]], [[TMP32]] +; SSE-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4 +; SSE-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4 +; SSE-NEXT: [[R12:%.*]] = lshr i16 [[TMP33]], [[TMP34]] +; SSE-NEXT: [[TMP35:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5 +; SSE-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5 +; SSE-NEXT: [[R13:%.*]] = lshr i16 [[TMP35]], [[TMP36]] +; SSE-NEXT: [[TMP37:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6 +; SSE-NEXT: [[TMP38:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6 +; SSE-NEXT: [[R14:%.*]] = lshr i16 [[TMP37]], [[TMP38]] +; SSE-NEXT: [[TMP39:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +; SSE-NEXT: [[TMP40:%.*]] = extractelement <8 x 
i16> [[TMP6]], i32 7 +; SSE-NEXT: [[R15:%.*]] = lshr i16 [[TMP39]], [[TMP40]] +; SSE-NEXT: [[TMP41:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP42:%.*]] = extractelement <8 x i16> [[TMP7]], i32 0 +; SSE-NEXT: [[R16:%.*]] = lshr i16 [[TMP41]], [[TMP42]] +; SSE-NEXT: [[TMP43:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP44:%.*]] = extractelement <8 x i16> [[TMP7]], i32 1 +; SSE-NEXT: [[R17:%.*]] = lshr i16 [[TMP43]], [[TMP44]] +; SSE-NEXT: [[TMP45:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[TMP46:%.*]] = extractelement <8 x i16> [[TMP7]], i32 2 +; SSE-NEXT: [[R18:%.*]] = lshr i16 [[TMP45]], [[TMP46]] +; SSE-NEXT: [[TMP47:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[TMP48:%.*]] = extractelement <8 x i16> [[TMP7]], i32 3 +; SSE-NEXT: [[R19:%.*]] = lshr i16 [[TMP47]], [[TMP48]] +; SSE-NEXT: [[TMP49:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; SSE-NEXT: [[TMP50:%.*]] = extractelement <8 x i16> [[TMP7]], i32 4 +; SSE-NEXT: [[R20:%.*]] = lshr i16 [[TMP49]], [[TMP50]] +; SSE-NEXT: [[TMP51:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; SSE-NEXT: [[TMP52:%.*]] = extractelement <8 x i16> [[TMP7]], i32 5 +; SSE-NEXT: [[R21:%.*]] = lshr i16 [[TMP51]], [[TMP52]] +; SSE-NEXT: [[TMP53:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; SSE-NEXT: [[TMP54:%.*]] = extractelement <8 x i16> [[TMP7]], i32 6 +; SSE-NEXT: [[R22:%.*]] = lshr i16 [[TMP53]], [[TMP54]] +; SSE-NEXT: [[TMP55:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; SSE-NEXT: [[TMP56:%.*]] = extractelement <8 x i16> [[TMP7]], i32 7 +; SSE-NEXT: [[R23:%.*]] = lshr i16 [[TMP55]], [[TMP56]] +; SSE-NEXT: [[TMP57:%.*]] = extractelement <8 x i16> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP58:%.*]] = extractelement <8 x i16> [[TMP8]], i32 0 +; SSE-NEXT: [[R24:%.*]] = lshr i16 [[TMP57]], [[TMP58]] +; SSE-NEXT: [[TMP59:%.*]] = extractelement <8 x i16> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP60:%.*]] = extractelement <8 x i16> [[TMP8]], i32 1 +; SSE-NEXT: [[R25:%.*]] = lshr i16 [[TMP59]], [[TMP60]] +; SSE-NEXT: [[TMP61:%.*]] = extractelement <8 x i16> [[TMP4]], i32 2 +; SSE-NEXT: [[TMP62:%.*]] = extractelement <8 x i16> [[TMP8]], i32 2 +; SSE-NEXT: [[R26:%.*]] = lshr i16 [[TMP61]], [[TMP62]] +; SSE-NEXT: [[TMP63:%.*]] = extractelement <8 x i16> [[TMP4]], i32 3 +; SSE-NEXT: [[TMP64:%.*]] = extractelement <8 x i16> [[TMP8]], i32 3 +; SSE-NEXT: [[R27:%.*]] = lshr i16 [[TMP63]], [[TMP64]] +; SSE-NEXT: [[TMP65:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4 +; SSE-NEXT: [[TMP66:%.*]] = extractelement <8 x i16> [[TMP8]], i32 4 +; SSE-NEXT: [[R28:%.*]] = lshr i16 [[TMP65]], [[TMP66]] +; SSE-NEXT: [[TMP67:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5 +; SSE-NEXT: [[TMP68:%.*]] = extractelement <8 x i16> [[TMP8]], i32 5 +; SSE-NEXT: [[R29:%.*]] = lshr i16 [[TMP67]], [[TMP68]] +; SSE-NEXT: [[TMP69:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6 +; SSE-NEXT: [[TMP70:%.*]] = extractelement <8 x i16> [[TMP8]], i32 6 +; SSE-NEXT: [[R30:%.*]] = lshr i16 [[TMP69]], [[TMP70]] +; SSE-NEXT: [[TMP71:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7 +; SSE-NEXT: [[TMP72:%.*]] = extractelement <8 x i16> [[TMP8]], i32 7 +; SSE-NEXT: [[R31:%.*]] = lshr i16 [[TMP71]], [[TMP72]] +; SSE-NEXT: [[TMP73:%.*]] = insertelement <8 x i16> poison, i16 [[R0]], i32 0 +; SSE-NEXT: [[TMP74:%.*]] = insertelement <8 x i16> [[TMP73]], i16 [[R1]], i32 1 +; SSE-NEXT: [[TMP75:%.*]] = insertelement <8 x i16> [[TMP74]], i16 [[R2]], i32 2 +; SSE-NEXT: [[TMP76:%.*]] = insertelement <8 x i16> [[TMP75]], i16 [[R3]], i32 3 
+; SSE-NEXT: [[TMP77:%.*]] = insertelement <8 x i16> [[TMP76]], i16 [[R4]], i32 4 +; SSE-NEXT: [[TMP78:%.*]] = insertelement <8 x i16> [[TMP77]], i16 [[R5]], i32 5 +; SSE-NEXT: [[TMP79:%.*]] = insertelement <8 x i16> [[TMP78]], i16 [[R6]], i32 6 +; SSE-NEXT: [[TMP80:%.*]] = insertelement <8 x i16> [[TMP79]], i16 [[R7]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP80]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP81:%.*]] = insertelement <8 x i16> poison, i16 [[R8]], i32 0 +; SSE-NEXT: [[TMP82:%.*]] = insertelement <8 x i16> [[TMP81]], i16 [[R9]], i32 1 +; SSE-NEXT: [[TMP83:%.*]] = insertelement <8 x i16> [[TMP82]], i16 [[R10]], i32 2 +; SSE-NEXT: [[TMP84:%.*]] = insertelement <8 x i16> [[TMP83]], i16 [[R11]], i32 3 +; SSE-NEXT: [[TMP85:%.*]] = insertelement <8 x i16> [[TMP84]], i16 [[R12]], i32 4 +; SSE-NEXT: [[TMP86:%.*]] = insertelement <8 x i16> [[TMP85]], i16 [[R13]], i32 5 +; SSE-NEXT: [[TMP87:%.*]] = insertelement <8 x i16> [[TMP86]], i16 [[R14]], i32 6 +; SSE-NEXT: [[TMP88:%.*]] = insertelement <8 x i16> [[TMP87]], i16 [[R15]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP88]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP89:%.*]] = insertelement <8 x i16> poison, i16 [[R16]], i32 0 +; SSE-NEXT: [[TMP90:%.*]] = insertelement <8 x i16> [[TMP89]], i16 [[R17]], i32 1 +; SSE-NEXT: [[TMP91:%.*]] = insertelement <8 x i16> [[TMP90]], i16 [[R18]], i32 2 +; SSE-NEXT: [[TMP92:%.*]] = insertelement <8 x i16> [[TMP91]], i16 [[R19]], i32 3 +; SSE-NEXT: [[TMP93:%.*]] = insertelement <8 x i16> [[TMP92]], i16 [[R20]], i32 4 +; SSE-NEXT: [[TMP94:%.*]] = insertelement <8 x i16> [[TMP93]], i16 [[R21]], i32 5 +; SSE-NEXT: [[TMP95:%.*]] = insertelement <8 x i16> [[TMP94]], i16 [[R22]], i32 6 +; SSE-NEXT: [[TMP96:%.*]] = insertelement <8 x i16> [[TMP95]], i16 [[R23]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP96]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP97:%.*]] = insertelement <8 x i16> poison, i16 [[R24]], i32 0 +; SSE-NEXT: [[TMP98:%.*]] = insertelement <8 x i16> [[TMP97]], i16 [[R25]], i32 1 +; SSE-NEXT: [[TMP99:%.*]] = insertelement <8 x i16> [[TMP98]], i16 [[R26]], i32 2 +; SSE-NEXT: [[TMP100:%.*]] = insertelement <8 x i16> [[TMP99]], i16 [[R27]], i32 3 +; SSE-NEXT: [[TMP101:%.*]] = insertelement <8 x i16> [[TMP100]], i16 [[R28]], i32 4 +; SSE-NEXT: [[TMP102:%.*]] = insertelement <8 x i16> [[TMP101]], i16 [[R29]], i32 5 +; SSE-NEXT: [[TMP103:%.*]] = insertelement <8 x i16> [[TMP102]], i16 [[R30]], i32 6 +; SSE-NEXT: [[TMP104:%.*]] = insertelement <8 x i16> [[TMP103]], i16 [[R31]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP104]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @lshr_v32i16( Index: llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll +++ llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll @@ -241,134 +241,146 @@ define void @shl_v32i16() { ; SSE-LABEL: @shl_v32i16( -; SSE-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2 -; SSE-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2 -; SSE-NEXT: [[A2:%.*]] = load i16, i16* getelementptr 
inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2 -; SSE-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2 -; SSE-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2 -; SSE-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2 -; SSE-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2 -; SSE-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2 -; SSE-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2 -; SSE-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2 -; SSE-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2 -; SSE-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2 -; SSE-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2 -; SSE-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2 -; SSE-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2 -; SSE-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2 -; SSE-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2 -; SSE-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2 -; SSE-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2 -; SSE-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2 -; SSE-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2 -; SSE-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2 -; SSE-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2 -; SSE-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2 -; SSE-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2 -; SSE-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2 -; SSE-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2 -; SSE-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2 -; SSE-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2 -; SSE-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2 -; SSE-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2 -; SSE-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2 -; SSE-NEXT: [[B0:%.*]] = 
load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2 -; SSE-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2 -; SSE-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2 -; SSE-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2 -; SSE-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2 -; SSE-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2 -; SSE-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2 -; SSE-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2 -; SSE-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2 -; SSE-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2 -; SSE-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2 -; SSE-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2 -; SSE-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2 -; SSE-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2 -; SSE-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2 -; SSE-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2 -; SSE-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2 -; SSE-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2 -; SSE-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2 -; SSE-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2 -; SSE-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2 -; SSE-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2 -; SSE-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2 -; SSE-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2 -; SSE-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2 -; SSE-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2 -; SSE-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2 -; SSE-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2 -; SSE-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2 -; SSE-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2 
-; SSE-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2 -; SSE-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2 -; SSE-NEXT: [[R0:%.*]] = shl i16 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = shl i16 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = shl i16 [[A2]], [[B2]] -; SSE-NEXT: [[R3:%.*]] = shl i16 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = shl i16 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = shl i16 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = shl i16 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = shl i16 [[A7]], [[B7]] -; SSE-NEXT: [[R8:%.*]] = shl i16 [[A8]], [[B8]] -; SSE-NEXT: [[R9:%.*]] = shl i16 [[A9]], [[B9]] -; SSE-NEXT: [[R10:%.*]] = shl i16 [[A10]], [[B10]] -; SSE-NEXT: [[R11:%.*]] = shl i16 [[A11]], [[B11]] -; SSE-NEXT: [[R12:%.*]] = shl i16 [[A12]], [[B12]] -; SSE-NEXT: [[R13:%.*]] = shl i16 [[A13]], [[B13]] -; SSE-NEXT: [[R14:%.*]] = shl i16 [[A14]], [[B14]] -; SSE-NEXT: [[R15:%.*]] = shl i16 [[A15]], [[B15]] -; SSE-NEXT: [[R16:%.*]] = shl i16 [[A16]], [[B16]] -; SSE-NEXT: [[R17:%.*]] = shl i16 [[A17]], [[B17]] -; SSE-NEXT: [[R18:%.*]] = shl i16 [[A18]], [[B18]] -; SSE-NEXT: [[R19:%.*]] = shl i16 [[A19]], [[B19]] -; SSE-NEXT: [[R20:%.*]] = shl i16 [[A20]], [[B20]] -; SSE-NEXT: [[R21:%.*]] = shl i16 [[A21]], [[B21]] -; SSE-NEXT: [[R22:%.*]] = shl i16 [[A22]], [[B22]] -; SSE-NEXT: [[R23:%.*]] = shl i16 [[A23]], [[B23]] -; SSE-NEXT: [[R24:%.*]] = shl i16 [[A24]], [[B24]] -; SSE-NEXT: [[R25:%.*]] = shl i16 [[A25]], [[B25]] -; SSE-NEXT: [[R26:%.*]] = shl i16 [[A26]], [[B26]] -; SSE-NEXT: [[R27:%.*]] = shl i16 [[A27]], [[B27]] -; SSE-NEXT: [[R28:%.*]] = shl i16 [[A28]], [[B28]] -; SSE-NEXT: [[R29:%.*]] = shl i16 [[A29]], [[B29]] -; SSE-NEXT: [[R30:%.*]] = shl i16 [[A30]], [[B30]] -; SSE-NEXT: [[R31:%.*]] = shl i16 [[A31]], [[B31]] -; SSE-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2 -; SSE-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2 -; SSE-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2 -; SSE-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2 -; SSE-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2 -; SSE-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2 -; SSE-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2 -; SSE-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2 -; SSE-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2 -; SSE-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2 -; SSE-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2 -; SSE-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2 -; SSE-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2 -; SSE-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2 -; SSE-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, 
i32 0, i64 14), align 2 -; SSE-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2 -; SSE-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2 -; SSE-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2 -; SSE-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2 -; SSE-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2 -; SSE-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2 -; SSE-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2 -; SSE-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2 -; SSE-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2 -; SSE-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2 -; SSE-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2 -; SSE-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2 -; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2 -; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2 -; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2 -; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2 -; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP5]], i32 0 +; SSE-NEXT: [[R0:%.*]] = shl i16 [[TMP9]], [[TMP10]] +; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = extractelement <8 x i16> [[TMP5]], i32 1 +; SSE-NEXT: [[R1:%.*]] = shl i16 [[TMP11]], [[TMP12]] +; 
SSE-NEXT: [[TMP13:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[TMP14:%.*]] = extractelement <8 x i16> [[TMP5]], i32 2 +; SSE-NEXT: [[R2:%.*]] = shl i16 [[TMP13]], [[TMP14]] +; SSE-NEXT: [[TMP15:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[TMP5]], i32 3 +; SSE-NEXT: [[R3:%.*]] = shl i16 [[TMP15]], [[TMP16]] +; SSE-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4 +; SSE-NEXT: [[TMP18:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4 +; SSE-NEXT: [[R4:%.*]] = shl i16 [[TMP17]], [[TMP18]] +; SSE-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 +; SSE-NEXT: [[TMP20:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5 +; SSE-NEXT: [[R5:%.*]] = shl i16 [[TMP19]], [[TMP20]] +; SSE-NEXT: [[TMP21:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 +; SSE-NEXT: [[TMP22:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6 +; SSE-NEXT: [[R6:%.*]] = shl i16 [[TMP21]], [[TMP22]] +; SSE-NEXT: [[TMP23:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +; SSE-NEXT: [[TMP24:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7 +; SSE-NEXT: [[R7:%.*]] = shl i16 [[TMP23]], [[TMP24]] +; SSE-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; SSE-NEXT: [[R8:%.*]] = shl i16 [[TMP25]], [[TMP26]] +; SSE-NEXT: [[TMP27:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 1 +; SSE-NEXT: [[R9:%.*]] = shl i16 [[TMP27]], [[TMP28]] +; SSE-NEXT: [[TMP29:%.*]] = extractelement <8 x i16> [[TMP2]], i32 2 +; SSE-NEXT: [[TMP30:%.*]] = extractelement <8 x i16> [[TMP6]], i32 2 +; SSE-NEXT: [[R10:%.*]] = shl i16 [[TMP29]], [[TMP30]] +; SSE-NEXT: [[TMP31:%.*]] = extractelement <8 x i16> [[TMP2]], i32 3 +; SSE-NEXT: [[TMP32:%.*]] = extractelement <8 x i16> [[TMP6]], i32 3 +; SSE-NEXT: [[R11:%.*]] = shl i16 [[TMP31]], [[TMP32]] +; SSE-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4 +; SSE-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4 +; SSE-NEXT: [[R12:%.*]] = shl i16 [[TMP33]], [[TMP34]] +; SSE-NEXT: [[TMP35:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5 +; SSE-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5 +; SSE-NEXT: [[R13:%.*]] = shl i16 [[TMP35]], [[TMP36]] +; SSE-NEXT: [[TMP37:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6 +; SSE-NEXT: [[TMP38:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6 +; SSE-NEXT: [[R14:%.*]] = shl i16 [[TMP37]], [[TMP38]] +; SSE-NEXT: [[TMP39:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +; SSE-NEXT: [[TMP40:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7 +; SSE-NEXT: [[R15:%.*]] = shl i16 [[TMP39]], [[TMP40]] +; SSE-NEXT: [[TMP41:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP42:%.*]] = extractelement <8 x i16> [[TMP7]], i32 0 +; SSE-NEXT: [[R16:%.*]] = shl i16 [[TMP41]], [[TMP42]] +; SSE-NEXT: [[TMP43:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP44:%.*]] = extractelement <8 x i16> [[TMP7]], i32 1 +; SSE-NEXT: [[R17:%.*]] = shl i16 [[TMP43]], [[TMP44]] +; SSE-NEXT: [[TMP45:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[TMP46:%.*]] = extractelement <8 x i16> [[TMP7]], i32 2 +; SSE-NEXT: [[R18:%.*]] = shl i16 [[TMP45]], [[TMP46]] +; SSE-NEXT: [[TMP47:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[TMP48:%.*]] = extractelement <8 x i16> [[TMP7]], i32 3 +; SSE-NEXT: [[R19:%.*]] = shl i16 [[TMP47]], [[TMP48]] +; SSE-NEXT: [[TMP49:%.*]] = 
extractelement <8 x i16> [[TMP3]], i32 4 +; SSE-NEXT: [[TMP50:%.*]] = extractelement <8 x i16> [[TMP7]], i32 4 +; SSE-NEXT: [[R20:%.*]] = shl i16 [[TMP49]], [[TMP50]] +; SSE-NEXT: [[TMP51:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; SSE-NEXT: [[TMP52:%.*]] = extractelement <8 x i16> [[TMP7]], i32 5 +; SSE-NEXT: [[R21:%.*]] = shl i16 [[TMP51]], [[TMP52]] +; SSE-NEXT: [[TMP53:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; SSE-NEXT: [[TMP54:%.*]] = extractelement <8 x i16> [[TMP7]], i32 6 +; SSE-NEXT: [[R22:%.*]] = shl i16 [[TMP53]], [[TMP54]] +; SSE-NEXT: [[TMP55:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; SSE-NEXT: [[TMP56:%.*]] = extractelement <8 x i16> [[TMP7]], i32 7 +; SSE-NEXT: [[R23:%.*]] = shl i16 [[TMP55]], [[TMP56]] +; SSE-NEXT: [[TMP57:%.*]] = extractelement <8 x i16> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP58:%.*]] = extractelement <8 x i16> [[TMP8]], i32 0 +; SSE-NEXT: [[R24:%.*]] = shl i16 [[TMP57]], [[TMP58]] +; SSE-NEXT: [[TMP59:%.*]] = extractelement <8 x i16> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP60:%.*]] = extractelement <8 x i16> [[TMP8]], i32 1 +; SSE-NEXT: [[R25:%.*]] = shl i16 [[TMP59]], [[TMP60]] +; SSE-NEXT: [[TMP61:%.*]] = extractelement <8 x i16> [[TMP4]], i32 2 +; SSE-NEXT: [[TMP62:%.*]] = extractelement <8 x i16> [[TMP8]], i32 2 +; SSE-NEXT: [[R26:%.*]] = shl i16 [[TMP61]], [[TMP62]] +; SSE-NEXT: [[TMP63:%.*]] = extractelement <8 x i16> [[TMP4]], i32 3 +; SSE-NEXT: [[TMP64:%.*]] = extractelement <8 x i16> [[TMP8]], i32 3 +; SSE-NEXT: [[R27:%.*]] = shl i16 [[TMP63]], [[TMP64]] +; SSE-NEXT: [[TMP65:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4 +; SSE-NEXT: [[TMP66:%.*]] = extractelement <8 x i16> [[TMP8]], i32 4 +; SSE-NEXT: [[R28:%.*]] = shl i16 [[TMP65]], [[TMP66]] +; SSE-NEXT: [[TMP67:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5 +; SSE-NEXT: [[TMP68:%.*]] = extractelement <8 x i16> [[TMP8]], i32 5 +; SSE-NEXT: [[R29:%.*]] = shl i16 [[TMP67]], [[TMP68]] +; SSE-NEXT: [[TMP69:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6 +; SSE-NEXT: [[TMP70:%.*]] = extractelement <8 x i16> [[TMP8]], i32 6 +; SSE-NEXT: [[R30:%.*]] = shl i16 [[TMP69]], [[TMP70]] +; SSE-NEXT: [[TMP71:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7 +; SSE-NEXT: [[TMP72:%.*]] = extractelement <8 x i16> [[TMP8]], i32 7 +; SSE-NEXT: [[R31:%.*]] = shl i16 [[TMP71]], [[TMP72]] +; SSE-NEXT: [[TMP73:%.*]] = insertelement <8 x i16> poison, i16 [[R0]], i32 0 +; SSE-NEXT: [[TMP74:%.*]] = insertelement <8 x i16> [[TMP73]], i16 [[R1]], i32 1 +; SSE-NEXT: [[TMP75:%.*]] = insertelement <8 x i16> [[TMP74]], i16 [[R2]], i32 2 +; SSE-NEXT: [[TMP76:%.*]] = insertelement <8 x i16> [[TMP75]], i16 [[R3]], i32 3 +; SSE-NEXT: [[TMP77:%.*]] = insertelement <8 x i16> [[TMP76]], i16 [[R4]], i32 4 +; SSE-NEXT: [[TMP78:%.*]] = insertelement <8 x i16> [[TMP77]], i16 [[R5]], i32 5 +; SSE-NEXT: [[TMP79:%.*]] = insertelement <8 x i16> [[TMP78]], i16 [[R6]], i32 6 +; SSE-NEXT: [[TMP80:%.*]] = insertelement <8 x i16> [[TMP79]], i16 [[R7]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP80]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP81:%.*]] = insertelement <8 x i16> poison, i16 [[R8]], i32 0 +; SSE-NEXT: [[TMP82:%.*]] = insertelement <8 x i16> [[TMP81]], i16 [[R9]], i32 1 +; SSE-NEXT: [[TMP83:%.*]] = insertelement <8 x i16> [[TMP82]], i16 [[R10]], i32 2 +; SSE-NEXT: [[TMP84:%.*]] = insertelement <8 x i16> [[TMP83]], i16 [[R11]], i32 3 +; SSE-NEXT: [[TMP85:%.*]] = insertelement <8 x i16> [[TMP84]], i16 [[R12]], i32 4 +; SSE-NEXT: [[TMP86:%.*]] = insertelement <8 x i16> [[TMP85]], i16 
[[R13]], i32 5 +; SSE-NEXT: [[TMP87:%.*]] = insertelement <8 x i16> [[TMP86]], i16 [[R14]], i32 6 +; SSE-NEXT: [[TMP88:%.*]] = insertelement <8 x i16> [[TMP87]], i16 [[R15]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP88]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP89:%.*]] = insertelement <8 x i16> poison, i16 [[R16]], i32 0 +; SSE-NEXT: [[TMP90:%.*]] = insertelement <8 x i16> [[TMP89]], i16 [[R17]], i32 1 +; SSE-NEXT: [[TMP91:%.*]] = insertelement <8 x i16> [[TMP90]], i16 [[R18]], i32 2 +; SSE-NEXT: [[TMP92:%.*]] = insertelement <8 x i16> [[TMP91]], i16 [[R19]], i32 3 +; SSE-NEXT: [[TMP93:%.*]] = insertelement <8 x i16> [[TMP92]], i16 [[R20]], i32 4 +; SSE-NEXT: [[TMP94:%.*]] = insertelement <8 x i16> [[TMP93]], i16 [[R21]], i32 5 +; SSE-NEXT: [[TMP95:%.*]] = insertelement <8 x i16> [[TMP94]], i16 [[R22]], i32 6 +; SSE-NEXT: [[TMP96:%.*]] = insertelement <8 x i16> [[TMP95]], i16 [[R23]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP96]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP97:%.*]] = insertelement <8 x i16> poison, i16 [[R24]], i32 0 +; SSE-NEXT: [[TMP98:%.*]] = insertelement <8 x i16> [[TMP97]], i16 [[R25]], i32 1 +; SSE-NEXT: [[TMP99:%.*]] = insertelement <8 x i16> [[TMP98]], i16 [[R26]], i32 2 +; SSE-NEXT: [[TMP100:%.*]] = insertelement <8 x i16> [[TMP99]], i16 [[R27]], i32 3 +; SSE-NEXT: [[TMP101:%.*]] = insertelement <8 x i16> [[TMP100]], i16 [[R28]], i32 4 +; SSE-NEXT: [[TMP102:%.*]] = insertelement <8 x i16> [[TMP101]], i16 [[R29]], i32 5 +; SSE-NEXT: [[TMP103:%.*]] = insertelement <8 x i16> [[TMP102]], i16 [[R30]], i32 6 +; SSE-NEXT: [[TMP104:%.*]] = insertelement <8 x i16> [[TMP103]], i16 [[R31]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP104]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @shl_v32i16( Index: llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll +++ llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll @@ -70,9 +70,10 @@ ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 ; CHECK-NEXT: [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8 ; CHECK-NEXT: [[MUL5:%.*]] = fmul double [[I3]], [[I4]] -; CHECK-NEXT: store double [[MUL]], double* [[C:%.*]], align 8 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C]], i64 1 -; CHECK-NEXT: store double [[MUL5]], double* [[ARRAYIDX5]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[MUL5]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[C:%.*]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[TMP3]], align 8 ; CHECK-NEXT: ret void ; %i0 = load volatile double, double* %a, align 8 Index: llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll +++ llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll @@ -702,33 +702,6 @@ ; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 ; SSE-NEXT: ret 
void ; -; AVX256NODQ-LABEL: @sitofp_8i64_8f32( -; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 -; AVX256NODQ-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32 -; AVX256NODQ-NEXT: [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16 -; AVX256NODQ-NEXT: [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float -; AVX256NODQ-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float -; AVX256NODQ-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to float -; AVX256NODQ-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to float -; AVX256NODQ-NEXT: [[CVT4:%.*]] = sitofp i64 [[LD4]] to float -; AVX256NODQ-NEXT: [[CVT5:%.*]] = sitofp i64 [[LD5]] to float -; AVX256NODQ-NEXT: [[CVT6:%.*]] = sitofp i64 [[LD6]] to float -; AVX256NODQ-NEXT: [[CVT7:%.*]] = sitofp i64 [[LD7]] to float -; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 -; AVX256NODQ-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; AVX256NODQ-NEXT: ret void -; ; AVX512-LABEL: @sitofp_8i64_8f32( ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 ; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float> @@ -917,18 +890,20 @@ define void @sitofp_4i16_4f32() #0 { ; SSE-LABEL: @sitofp_4i16_4f32( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to float -; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to 
float -; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to float -; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[TMP2]] to float +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[TMP3]] to float +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[TMP4]] to float +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[TMP5]] to float +; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[CVT1]], i32 1 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[CVT2]], i32 2 +; SSE-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[CVT3]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_4i16_4f32( @@ -954,30 +929,34 @@ define void @sitofp_8i16_8f32() #0 { ; SSE-LABEL: @sitofp_8i16_8f32( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8 -; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2 -; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4 -; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to float -; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to float -; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to float -; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to float -; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[LD4]] to float -; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[LD5]] to float -; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[LD6]] to float -; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[LD7]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store float [[CVT2]], float* getelementptr 
inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 -; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 -; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[TMP3]] to float +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[TMP4]] to float +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[TMP5]] to float +; SSE-NEXT: [[TMP6:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[TMP6]] to float +; SSE-NEXT: [[TMP7:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 +; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[TMP7]] to float +; SSE-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1 +; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[TMP8]] to float +; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 +; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[TMP9]] to float +; SSE-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 +; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[TMP10]] to float +; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[CVT1]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[CVT2]], i32 2 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[CVT3]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP14]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP15:%.*]] = insertelement <4 x float> poison, float [[CVT4]], i32 0 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[CVT5]], i32 1 +; SSE-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[CVT6]], i32 2 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[CVT7]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP18]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_8i16_8f32( @@ -1015,54 +994,62 @@ define void @sitofp_16i16_16f32() #0 { ; SSE-LABEL: @sitofp_16i16_16f32( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 
x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8 -; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2 -; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4 -; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2 -; SSE-NEXT: [[LD8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8), align 16 -; SSE-NEXT: [[LD9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 9), align 2 -; SSE-NEXT: [[LD10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 10), align 4 -; SSE-NEXT: [[LD11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 11), align 2 -; SSE-NEXT: [[LD12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12), align 8 -; SSE-NEXT: [[LD13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 13), align 2 -; SSE-NEXT: [[LD14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 14), align 4 -; SSE-NEXT: [[LD15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 15), align 2 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to float -; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to float -; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to float -; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to float -; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[LD4]] to float -; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[LD5]] to float -; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[LD6]] to float -; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[LD7]] to float -; SSE-NEXT: [[CVT8:%.*]] = sitofp i16 [[LD8]] to float -; SSE-NEXT: [[CVT9:%.*]] = sitofp i16 [[LD9]] to float -; SSE-NEXT: [[CVT10:%.*]] = sitofp i16 [[LD10]] to float -; SSE-NEXT: [[CVT11:%.*]] = sitofp i16 [[LD11]] to float -; SSE-NEXT: [[CVT12:%.*]] = sitofp i16 [[LD12]] to float -; SSE-NEXT: [[CVT13:%.*]] = sitofp i16 [[LD13]] to float -; SSE-NEXT: [[CVT14:%.*]] = sitofp i16 [[LD14]] to float -; SSE-NEXT: [[CVT15:%.*]] = sitofp i16 [[LD15]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 -; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 -; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE-NEXT: store float [[CVT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 32 -; SSE-NEXT: 
store float [[CVT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE-NEXT: store float [[CVT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8 -; SSE-NEXT: store float [[CVT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE-NEXT: store float [[CVT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16 -; SSE-NEXT: store float [[CVT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE-NEXT: store float [[CVT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8 -; SSE-NEXT: store float [[CVT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[TMP5]] to float +; SSE-NEXT: [[TMP6:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[TMP6]] to float +; SSE-NEXT: [[TMP7:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[TMP7]] to float +; SSE-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[TMP8]] to float +; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 +; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[TMP9]] to float +; SSE-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1 +; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[TMP10]] to float +; SSE-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 +; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[TMP11]] to float +; SSE-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 +; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[TMP12]] to float +; SSE-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[CVT8:%.*]] = sitofp i16 [[TMP13]] to float +; SSE-NEXT: [[TMP14:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[CVT9:%.*]] = sitofp i16 [[TMP14]] to float +; SSE-NEXT: [[TMP15:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[CVT10:%.*]] = sitofp i16 [[TMP15]] to float +; SSE-NEXT: [[TMP16:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[CVT11:%.*]] = sitofp i16 [[TMP16]] to float +; SSE-NEXT: [[TMP17:%.*]] = extractelement <4 x i16> [[TMP4]], i32 0 +; SSE-NEXT: [[CVT12:%.*]] = sitofp i16 [[TMP17]] to float +; SSE-NEXT: [[TMP18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 1 +; SSE-NEXT: [[CVT13:%.*]] = sitofp i16 [[TMP18]] to float +; SSE-NEXT: [[TMP19:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; SSE-NEXT: [[CVT14:%.*]] = sitofp i16 [[TMP19]] to float +; SSE-NEXT: [[TMP20:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 +; SSE-NEXT: [[CVT15:%.*]] = sitofp i16 [[TMP20]] to float +; SSE-NEXT: [[TMP21:%.*]] = 
insertelement <4 x float> poison, float [[CVT0]], i32 0 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[CVT1]], i32 1 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[CVT2]], i32 2 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[CVT3]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP24]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> poison, float [[CVT4]], i32 0 +; SSE-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[CVT5]], i32 1 +; SSE-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[CVT6]], i32 2 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[TMP27]], float [[CVT7]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP28]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 +; SSE-NEXT: [[TMP29:%.*]] = insertelement <4 x float> poison, float [[CVT8]], i32 0 +; SSE-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[CVT9]], i32 1 +; SSE-NEXT: [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[CVT10]], i32 2 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[CVT11]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP32]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP33:%.*]] = insertelement <4 x float> poison, float [[CVT12]], i32 0 +; SSE-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[CVT13]], i32 1 +; SSE-NEXT: [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[CVT14]], i32 2 +; SSE-NEXT: [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[CVT15]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP36]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_16i16_16f32( Index: llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll +++ llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll @@ -702,33 +702,6 @@ ; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 ; SSE-NEXT: ret void ; -; AVX256NODQ-LABEL: @sitofp_8i64_8f32( -; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 -; AVX256NODQ-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32 -; AVX256NODQ-NEXT: [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16 -; AVX256NODQ-NEXT: [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = sitofp 
i64 [[LD0]] to float -; AVX256NODQ-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float -; AVX256NODQ-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to float -; AVX256NODQ-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to float -; AVX256NODQ-NEXT: [[CVT4:%.*]] = sitofp i64 [[LD4]] to float -; AVX256NODQ-NEXT: [[CVT5:%.*]] = sitofp i64 [[LD5]] to float -; AVX256NODQ-NEXT: [[CVT6:%.*]] = sitofp i64 [[LD6]] to float -; AVX256NODQ-NEXT: [[CVT7:%.*]] = sitofp i64 [[LD7]] to float -; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 -; AVX256NODQ-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; AVX256NODQ-NEXT: ret void -; ; AVX512-LABEL: @sitofp_8i64_8f32( ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 ; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float> @@ -917,18 +890,20 @@ define void @sitofp_4i16_4f32() #0 { ; SSE-LABEL: @sitofp_4i16_4f32( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to float -; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to float -; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to float -; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[TMP2]] to float +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[TMP3]] to float +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[TMP4]] to float 
+; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[TMP5]] to float +; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[CVT1]], i32 1 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[CVT2]], i32 2 +; SSE-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[CVT3]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_4i16_4f32( @@ -954,30 +929,34 @@ define void @sitofp_8i16_8f32() #0 { ; SSE-LABEL: @sitofp_8i16_8f32( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8 -; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2 -; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4 -; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to float -; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to float -; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to float -; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to float -; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[LD4]] to float -; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[LD5]] to float -; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[LD6]] to float -; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[LD7]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 -; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 -; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[TMP3]] to float +; SSE-NEXT: 
[[TMP4:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[TMP4]] to float +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[TMP5]] to float +; SSE-NEXT: [[TMP6:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[TMP6]] to float +; SSE-NEXT: [[TMP7:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 +; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[TMP7]] to float +; SSE-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1 +; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[TMP8]] to float +; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 +; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[TMP9]] to float +; SSE-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 +; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[TMP10]] to float +; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[CVT1]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[CVT2]], i32 2 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[CVT3]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP14]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP15:%.*]] = insertelement <4 x float> poison, float [[CVT4]], i32 0 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[CVT5]], i32 1 +; SSE-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[CVT6]], i32 2 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[CVT7]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP18]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_8i16_8f32( @@ -1015,54 +994,62 @@ define void @sitofp_16i16_16f32() #0 { ; SSE-LABEL: @sitofp_16i16_16f32( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8 -; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2 -; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4 -; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2 -; SSE-NEXT: [[LD8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8), align 16 -; SSE-NEXT: [[LD9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 9), align 2 -; SSE-NEXT: [[LD10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 10), align 4 -; SSE-NEXT: [[LD11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 11), align 2 -; SSE-NEXT: [[LD12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* 
@src16, i32 0, i64 12), align 8 -; SSE-NEXT: [[LD13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 13), align 2 -; SSE-NEXT: [[LD14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 14), align 4 -; SSE-NEXT: [[LD15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 15), align 2 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to float -; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to float -; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to float -; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to float -; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[LD4]] to float -; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[LD5]] to float -; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[LD6]] to float -; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[LD7]] to float -; SSE-NEXT: [[CVT8:%.*]] = sitofp i16 [[LD8]] to float -; SSE-NEXT: [[CVT9:%.*]] = sitofp i16 [[LD9]] to float -; SSE-NEXT: [[CVT10:%.*]] = sitofp i16 [[LD10]] to float -; SSE-NEXT: [[CVT11:%.*]] = sitofp i16 [[LD11]] to float -; SSE-NEXT: [[CVT12:%.*]] = sitofp i16 [[LD12]] to float -; SSE-NEXT: [[CVT13:%.*]] = sitofp i16 [[LD13]] to float -; SSE-NEXT: [[CVT14:%.*]] = sitofp i16 [[LD14]] to float -; SSE-NEXT: [[CVT15:%.*]] = sitofp i16 [[LD15]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 -; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 -; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE-NEXT: store float [[CVT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 32 -; SSE-NEXT: store float [[CVT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE-NEXT: store float [[CVT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8 -; SSE-NEXT: store float [[CVT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE-NEXT: store float [[CVT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16 -; SSE-NEXT: store float [[CVT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE-NEXT: store float [[CVT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8 -; SSE-NEXT: store float [[CVT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* 
@src16, i32 0, i64 4) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[TMP5]] to float +; SSE-NEXT: [[TMP6:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[TMP6]] to float +; SSE-NEXT: [[TMP7:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[TMP7]] to float +; SSE-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[TMP8]] to float +; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 +; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[TMP9]] to float +; SSE-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1 +; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[TMP10]] to float +; SSE-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 +; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[TMP11]] to float +; SSE-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 +; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[TMP12]] to float +; SSE-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[CVT8:%.*]] = sitofp i16 [[TMP13]] to float +; SSE-NEXT: [[TMP14:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[CVT9:%.*]] = sitofp i16 [[TMP14]] to float +; SSE-NEXT: [[TMP15:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[CVT10:%.*]] = sitofp i16 [[TMP15]] to float +; SSE-NEXT: [[TMP16:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[CVT11:%.*]] = sitofp i16 [[TMP16]] to float +; SSE-NEXT: [[TMP17:%.*]] = extractelement <4 x i16> [[TMP4]], i32 0 +; SSE-NEXT: [[CVT12:%.*]] = sitofp i16 [[TMP17]] to float +; SSE-NEXT: [[TMP18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 1 +; SSE-NEXT: [[CVT13:%.*]] = sitofp i16 [[TMP18]] to float +; SSE-NEXT: [[TMP19:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; SSE-NEXT: [[CVT14:%.*]] = sitofp i16 [[TMP19]] to float +; SSE-NEXT: [[TMP20:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 +; SSE-NEXT: [[CVT15:%.*]] = sitofp i16 [[TMP20]] to float +; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[CVT1]], i32 1 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[CVT2]], i32 2 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[CVT3]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP24]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> poison, float [[CVT4]], i32 0 +; SSE-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[CVT5]], i32 1 +; SSE-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[CVT6]], i32 2 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[TMP27]], float [[CVT7]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP28]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 +; SSE-NEXT: [[TMP29:%.*]] = insertelement <4 x float> poison, float [[CVT8]], i32 0 +; SSE-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], 
float [[CVT9]], i32 1 +; SSE-NEXT: [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[CVT10]], i32 2 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[CVT11]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP32]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP33:%.*]] = insertelement <4 x float> poison, float [[CVT12]], i32 0 +; SSE-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[CVT13]], i32 1 +; SSE-NEXT: [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[CVT14]], i32 2 +; SSE-NEXT: [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[CVT15]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP36]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_16i16_16f32( Index: llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll +++ llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll @@ -5,18 +5,20 @@ ; CHECK-LABEL: @rftbsub( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = or i64 2, 1 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8 -; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP2]], undef +; CHECK-NEXT: [[TMP0:%.*]] = or i64 2, 1 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP3]], undef ; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[ADD16]] ; CHECK-NEXT: [[ADD19:%.*]] = fadd double undef, [[MUL18]] ; CHECK-NEXT: [[SUB22:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]] -; CHECK-NEXT: store double [[SUB25]], double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]] -; CHECK-NEXT: store double [[SUB29]], double* [[ARRAYIDX12]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[ADD19]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[SUB22]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8 ; CHECK-NEXT: unreachable ; entry: Index: llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll +++ llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll @@ -119,11 +119,13 @@ ; CHECK-NEXT: [[DST_ADDR_014:%.*]] = phi double* [ [[ADD_PTR4:%.*]], [[FOR_BODY]] ], [ [[DST:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[SRC_ADDR_013:%.*]] = phi double* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[SRC:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = load double, double* 
[[SRC_ADDR_013]], align 8 -; CHECK-NEXT: store double [[TMP0]], double* [[DST_ADDR_014]], align 8 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[SRC_ADDR_013]], i64 2 ; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX2]], align 8 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[DST_ADDR_014]], i64 1 -; CHECK-NEXT: store double [[TMP1]], double* [[ARRAYIDX3]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[DST_ADDR_014]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds double, double* [[SRC_ADDR_013]], i64 [[I_015]] ; CHECK-NEXT: [[ADD_PTR4]] = getelementptr inbounds double, double* [[DST_ADDR_014]], i64 [[I_015]] ; CHECK-NEXT: [[INC]] = add i64 [[I_015]], 1 @@ -166,19 +168,21 @@ ; CHECK-NEXT: [[DST_ADDR_022:%.*]] = phi float* [ [[ADD_PTR8:%.*]], [[FOR_BODY]] ], [ [[DST:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[SRC_ADDR_021:%.*]] = phi float* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[SRC:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC_ADDR_021]], align 4 -; CHECK-NEXT: store float [[TMP0]], float* [[DST_ADDR_022]], align 4 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 4 ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 1 -; CHECK-NEXT: store float [[TMP1]], float* [[ARRAYIDX3]], align 4 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 2 ; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX4]], align 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 2 -; CHECK-NEXT: store float [[TMP2]], float* [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 3 ; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX6]], align 4 ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3 -; CHECK-NEXT: store float [[TMP3]], float* [[ARRAYIDX7]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i32 3 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[DST_ADDR_022]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 [[I_023]] ; CHECK-NEXT: [[ADD_PTR8]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 [[I_023]] ; CHECK-NEXT: [[INC]] = add i64 [[I_023]], 1 Index: llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll +++ llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll @@ -654,33 +654,6 @@ ; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 ; SSE-NEXT: ret void ; -; AVX256NODQ-LABEL: 
@uitofp_8i64_8f32( -; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 -; AVX256NODQ-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32 -; AVX256NODQ-NEXT: [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16 -; AVX256NODQ-NEXT: [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float -; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float -; AVX256NODQ-NEXT: [[CVT2:%.*]] = uitofp i64 [[LD2]] to float -; AVX256NODQ-NEXT: [[CVT3:%.*]] = uitofp i64 [[LD3]] to float -; AVX256NODQ-NEXT: [[CVT4:%.*]] = uitofp i64 [[LD4]] to float -; AVX256NODQ-NEXT: [[CVT5:%.*]] = uitofp i64 [[LD5]] to float -; AVX256NODQ-NEXT: [[CVT6:%.*]] = uitofp i64 [[LD6]] to float -; AVX256NODQ-NEXT: [[CVT7:%.*]] = uitofp i64 [[LD7]] to float -; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 -; AVX256NODQ-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; AVX256NODQ-NEXT: ret void -; ; AVX512-LABEL: @uitofp_8i64_8f32( ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 ; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float> @@ -869,18 +842,20 @@ define void @uitofp_4i16_4f32() #0 { ; SSE-LABEL: @uitofp_4i16_4f32( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to float -; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to float -; SSE-NEXT: [[CVT2:%.*]] 
= uitofp i16 [[LD2]] to float -; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[LD3]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[TMP2]] to float +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[TMP3]] to float +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[TMP4]] to float +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[TMP5]] to float +; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[CVT1]], i32 1 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[CVT2]], i32 2 +; SSE-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[CVT3]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 ; SSE-NEXT: ret void ; ; AVX-LABEL: @uitofp_4i16_4f32( @@ -906,30 +881,34 @@ define void @uitofp_8i16_8f32() #0 { ; SSE-LABEL: @uitofp_8i16_8f32( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8 -; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2 -; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4 -; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2 -; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to float -; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to float -; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[LD2]] to float -; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[LD3]] to float -; SSE-NEXT: [[CVT4:%.*]] = uitofp i16 [[LD4]] to float -; SSE-NEXT: [[CVT5:%.*]] = uitofp i16 [[LD5]] to float -; SSE-NEXT: [[CVT6:%.*]] = uitofp i16 [[LD6]] to float -; SSE-NEXT: [[CVT7:%.*]] = uitofp i16 [[LD7]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x 
float]* @dst32, i32 0, i64 2), align 8 -; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 -; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 -; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[TMP3]] to float +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[TMP4]] to float +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[TMP5]] to float +; SSE-NEXT: [[TMP6:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[TMP6]] to float +; SSE-NEXT: [[TMP7:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 +; SSE-NEXT: [[CVT4:%.*]] = uitofp i16 [[TMP7]] to float +; SSE-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1 +; SSE-NEXT: [[CVT5:%.*]] = uitofp i16 [[TMP8]] to float +; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 +; SSE-NEXT: [[CVT6:%.*]] = uitofp i16 [[TMP9]] to float +; SSE-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 +; SSE-NEXT: [[CVT7:%.*]] = uitofp i16 [[TMP10]] to float +; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[CVT1]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[CVT2]], i32 2 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[CVT3]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP14]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP15:%.*]] = insertelement <4 x float> poison, float [[CVT4]], i32 0 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[CVT5]], i32 1 +; SSE-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[CVT6]], i32 2 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[CVT7]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP18]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @uitofp_8i16_8f32( @@ -967,54 +946,62 @@ define void @uitofp_16i16_16f32() #0 { ; SSE-LABEL: @uitofp_16i16_16f32( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), 
align 2 -; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8 -; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2 -; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4 -; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2 -; SSE-NEXT: [[LD8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8), align 16 -; SSE-NEXT: [[LD9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 9), align 2 -; SSE-NEXT: [[LD10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 10), align 4 -; SSE-NEXT: [[LD11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 11), align 2 -; SSE-NEXT: [[LD12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12), align 8 -; SSE-NEXT: [[LD13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 13), align 2 -; SSE-NEXT: [[LD14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 14), align 4 -; SSE-NEXT: [[LD15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 15), align 2 -; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to float -; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to float -; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[LD2]] to float -; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[LD3]] to float -; SSE-NEXT: [[CVT4:%.*]] = uitofp i16 [[LD4]] to float -; SSE-NEXT: [[CVT5:%.*]] = uitofp i16 [[LD5]] to float -; SSE-NEXT: [[CVT6:%.*]] = uitofp i16 [[LD6]] to float -; SSE-NEXT: [[CVT7:%.*]] = uitofp i16 [[LD7]] to float -; SSE-NEXT: [[CVT8:%.*]] = uitofp i16 [[LD8]] to float -; SSE-NEXT: [[CVT9:%.*]] = uitofp i16 [[LD9]] to float -; SSE-NEXT: [[CVT10:%.*]] = uitofp i16 [[LD10]] to float -; SSE-NEXT: [[CVT11:%.*]] = uitofp i16 [[LD11]] to float -; SSE-NEXT: [[CVT12:%.*]] = uitofp i16 [[LD12]] to float -; SSE-NEXT: [[CVT13:%.*]] = uitofp i16 [[LD13]] to float -; SSE-NEXT: [[CVT14:%.*]] = uitofp i16 [[LD14]] to float -; SSE-NEXT: [[CVT15:%.*]] = uitofp i16 [[LD15]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 -; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 -; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE-NEXT: store float [[CVT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 32 -; SSE-NEXT: store float [[CVT9]], float* 
getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE-NEXT: store float [[CVT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8 -; SSE-NEXT: store float [[CVT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE-NEXT: store float [[CVT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16 -; SSE-NEXT: store float [[CVT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE-NEXT: store float [[CVT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8 -; SSE-NEXT: store float [[CVT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[TMP5]] to float +; SSE-NEXT: [[TMP6:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[TMP6]] to float +; SSE-NEXT: [[TMP7:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[TMP7]] to float +; SSE-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[TMP8]] to float +; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 +; SSE-NEXT: [[CVT4:%.*]] = uitofp i16 [[TMP9]] to float +; SSE-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1 +; SSE-NEXT: [[CVT5:%.*]] = uitofp i16 [[TMP10]] to float +; SSE-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 +; SSE-NEXT: [[CVT6:%.*]] = uitofp i16 [[TMP11]] to float +; SSE-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 +; SSE-NEXT: [[CVT7:%.*]] = uitofp i16 [[TMP12]] to float +; SSE-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[CVT8:%.*]] = uitofp i16 [[TMP13]] to float +; SSE-NEXT: [[TMP14:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[CVT9:%.*]] = uitofp i16 [[TMP14]] to float +; SSE-NEXT: [[TMP15:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[CVT10:%.*]] = uitofp i16 [[TMP15]] to float +; SSE-NEXT: [[TMP16:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[CVT11:%.*]] = uitofp i16 [[TMP16]] to float +; SSE-NEXT: [[TMP17:%.*]] = extractelement <4 x i16> [[TMP4]], i32 0 +; SSE-NEXT: [[CVT12:%.*]] = uitofp i16 [[TMP17]] to float +; SSE-NEXT: [[TMP18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 1 +; SSE-NEXT: [[CVT13:%.*]] = uitofp i16 [[TMP18]] to float +; SSE-NEXT: [[TMP19:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; SSE-NEXT: [[CVT14:%.*]] = uitofp i16 [[TMP19]] to float +; SSE-NEXT: [[TMP20:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 +; SSE-NEXT: [[CVT15:%.*]] = uitofp i16 [[TMP20]] to float +; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> poison, 
float [[CVT0]], i32 0 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[CVT1]], i32 1 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[CVT2]], i32 2 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[CVT3]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP24]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> poison, float [[CVT4]], i32 0 +; SSE-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[CVT5]], i32 1 +; SSE-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[CVT6]], i32 2 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[TMP27]], float [[CVT7]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP28]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 +; SSE-NEXT: [[TMP29:%.*]] = insertelement <4 x float> poison, float [[CVT8]], i32 0 +; SSE-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[CVT9]], i32 1 +; SSE-NEXT: [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[CVT10]], i32 2 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[CVT11]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP32]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP33:%.*]] = insertelement <4 x float> poison, float [[CVT12]], i32 0 +; SSE-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[CVT13]], i32 1 +; SSE-NEXT: [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[CVT14]], i32 2 +; SSE-NEXT: [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[CVT15]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP36]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @uitofp_16i16_16f32( Index: llvm/test/Transforms/SLPVectorizer/X86/unreachable.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/unreachable.ll +++ llvm/test/Transforms/SLPVectorizer/X86/unreachable.ll @@ -27,13 +27,15 @@ ; CHECK-NEXT: [[T2_0:%.*]] = phi i32 [ [[T6]], [[BB1]] ], [ 2, [[ENTRY]] ] ; CHECK-NEXT: [[T3_0:%.*]] = phi i32 [ [[T8]], [[BB1]] ], [ 2, [[ENTRY]] ] ; CHECK-NEXT: [[T4_0:%.*]] = phi i32 [ [[T10]], [[BB1]] ], [ 2, [[ENTRY]] ] -; CHECK-NEXT: store i32 [[T1_0]], i32* [[X]], align 4 ; CHECK-NEXT: [[T12:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1 -; CHECK-NEXT: store i32 [[T2_0]], i32* [[T12]], align 4 ; CHECK-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 2 -; CHECK-NEXT: store i32 [[T3_0]], i32* [[T13]], align 4 ; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3 -; CHECK-NEXT: store i32 [[T4_0]], i32* [[T14]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[T1_0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[T2_0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T3_0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T4_0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[X]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4 ; CHECK-NEXT: ret void ; entry: