Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -92,6 +92,7 @@
 #include <cstdint>
 #include <iterator>
 #include <memory>
+#include <queue>
 #include <set>
 #include <string>
 #include <tuple>
@@ -124,6 +125,15 @@
     cl::desc(
         "Attempt to vectorize horizontal reductions feeding into a store"));
 
+static cl::opt<unsigned>
+    SLPThrottleBudget("slp-throttling-budget", cl::init(32), cl::Hidden,
+                      cl::desc("Limit the total number of nodes for cost "
+                               "recalculations during throttling"));
+
+static cl::opt<unsigned> MinVecNodes(
+    "slp-throttling-min-vec", cl::init(2), cl::Hidden,
+    cl::desc("Minimal number of vectorizable nodes while throttling"));
+
 static cl::opt<int>
 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
     cl::desc("Attempt to vectorize for this register size in bits"));
@@ -595,11 +605,63 @@
 
   /// \returns the cost incurred by unwanted spills and fills, caused by
   /// holding live values over call sites.
-  InstructionCost getSpillCost() const;
+  InstructionCost getSpillCost();
+
+  /// \returns the cost of extracting vectorized elements.
+  InstructionCost getExtractCost() const;
+
+  /// \returns the cost of gathering canceled elements to be used
+  /// by vectorized operations during throttling.
+  InstructionCost getInsertCost();
+
+  struct TECostComparator { // Ordering used by TEVecQueue.
+    bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
+      return LHS->Cost > RHS->Cost; // Lowest-cost entry ends up on top.
+    }
+  };
+  using TEVecQueue = std::priority_queue<TreeEntry *, std::vector<TreeEntry *>,
+                                         TECostComparator>;
+
+  /// Find a subtree of the whole tree suitable to be vectorized. When
+  /// vectorizing the whole tree is not profitable, we can consider vectorizing
+  /// part of that tree. The SLP algorithm looks for operations to vectorize
+  /// starting from the seed instructions at the bottom and following the
+  /// chains of dependencies up to the top of the SLP graph; it groups
+  /// potentially vectorizable scalar operations into bundles.
+  /// For example:
+  ///
+  ///   <bundle 1> vector form
+  ///      |
+  ///   <bundle 2> vector form  <bundle 3> vector form
+  ///       \                    /
+  ///        <seed root bundle> vector form
+  ///
+  /// Total cost is not profitable to vectorize, hence all operations are in
+  /// scalar form.
+  ///
+  /// Here is the same tree after SLP throttling transformation:
+  ///
+  ///   <bundle 1> vector form
+  ///      |
+  ///   <bundle 2> vector form  <bundle 3> gathered nodes
+  ///       \                    /
+  ///        <seed root bundle> vector form
+  ///
+  /// So, we can throttle some operations in such a way that it is still
+  /// profitable to vectorize part of the tree, while vectorizing the whole
+  /// tree does not make sense.
+  /// More details:
+  /// https://www.cl.cam.ac.uk/~tmj32/papers/docs/porpodas15-pact.pdf
+  bool findSubTree(std::vector<TreeEntry *> &Vec, unsigned &VectNodes,
+                   InstructionCost TreeCost, InstructionCost UserCost);
+
+  /// Get raw summary of all elements of the tree.
+  InstructionCost getRawTreeCost();
 
   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
-  InstructionCost getTreeCost();
+  InstructionCost getTreeCost(bool TreeReduce = false,
+                              InstructionCost UserCost = 0);
 
   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
@@ -620,6 +682,8 @@
     ScalarToTreeEntry.clear();
     MustGather.clear();
     ExternalUses.clear();
+    InternalTreeUses.clear();
+    ProposedToGather.clear();
     NumOpsWantToKeepOrder.clear();
     NumOpsWantToKeepOriginalOrder = 0;
     for (auto &Iter : BlocksSchedules) {
@@ -628,6 +692,9 @@
     }
     MinBWs.clear();
     InstrElementSize.clear();
+    NoCallInst = true;
+    RawTreeCost = 0;
+    IsCostSumReady = false;
   }
 
   unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -790,6 +857,9 @@
   ///       may not be necessary.
   bool isLoadCombineCandidate() const;
 
+  /// Cut the tree to make it partially vectorizable.
+  void cutTree();
+
   OptimizationRemarkEmitter *getORE() { return ORE; }
 
   /// This structure holds any data we need about the edges being traversed
@@ -1606,6 +1676,9 @@
     /// Does this entry require reordering?
     SmallVector<unsigned, 4> ReorderIndices;
 
+    /// Cost of this tree entry.
+    InstructionCost Cost = 0;
+
     /// Points back to the VectorizableTree.
     ///
     /// Only used for Graphviz right now.  Unfortunately GraphTrait::NodeRef has
@@ -1618,6 +1691,9 @@
     /// have multiple users so the data structure is not truly a tree.
     SmallVector<EdgeInfo, 1> UserTreeIndices;
 
+    /// Tree entries used by this entry as operands.
+    TinyPtrVector<TreeEntry *> UseEntries;
+
     /// The index of this treeEntry in VectorizableTree.
     int Idx = -1;
 
@@ -1850,8 +1926,10 @@
       MustGather.insert(VL.begin(), VL.end());
     }
 
-    if (UserTreeIdx.UserTE)
+    if (UserTreeIdx.UserTE) {
       Last->UserTreeIndices.push_back(UserTreeIdx);
+      VectorizableTree[UserTreeIdx.UserTE->Idx]->UseEntries.push_back(Last);
+    }
 
     return Last;
   }
@@ -1901,6 +1979,9 @@
   };
   using UserList = SmallVector<ExternalUser, 16>;
 
+  /// \returns the cost of extracting the vectorized elements.
+  InstructionCost getExtractOperationCost(const ExternalUser &EU) const;
+
   /// Checks if two instructions may access the same memory.
   ///
   /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
@@ -1951,6 +2032,25 @@
   /// after vectorization.
   UserList ExternalUses;
 
+  /// Tree entries that should not be vectorized due to throttling.
+  SmallPtrSet<TreeEntry *, 2> ProposedToGather;
+
+  /// Raw cost of all elements in the tree.
+  InstructionCost RawTreeCost = 0;
+
+  /// Indicate that no CallInst found in the tree and we don't need to
+  /// calculate spill cost.
+  bool NoCallInst = true;
+
+  /// True, if we have calculated the tree cost for the tree.
+  bool IsCostSumReady = false;
+
+  /// Current operations width to vectorize.
+  unsigned BundleWidth = 0;
+
+  /// In-tree uses of values proposed to be vectorized, keyed by the user.
+  SmallDenseMap<Value *, UserList> InternalTreeUses;
+
   /// Values used only by @llvm.assume calls.
   SmallPtrSet<const Value *, 32> EphValues;
 
@@ -2293,6 +2393,9 @@
     /// Sets all instruction in the scheduling region to un-scheduled.
     void resetSchedule();
 
+    /// Make the scheduling region smaller.
+    void reduceSchedulingRegion(Instruction *Start, Instruction *End);
+
     BasicBlock *BB;
 
     /// Simple memory allocation for ScheduleData.
@@ -2355,6 +2458,9 @@
   /// performed in a basic block.
   void scheduleBlock(BlockScheduling *BS);
 
+  /// Remove operations from the list of proposed to schedule.
+  void removeFromScheduling(BlockScheduling *BS);
+
   /// List of users to ignore during scheduling and that don't need extracting.
   ArrayRef<Value *> UserIgnoreList;
 
@@ -2569,7 +2675,7 @@
   buildTree_rec(Roots, 0, EdgeInfo());
 
   // Collect the values that we need to extract from the tree.
-  for (auto &TEPtr : VectorizableTree) {
+  for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) {
     TreeEntry *Entry = TEPtr.get();
 
     // No need to handle users of gathered values.
@@ -2602,6 +2708,7 @@
           // Some in-tree scalars will remain as scalar in vectorized
           // instructions. If that is the case, the one in Lane 0 will
           // be used.
+          InternalTreeUses[U].emplace_back(Scalar, U, FoundLane);
           if (UseScalar != U ||
               UseEntry->State == TreeEntry::ScatterVectorize ||
               !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
@@ -3328,6 +3435,50 @@
   }
 }
 
+void BoUpSLP::cutTree() {
+  // Scalars that stay vectorized but are used by a canceled operation (an
+  // entry in ProposedToGather) have to be extracted for that user, so the
+  // ExternalUses list is populated with such uses before the canceled
+  // entries are unmapped below.
+  for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if (Entry->State != TreeEntry::Vectorize &&
+        Entry->State != TreeEntry::ScatterVectorize)
+      continue;
+    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+      Value *Scalar = Entry->Scalars[Lane];
+      for (User *U : Scalar->users()) {
+        LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
+        // Only users that belong to a canceled tree entry matter here.
+        TreeEntry *UserTE = getTreeEntry(U);
+        if (!UserTE || ProposedToGather.count(UserTE) == 0)
+          continue;
+        auto *UserInst = dyn_cast<Instruction>(U);
+        if (!UserInst)
+          continue;
+
+        // Ignore users in the user ignore list.
+        if (is_contained(UserIgnoreList, UserInst))
+          continue;
+        LLVM_DEBUG(dbgs() << "SLP: Need extract to canceled operation :" << *U
+                          << " from lane " << Lane << " from " << *Scalar
+                          << ".\n");
+        ExternalUses.emplace_back(Scalar, U, Lane);
+      }
+    }
+  }
+  // Canceling unprofitable elements: their scalars are removed from
+  // ScalarToTreeEntry.
+  for (TreeEntry *Entry : ProposedToGather) {
+    for (Value *V : Entry->Scalars) {
+      ScalarToTreeEntry.erase(V);
+      // No NDEBUG guard is needed: LLVM_DEBUG is a no-op in release builds.
+      LLVM_DEBUG(dbgs() << "SLP: Remove scalar " << *V
+                        << " out of proposed to vectorize.\n");
+    }
+  }
+}
+
 unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
   unsigned N = 1;
   Type *EltTy = T;
@@ -3637,7 +3788,7 @@
     SmallVector<const TreeEntry *> Entries;
     Optional<TargetTransformInfo::ShuffleKind> Shuffle =
         isGatherShuffledEntry(E, Mask, Entries);
-    if (Shuffle.hasValue()) {
+    if (Shuffle.hasValue() && ProposedToGather.count(E) == 0) {
       if (ShuffleVectorInst::isIdentityMask(Mask)) {
         LLVM_DEBUG(
             dbgs()
@@ -4141,12 +4292,11 @@
   return true;
 }
 
-InstructionCost BoUpSLP::getSpillCost() const {
+InstructionCost BoUpSLP::getSpillCost() {
   // Walk from the bottom of the tree to the top, tracking which values are
   // live. When we see a call instruction that is not part of our tree,
   // query TTI to see if there is a cost to keeping values live over it
   // (for example, if spills and fills are required).
-  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
   InstructionCost Cost = 0;
 
   SmallPtrSet<Instruction*, 4> LiveValues;
@@ -4211,6 +4361,7 @@
     }
 
     if (NumCalls) {
+      NoCallInst = false;
       SmallVector<Type*, 4> V;
       for (auto *II : LiveValues)
         V.push_back(FixedVectorType::get(II->getType(), BundleWidth));
@@ -4223,15 +4374,113 @@
   return Cost;
 }
 
-InstructionCost BoUpSLP::getTreeCost() {
-  InstructionCost Cost = 0;
+InstructionCost BoUpSLP::getExtractOperationCost(const ExternalUser &EU) const {
+  // An extract feeding an ephemeral value costs nothing: ephemeral values are
+  // removed prior to code generation and the extraction goes away with them.
+  if (EphValues.count(EU.User))
+    return 0;
+
+  Type *ScalarTy = EU.Scalar->getType();
+  auto *WideVecTy = FixedVectorType::get(ScalarTy, BundleWidth);
+  auto MinBWIt = MinBWs.find(VectorizableTree.front()->Scalars[0]);
+  // No minimum bit width recorded for the root: a plain extract suffices.
+  if (MinBWIt == MinBWs.end())
+    return TTI->getVectorInstrCost(Instruction::ExtractElement, WideVecTy,
+                                   EU.Lane);
+
+  // The tree is going to be rewritten in a smaller type, so the extracted
+  // value must be extended back to the original type. Account for the extract
+  // together with the sign or zero extension.
+  const auto &WidthAndSign = MinBWIt->second;
+  auto *NarrowTy = IntegerType::get(F->getContext(), WidthAndSign.first);
+  auto *NarrowVecTy = FixedVectorType::get(NarrowTy, BundleWidth);
+  unsigned ExtOp = Instruction::ZExt;
+  if (WidthAndSign.second)
+    ExtOp = Instruction::SExt;
+  return TTI->getExtractWithExtendCost(ExtOp, ScalarTy, NarrowVecTy, EU.Lane);
+}
+
+InstructionCost BoUpSLP::getExtractCost() const {
+  InstructionCost Total = 0;
+  // Canceled elements are not vectorized anymore. For each of their scalars
+  // that has an entry in InternalTreeUses, i.e. that was recorded as an
+  // in-tree user while the tree was built, charge one extract per recorded
+  // use.
+  for (TreeEntry *Canceled : ProposedToGather) {
+    for (Value *V : Canceled->Scalars) {
+      auto UsesIt = InternalTreeUses.find(V);
+      if (UsesIt == InternalTreeUses.end())
+        continue;
+      for (const ExternalUser &IU : UsesIt->second)
+        Total += getExtractOperationCost(IU);
+    }
+  }
+
+  // Extracts required by the external users. The extract cost is only added
+  // once for the same scalar, no matter how many external users it has.
+  SmallPtrSet<Value *, 16> AlreadyCharged;
+  for (const ExternalUser &EU : ExternalUses) {
+    if (!AlreadyCharged.insert(EU.Scalar).second)
+      continue;
+    Total += getExtractOperationCost(EU);
+  }
+  return Total;
+}
+
+InstructionCost BoUpSLP::getInsertCost() {
+  InstructionCost InsertCost = 0;
+  for (TreeEntry *Entry : ProposedToGather) {
+    // Charge getEntryCost(Entry) once per canceled entry, as soon as one of
+    // its scalars has a user mapped to a tree entry in ProposedToGather.
+    // NOTE(review): the declaration talks about uses by vectorized operations,
+    // yet only users in canceled entries are matched here - please confirm.
+    for (Value *V : Entry->Scalars) {
+      auto *Inst = cast<Instruction>(V);
+      if (llvm::any_of(Inst->users(), [this](User *Op) {
+            if (const TreeEntry *UserTE = getTreeEntry(Op)) {
+              return (ProposedToGather.count(UserTE) != 0);
+            }
+            return false;
+          })) {
+        InsertCost += getEntryCost(Entry);
+        break;
+      }
+    }
+  }
+  return InsertCost;
+}
+
+bool BoUpSLP::findSubTree(std::vector<TreeEntry *> &Vec, unsigned &VectNodes,
+                          InstructionCost TreeCost, InstructionCost UserCost) {
+  for (const std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) {
+    TreeEntry *TE = TEPtr.get();
+    // Entries that are gathered anyway are neither counted nor candidates.
+    if (TE->State == TreeEntry::NeedToGather)
+      continue;
+    ++VectNodes;
+    // The root entry and entries without a positive cost are counted, but
+    // they are not added to \p Vec.
+    if (TE->Idx != 0 && TE->Cost > 0)
+      Vec.push_back(TE);
+  }
+  InstructionCost CandidatesCost = 0;
+  for (TreeEntry *TE : Vec)
+    CandidatesCost += TE->Cost;
+  // Avoid reducing the tree if there is no potential room to reduce.
+  if ((TreeCost - UserCost - CandidatesCost) >= -SLPCostThreshold)
+    return false;
+
+  return !Vec.empty();
+}
+
+InstructionCost BoUpSLP::getRawTreeCost() {
+  InstructionCost CostSum = 0;
+  BundleWidth = VectorizableTree.front()->Scalars.size();
   LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                     << VectorizableTree.size() << ".\n");
 
-  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
-
-  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
-    TreeEntry &TE = *VectorizableTree[I].get();
+  for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) {
+    TreeEntry &TE = *TEPtr.get();
 
     // We create duplicate tree entries for gather sequences that have multiple
     // uses. However, we should not compute the cost of duplicate sequences.
@@ -4246,70 +4495,110 @@
     // existing heuristics based on tree size may yield different results.
     //
     if (TE.State == TreeEntry::NeedToGather &&
-        std::any_of(std::next(VectorizableTree.begin(), I + 1),
-                    VectorizableTree.end(),
-                    [TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
-                      return EntryPtr->State == TreeEntry::NeedToGather &&
-                             EntryPtr->isSame(TE.Scalars);
-                    }))
+        llvm::any_of(llvm::drop_begin(VectorizableTree, TE.Idx + 1),
+                     [TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
+                       return EntryPtr->State == TreeEntry::NeedToGather &&
+                              EntryPtr->isSame(TE.Scalars);
+                     }))
       continue;
 
-    InstructionCost C = getEntryCost(&TE);
-    Cost += C;
-    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+    TE.Cost = getEntryCost(&TE);
+    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << TE.Cost
                       << " for bundle that starts with " << *TE.Scalars[0]
-                      << ".\n"
-                      << "SLP: Current total cost = " << Cost << "\n");
+                      << ".\n");
+    CostSum += TE.Cost;
+    LLVM_DEBUG(dbgs() << "SLP: Current total cost = " << CostSum << "\n");
   }
 
-  SmallPtrSet<Value *, 16> ExtractCostCalculated;
-  InstructionCost ExtractCost = 0;
-  for (ExternalUser &EU : ExternalUses) {
-    // We only add extract cost once for the same scalar.
-    if (!ExtractCostCalculated.insert(EU.Scalar).second)
-      continue;
-
-    // Uses by ephemeral values are free (because the ephemeral value will be
-    // removed prior to code generation, and so the extraction will be
-    // removed as well).
-    if (EphValues.count(EU.User))
+  for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) {
+    TreeEntry *TE = TEPtr.get();
+    if (TE->State != TreeEntry::Vectorize &&
+        TE->State != TreeEntry::ScatterVectorize)
       continue;
+    InstructionCost GatherCost = 0;
+    for (TreeEntry *Gather : TE->UseEntries)
+      if (Gather->State != TreeEntry::Vectorize &&
+          Gather->State != TreeEntry::ScatterVectorize)
+        GatherCost += Gather->Cost;
+    TE->Cost += GatherCost;
+  }
+  return CostSum;
+}
 
-    // If we plan to rewrite the tree in a smaller type, we will need to sign
-    // extend the extracted value back to the original type. Here, we account
-    // for the extract and the added cost of the sign extend if needed.
-    auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
-    auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
-    if (MinBWs.count(ScalarRoot)) {
-      auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
-      auto Extend =
-          MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
-      VecTy = FixedVectorType::get(MinTy, BundleWidth);
-      ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
-                                                   VecTy, EU.Lane);
-    } else {
-      ExtractCost +=
-          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
-    }
+InstructionCost BoUpSLP::getTreeCost(bool TreeReduce,
+                                     InstructionCost UserCost) {
+  InstructionCost CostSum;
+  if (!IsCostSumReady) {
+    CostSum = getRawTreeCost();
+    RawTreeCost = CostSum;
+  } else {
+    CostSum = RawTreeCost;
   }
 
-  InstructionCost SpillCost = getSpillCost();
-  Cost += SpillCost + ExtractCost;
+  InstructionCost ExtractCost = getExtractCost();
+  InstructionCost SpillCost = 0;
+  if (!NoCallInst || !IsCostSumReady)
+    SpillCost = getSpillCost();
+  assert((!NoCallInst || getSpillCost() == 0) && "Incorrect spill cost");
+  IsCostSumReady = true;
+  InstructionCost InsertCost = getInsertCost();
+  InstructionCost Cost =
+      CostSum + ExtractCost + SpillCost + InsertCost - UserCost;
+  InstructionCost FullCost = Cost;
 
 #ifndef NDEBUG
   SmallString<256> Str;
-  {
-    raw_svector_ostream OS(Str);
-    OS << "SLP: Spill Cost = " << SpillCost << ".\n"
-       << "SLP: Extract Cost = " << ExtractCost << ".\n"
-       << "SLP: Total Cost = " << Cost << ".\n";
-  }
+  raw_svector_ostream OS(Str);
+  OS << "SLP: Spill Cost = " << SpillCost << ".\n"
+     << "SLP: Extract Cost = " << ExtractCost << ".\n"
+     << "SLP: Insert Cost = " << InsertCost << ".\n"
+     << "SLP: Total Cost = " << Cost << ".\n";
   LLVM_DEBUG(dbgs() << Str);
   if (ViewSLPTree)
     ViewGraph(this, "SLP" + F->getName(), false, Str);
 #endif
+  // Cost is already net of UserCost; findSubTree() subtracts it on its own.
+  if (TreeReduce && Cost >= -SLPCostThreshold) {
+    std::vector<TreeEntry *> Vec;
+    unsigned VectNodes = 0;
+    if (!findSubTree(Vec, VectNodes, Cost + UserCost, UserCost))
+      return Cost;
+    TEVecQueue Queue(Vec.begin(), Vec.end());
+    unsigned NodeCounter = 0;
 
-  return Cost;
+    while (!Queue.empty()) {
+      TreeEntry *T = Queue.top();
+      Queue.pop();
+      NodeCounter++;
+
+      if (!NoCallInst && NodeCounter > SLPThrottleBudget)
+        break;
+
+      ProposedToGather.insert(T);
+      T->State = TreeEntry::NeedToGather;
+      for (Value *V : T->Scalars) {
+        MustGather.insert(V);
+        ExternalUses.erase(
+            llvm::remove_if(ExternalUses,
+                            [V](ExternalUser &EU) { return EU.Scalar == V; }),
+            ExternalUses.end());
+      }
+      CostSum -= T->Cost;
+      ExtractCost = getExtractCost();
+      if (!NoCallInst)
+        SpillCost = getSpillCost();
+      assert((!NoCallInst || getSpillCost() == 0) && "Incorrect spill cost");
+      InsertCost = getInsertCost();
+      Cost = CostSum + ExtractCost + SpillCost + InsertCost - UserCost;
+      if (Cost < -SLPCostThreshold && !isTreeTinyAndNotFullyVectorizable() &&
+          (VectNodes - ProposedToGather.size()) >= MinVecNodes) {
+        cutTree();
+        return Cost;
+      }
+    }
+    ProposedToGather.clear();
+  }
+  return FullCost;
 }
 
 Optional<TargetTransformInfo::ShuffleKind>
@@ -5149,12 +5438,25 @@
 BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
   // All blocks must be scheduled before any instructions are inserted.
   for (auto &BSIter : BlocksSchedules) {
-    scheduleBlock(BSIter.second.get());
+    BlockScheduling *BS = BSIter.second.get();
+    // Remove all Schedule Data from all nodes that we have changed
+    // vectorization decision.
+    if (!ProposedToGather.empty())
+      removeFromScheduling(BS);
+    scheduleBlock(BS);
   }
 
   Builder.SetInsertPoint(&F->getEntryBlock().front());
   auto *VectorRoot = vectorizeTree(VectorizableTree[0].get());
 
+  for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if ((Entry->State == TreeEntry::Vectorize ||
+         Entry->State == TreeEntry::ScatterVectorize) &&
+        !Entry->VectorizedValue)
+      vectorizeTree(Entry);
+  }
+
   // If the vectorized tree can be rewritten in a smaller type, we truncate the
   // vectorized root. InstCombine will then rewrite the entire expression. We
   // sign extend the extracted values below.
@@ -5284,7 +5586,9 @@
 
 #ifndef NDEBUG
       Type *Ty = Scalar->getType();
-      if (!Ty->isVoidTy()) {
+      // The tree might not be fully vectorized, so we don't have to
+      // check every user.
+      if (!Ty->isVoidTy() && ProposedToGather.empty()) {
         for (User *U : Scalar->users()) {
           LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
 
@@ -5509,6 +5813,7 @@
     BundleMember->FirstInBundle = BundleMember;
     ScheduleData *Next = BundleMember->NextInBundle;
     BundleMember->NextInBundle = nullptr;
+    BundleMember->TE = nullptr;
     BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
     if (BundleMember->UnscheduledDepsInBundle == 0) {
       ReadyInsts.insert(BundleMember);
@@ -5777,6 +6082,85 @@
   ReadyInsts.clear();
 }
 
+void BoUpSLP::BlockScheduling::reduceSchedulingRegion(Instruction *Start,
+                                                      Instruction *End) {
+  // Each bound is optional: a null pointer leaves the corresponding end of
+  // the scheduling region as it is.
+  ScheduleStart = Start ? Start : ScheduleStart;
+  ScheduleEnd = End ? End : ScheduleEnd;
+}
+
+void BoUpSLP::removeFromScheduling(BlockScheduling *BS) {
+  bool Removed = false;
+  SmallPtrSet<Instruction *, 12> Gathers;
+  SmallPtrSet<Instruction *, 12> Reduced;
+  Instruction *Start = nullptr;
+
+  // We can reduce the number of instructions to be considered for scheduling
+  // after cutting the tree. Here we shrink the scheduling region from the top,
+  // instruction by instruction, until we encounter a required one: an
+  // instruction that has a tree entry or that belongs to a NeedToGather node
+  // with a user node that is not NeedToGather. Instructions skipped this way
+  // are collected in Reduced.
+  for (std::unique_ptr<TreeEntry> &TEPtr : reverse(VectorizableTree)) {
+    TreeEntry *TE = TEPtr.get();
+    if (TE->State != TreeEntry::NeedToGather || !TE->getOpcode() ||
+        TE->getMainOp()->getParent() != BS->BB)
+      continue;
+    for (const EdgeInfo &EI : TE->UserTreeIndices) {
+      if (EI.UserTE && (EI.UserTE->State != TreeEntry::NeedToGather)) {
+        auto InstructionsOnly =
+            make_filter_range(TE->Scalars, Instruction::classof);
+        for (Value *V : InstructionsOnly)
+          Gathers.insert(cast<Instruction>(V));
+        break;
+      }
+    }
+  }
+
+  for (Instruction *I = BS->ScheduleStart; I != BS->ScheduleEnd;
+       I = I->getNextNode()) {
+    if (!getTreeEntry(I) && !Gathers.count(I)) {
+      Reduced.insert(I);
+    } else {
+      Start = I;
+      break;
+    }
+  }
+
+  BS->reduceSchedulingRegion(Start, nullptr);
+
+  for (TreeEntry *Entry : ProposedToGather) {
+    ScheduleData *SD = BS->getScheduleData(Entry->Scalars[0]);
+    if (SD && SD->isPartOfBundle()) {
+      if (!Removed) {
+        Removed = true;
+        BS->resetSchedule();
+      }
+      SD->IsScheduled = false;
+      BS->cancelScheduling(Entry->Scalars, SD->OpValue);
+    }
+  }
+  if (!Removed)
+    return;
+
+  if (Reduced.size()) {
+    for (Instruction *I : Reduced) {
+      ScheduleData *SD = BS->getScheduleData(I);
+      if (SD)
+        SD->SchedulingRegionID = -1;
+    }
+  }
+  BS->resetSchedule();
+  BS->initialFillReadyList(BS->ReadyInsts);
+  for (Instruction *I = BS->ScheduleStart; I != BS->ScheduleEnd;
+       I = I->getNextNode()) {
+    if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end())
+      continue;
+    BS->doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
+  }
+}
+
 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
   if (!BS->ScheduleStart)
     return;
@@ -6306,7 +6690,7 @@
 
   R.computeMinimumValueSizes();
 
-  InstructionCost Cost = R.getTreeCost();
+  InstructionCost Cost = R.getTreeCost(true);
 
   LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF =" << VF << "\n");
   if (Cost < -SLPCostThreshold) {
@@ -6512,6 +6896,7 @@
   // Check that all of the parts are instructions of the same type,
   // we permit an alternate opcode via InstructionsState.
   InstructionsState S = getSameOpcode(VL);
+
   if (!S.getOpcode())
     return false;
 
@@ -6606,7 +6991,7 @@
         continue;
 
       R.computeMinimumValueSizes();
-      InstructionCost Cost = R.getTreeCost();
+      InstructionCost UserCost = 0;
       CandidateFound = true;
       if (CompensateUseCost) {
         // TODO: Use TTI's getScalarizationOverhead for sequence of inserts
@@ -6636,7 +7021,6 @@
         // Switching to the TTI interface might help a bit.
         // Alternative solution could be pattern-match to detect a no-op or
         // shuffle.
-        InstructionCost UserCost = 0;
         for (unsigned Lane = 0; Lane < OpsWidth; Lane++) {
           auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]);
           if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
@@ -6645,8 +7029,8 @@
         }
         LLVM_DEBUG(dbgs() << "SLP: Compensate cost of users by: " << UserCost
                           << ".\n");
-        Cost -= UserCost;
       }
+      InstructionCost Cost = R.getTreeCost(true, UserCost);
 
       MinCost = std::min(MinCost, Cost);
 
Index: llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll
+++ llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll
@@ -233,14 +233,16 @@
 ; GFX9-NEXT:    ret void
 ;
 ; VI-LABEL: @canonicalize_v2f16(
-; VI-NEXT:    [[I0:%.*]] = load half, half addrspace(3)* [[A:%.*]], align 2
-; VI-NEXT:    [[CANONICALIZE0:%.*]] = call half @llvm.canonicalize.f16(half [[I0]])
-; VI-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds half, half addrspace(3)* [[A]], i64 1
-; VI-NEXT:    [[I3:%.*]] = load half, half addrspace(3)* [[ARRAYIDX3]], align 2
-; VI-NEXT:    [[CANONICALIZE1:%.*]] = call half @llvm.canonicalize.f16(half [[I3]])
-; VI-NEXT:    store half [[CANONICALIZE0]], half addrspace(3)* [[C:%.*]], align 2
-; VI-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds half, half addrspace(3)* [[C]], i64 1
-; VI-NEXT:    store half [[CANONICALIZE1]], half addrspace(3)* [[ARRAYIDX5]], align 2
+; VI-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)*
+; VI-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2
+; VI-NEXT:    [[TMP3:%.*]] = extractelement <2 x half> [[TMP2]], i32 0
+; VI-NEXT:    [[CANONICALIZE0:%.*]] = call half @llvm.canonicalize.f16(half [[TMP3]])
+; VI-NEXT:    [[TMP4:%.*]] = extractelement <2 x half> [[TMP2]], i32 1
+; VI-NEXT:    [[CANONICALIZE1:%.*]] = call half @llvm.canonicalize.f16(half [[TMP4]])
+; VI-NEXT:    [[TMP5:%.*]] = insertelement <2 x half> poison, half [[CANONICALIZE0]], i32 0
+; VI-NEXT:    [[TMP6:%.*]] = insertelement <2 x half> [[TMP5]], half [[CANONICALIZE1]], i32 1
+; VI-NEXT:    [[TMP7:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)*
+; VI-NEXT:    store <2 x half> [[TMP6]], <2 x half> addrspace(3)* [[TMP7]], align 2
 ; VI-NEXT:    ret void
 ;
   %i0 = load half, half addrspace(3)* %a, align 2
Index: llvm/test/Transforms/SLPVectorizer/X86/PR31847.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/PR31847.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/PR31847.ll
@@ -24,53 +24,53 @@
 ; CHECK-NEXT:    [[Y_045:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_1:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP4]] to i32
-; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[CONV]], -128
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1
 ; CHECK-NEXT:    [[CONV3:%.*]] = zext i8 [[TMP5]] to i32
-; CHECK-NEXT:    [[SUB4:%.*]] = add nsw i32 [[CONV3]], -128
-; CHECK-NEXT:    [[CMP5:%.*]] = icmp sgt i32 [[SUB]], -1
-; CHECK-NEXT:    [[SUB7:%.*]] = sub nsw i32 128, [[CONV]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP5]], i32 [[SUB]], i32 [[SUB7]]
-; CHECK-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[SUB4]], -1
-; CHECK-NEXT:    [[SUB12:%.*]] = sub nsw i32 128, [[CONV3]]
-; CHECK-NEXT:    [[COND14:%.*]] = select i1 [[CMP8]], i32 [[SUB4]], i32 [[SUB12]]
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[COND14]], [[COND]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[CONV3]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[CONV]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <2 x i32> [[TMP7]], <i32 -128, i32 -128>
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp sgt <2 x i32> [[TMP8]], <i32 -1, i32 -1>
+; CHECK-NEXT:    [[TMP10:%.*]] = sub nsw <2 x i32> <i32 128, i32 128>, [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[TMP9]], <2 x i32> [[TMP8]], <2 x i32> [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP13]]
 ; CHECK-NEXT:    [[IDX_NEG:%.*]] = sub nsw i32 0, [[ADD]]
 ; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[D1_DATA_046]], i32 [[IDX_NEG]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load i8, i8* [[ADD_PTR]], align 1
-; CHECK-NEXT:    [[CONV15:%.*]] = zext i8 [[TMP6]] to i32
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, i8* [[ADD_PTR]], align 1
+; CHECK-NEXT:    [[CONV15:%.*]] = zext i8 [[TMP14]] to i32
 ; CHECK-NEXT:    [[ADD16:%.*]] = add nsw i32 [[CONV15]], [[INTENSITY:%.*]]
 ; CHECK-NEXT:    [[CONV17:%.*]] = trunc i32 [[ADD16]] to i8
 ; CHECK-NEXT:    store i8 [[CONV17]], i8* [[ADD_PTR]], align 1
 ; CHECK-NEXT:    [[ADD_PTR18:%.*]] = getelementptr inbounds i8, i8* [[D1_DATA_046]], i32 [[ADD]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load i8, i8* [[ADD_PTR18]], align 1
-; CHECK-NEXT:    [[NOT_TOBOOL:%.*]] = icmp eq i8 [[TMP7]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, i8* [[ADD_PTR18]], align 1
+; CHECK-NEXT:    [[NOT_TOBOOL:%.*]] = icmp eq i8 [[TMP15]], 0
 ; CHECK-NEXT:    [[CONV21:%.*]] = zext i1 [[NOT_TOBOOL]] to i8
 ; CHECK-NEXT:    store i8 [[CONV21]], i8* [[ADD_PTR18]], align 1
 ; CHECK-NEXT:    [[ADD_PTR23:%.*]] = getelementptr inbounds i8, i8* [[D1_DATA_046]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CONV_1:%.*]] = zext i8 [[TMP8]] to i32
-; CHECK-NEXT:    [[SUB_1:%.*]] = add nsw i32 [[CONV_1]], -128
-; CHECK-NEXT:    [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1
-; CHECK-NEXT:    [[CONV3_1:%.*]] = zext i8 [[TMP9]] to i32
-; CHECK-NEXT:    [[SUB4_1:%.*]] = add nsw i32 [[CONV3_1]], -128
-; CHECK-NEXT:    [[CMP5_1:%.*]] = icmp sgt i32 [[SUB_1]], -1
-; CHECK-NEXT:    [[SUB7_1:%.*]] = sub nsw i32 128, [[CONV_1]]
-; CHECK-NEXT:    [[COND_1:%.*]] = select i1 [[CMP5_1]], i32 [[SUB_1]], i32 [[SUB7_1]]
-; CHECK-NEXT:    [[CMP8_1:%.*]] = icmp sgt i32 [[SUB4_1]], -1
-; CHECK-NEXT:    [[SUB12_1:%.*]] = sub nsw i32 128, [[CONV3_1]]
-; CHECK-NEXT:    [[COND14_1:%.*]] = select i1 [[CMP8_1]], i32 [[SUB4_1]], i32 [[SUB12_1]]
-; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[COND14_1]], [[COND_1]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV_1:%.*]] = zext i8 [[TMP16]] to i32
+; CHECK-NEXT:    [[TMP17:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[CONV3_1:%.*]] = zext i8 [[TMP17]] to i32
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <2 x i32> poison, i32 [[CONV3_1]], i32 0
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i32> [[TMP18]], i32 [[CONV_1]], i32 1
+; CHECK-NEXT:    [[TMP20:%.*]] = add nsw <2 x i32> [[TMP19]], <i32 -128, i32 -128>
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp sgt <2 x i32> [[TMP20]], <i32 -1, i32 -1>
+; CHECK-NEXT:    [[TMP22:%.*]] = sub nsw <2 x i32> <i32 128, i32 128>, [[TMP19]]
+; CHECK-NEXT:    [[TMP23:%.*]] = select <2 x i1> [[TMP21]], <2 x i32> [[TMP20]], <2 x i32> [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <2 x i32> [[TMP23]], i32 0
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <2 x i32> [[TMP23]], i32 1
+; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
 ; CHECK-NEXT:    [[IDX_NEG_1:%.*]] = sub nsw i32 0, [[ADD_1]]
 ; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR23]], i32 [[IDX_NEG_1]]
-; CHECK-NEXT:    [[TMP10:%.*]] = load i8, i8* [[ADD_PTR_1]], align 1
-; CHECK-NEXT:    [[CONV15_1:%.*]] = zext i8 [[TMP10]] to i32
+; CHECK-NEXT:    [[TMP26:%.*]] = load i8, i8* [[ADD_PTR_1]], align 1
+; CHECK-NEXT:    [[CONV15_1:%.*]] = zext i8 [[TMP26]] to i32
 ; CHECK-NEXT:    [[ADD16_1:%.*]] = add nsw i32 [[CONV15_1]], [[INTENSITY]]
 ; CHECK-NEXT:    [[CONV17_1:%.*]] = trunc i32 [[ADD16_1]] to i8
 ; CHECK-NEXT:    store i8 [[CONV17_1]], i8* [[ADD_PTR_1]], align 1
 ; CHECK-NEXT:    [[ADD_PTR18_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR23]], i32 [[ADD_1]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load i8, i8* [[ADD_PTR18_1]], align 1
-; CHECK-NEXT:    [[NOT_TOBOOL_1:%.*]] = icmp eq i8 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP27:%.*]] = load i8, i8* [[ADD_PTR18_1]], align 1
+; CHECK-NEXT:    [[NOT_TOBOOL_1:%.*]] = icmp eq i8 [[TMP27]], 0
 ; CHECK-NEXT:    [[CONV21_1:%.*]] = zext i1 [[NOT_TOBOOL_1]] to i8
 ; CHECK-NEXT:    store i8 [[CONV21_1]], i8* [[ADD_PTR18_1]], align 1
 ; CHECK-NEXT:    [[ADD_PTR23_1]] = getelementptr inbounds i8, i8* [[ADD_PTR23]], i32 [[TMP1]]
Index: llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll
@@ -4,7 +4,7 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SKX
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX256BW
 
 @a64 = common global [8 x i64] zeroinitializer, align 64
@@ -101,6 +101,13 @@
 ; AVX512-NEXT:    store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
 ; AVX512-NEXT:    ret void
 ;
+; SKX-LABEL: @smul_v8i64(
+; SKX-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; SKX-NEXT:    [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; SKX-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]], i32 3)
+; SKX-NEXT:    store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; SKX-NEXT:    ret void
+;
 ; AVX256BW-LABEL: @smul_v8i64(
 ; AVX256BW-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
 ; AVX256BW-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
@@ -283,70 +290,82 @@
 ; SLM-NEXT:    ret void
 ;
 ; AVX1-LABEL: @smul_v16i32(
-; AVX1-NEXT:    [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4
-; AVX1-NEXT:    [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4
-; AVX1-NEXT:    [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4
-; AVX1-NEXT:    [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4
-; AVX1-NEXT:    [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4
-; AVX1-NEXT:    [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4
-; AVX1-NEXT:    [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4
-; AVX1-NEXT:    [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4
-; AVX1-NEXT:    [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4
-; AVX1-NEXT:    [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4
-; AVX1-NEXT:    [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
-; AVX1-NEXT:    [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
-; AVX1-NEXT:    [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
-; AVX1-NEXT:    [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
-; AVX1-NEXT:    [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
-; AVX1-NEXT:    [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
-; AVX1-NEXT:    [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4
-; AVX1-NEXT:    [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4
-; AVX1-NEXT:    [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4
-; AVX1-NEXT:    [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4
-; AVX1-NEXT:    [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4
-; AVX1-NEXT:    [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4
-; AVX1-NEXT:    [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4
-; AVX1-NEXT:    [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4
-; AVX1-NEXT:    [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4
-; AVX1-NEXT:    [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4
-; AVX1-NEXT:    [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
-; AVX1-NEXT:    [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
-; AVX1-NEXT:    [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
-; AVX1-NEXT:    [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
-; AVX1-NEXT:    [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
-; AVX1-NEXT:    [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
-; AVX1-NEXT:    [[R0:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A0]], i32 [[B0]], i32 3)
-; AVX1-NEXT:    [[R1:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A1]], i32 [[B1]], i32 3)
-; AVX1-NEXT:    [[R2:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A2]], i32 [[B2]], i32 3)
-; AVX1-NEXT:    [[R3:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A3]], i32 [[B3]], i32 3)
-; AVX1-NEXT:    [[R4:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A4]], i32 [[B4]], i32 3)
-; AVX1-NEXT:    [[R5:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A5]], i32 [[B5]], i32 3)
-; AVX1-NEXT:    [[R6:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A6]], i32 [[B6]], i32 3)
-; AVX1-NEXT:    [[R7:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A7]], i32 [[B7]], i32 3)
-; AVX1-NEXT:    [[R8:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A8]], i32 [[B8]], i32 3)
-; AVX1-NEXT:    [[R9:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A9]], i32 [[B9]], i32 3)
-; AVX1-NEXT:    [[R10:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A10]], i32 [[B10]], i32 3)
-; AVX1-NEXT:    [[R11:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A11]], i32 [[B11]], i32 3)
-; AVX1-NEXT:    [[R12:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A12]], i32 [[B12]], i32 3)
-; AVX1-NEXT:    [[R13:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A13]], i32 [[B13]], i32 3)
-; AVX1-NEXT:    [[R14:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A14]], i32 [[B14]], i32 3)
-; AVX1-NEXT:    [[R15:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A15]], i32 [[B15]], i32 3)
-; AVX1-NEXT:    store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4
-; AVX1-NEXT:    store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4
-; AVX1-NEXT:    store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4
-; AVX1-NEXT:    store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4
-; AVX1-NEXT:    store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4
-; AVX1-NEXT:    store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4
-; AVX1-NEXT:    store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4
-; AVX1-NEXT:    store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4
-; AVX1-NEXT:    store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4
-; AVX1-NEXT:    store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4
-; AVX1-NEXT:    store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
-; AVX1-NEXT:    store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
-; AVX1-NEXT:    store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
-; AVX1-NEXT:    store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
-; AVX1-NEXT:    store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
-; AVX1-NEXT:    store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+; AVX1-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; AVX1-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP5]], i32 0
+; AVX1-NEXT:    [[R0:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP9]], i32 [[TMP10]], i32 3)
+; AVX1-NEXT:    [[TMP11:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; AVX1-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1
+; AVX1-NEXT:    [[R1:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP11]], i32 [[TMP12]], i32 3)
+; AVX1-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; AVX1-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2
+; AVX1-NEXT:    [[R2:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP13]], i32 [[TMP14]], i32 3)
+; AVX1-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; AVX1-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3
+; AVX1-NEXT:    [[R3:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP15]], i32 [[TMP16]], i32 3)
+; AVX1-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; AVX1-NEXT:    [[TMP18:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
+; AVX1-NEXT:    [[R4:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP17]], i32 [[TMP18]], i32 3)
+; AVX1-NEXT:    [[TMP19:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
+; AVX1-NEXT:    [[TMP20:%.*]] = extractelement <4 x i32> [[TMP6]], i32 1
+; AVX1-NEXT:    [[R5:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP19]], i32 [[TMP20]], i32 3)
+; AVX1-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
+; AVX1-NEXT:    [[TMP22:%.*]] = extractelement <4 x i32> [[TMP6]], i32 2
+; AVX1-NEXT:    [[R6:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP21]], i32 [[TMP22]], i32 3)
+; AVX1-NEXT:    [[TMP23:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; AVX1-NEXT:    [[TMP24:%.*]] = extractelement <4 x i32> [[TMP6]], i32 3
+; AVX1-NEXT:    [[R7:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP23]], i32 [[TMP24]], i32 3)
+; AVX1-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; AVX1-NEXT:    [[TMP26:%.*]] = extractelement <4 x i32> [[TMP7]], i32 0
+; AVX1-NEXT:    [[R8:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP25]], i32 [[TMP26]], i32 3)
+; AVX1-NEXT:    [[TMP27:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; AVX1-NEXT:    [[TMP28:%.*]] = extractelement <4 x i32> [[TMP7]], i32 1
+; AVX1-NEXT:    [[R9:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP27]], i32 [[TMP28]], i32 3)
+; AVX1-NEXT:    [[TMP29:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; AVX1-NEXT:    [[TMP30:%.*]] = extractelement <4 x i32> [[TMP7]], i32 2
+; AVX1-NEXT:    [[R10:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP29]], i32 [[TMP30]], i32 3)
+; AVX1-NEXT:    [[TMP31:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; AVX1-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[TMP7]], i32 3
+; AVX1-NEXT:    [[R11:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP31]], i32 [[TMP32]], i32 3)
+; AVX1-NEXT:    [[TMP33:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; AVX1-NEXT:    [[TMP34:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0
+; AVX1-NEXT:    [[R12:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP33]], i32 [[TMP34]], i32 3)
+; AVX1-NEXT:    [[TMP35:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
+; AVX1-NEXT:    [[TMP36:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1
+; AVX1-NEXT:    [[R13:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP35]], i32 [[TMP36]], i32 3)
+; AVX1-NEXT:    [[TMP37:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; AVX1-NEXT:    [[TMP38:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2
+; AVX1-NEXT:    [[R14:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP37]], i32 [[TMP38]], i32 3)
+; AVX1-NEXT:    [[TMP39:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; AVX1-NEXT:    [[TMP40:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3
+; AVX1-NEXT:    [[R15:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP39]], i32 [[TMP40]], i32 3)
+; AVX1-NEXT:    [[TMP41:%.*]] = insertelement <4 x i32> poison, i32 [[R0]], i32 0
+; AVX1-NEXT:    [[TMP42:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[R1]], i32 1
+; AVX1-NEXT:    [[TMP43:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[R2]], i32 2
+; AVX1-NEXT:    [[TMP44:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[R3]], i32 3
+; AVX1-NEXT:    store <4 x i32> [[TMP44]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP45:%.*]] = insertelement <4 x i32> poison, i32 [[R4]], i32 0
+; AVX1-NEXT:    [[TMP46:%.*]] = insertelement <4 x i32> [[TMP45]], i32 [[R5]], i32 1
+; AVX1-NEXT:    [[TMP47:%.*]] = insertelement <4 x i32> [[TMP46]], i32 [[R6]], i32 2
+; AVX1-NEXT:    [[TMP48:%.*]] = insertelement <4 x i32> [[TMP47]], i32 [[R7]], i32 3
+; AVX1-NEXT:    store <4 x i32> [[TMP48]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP49:%.*]] = insertelement <4 x i32> poison, i32 [[R8]], i32 0
+; AVX1-NEXT:    [[TMP50:%.*]] = insertelement <4 x i32> [[TMP49]], i32 [[R9]], i32 1
+; AVX1-NEXT:    [[TMP51:%.*]] = insertelement <4 x i32> [[TMP50]], i32 [[R10]], i32 2
+; AVX1-NEXT:    [[TMP52:%.*]] = insertelement <4 x i32> [[TMP51]], i32 [[R11]], i32 3
+; AVX1-NEXT:    store <4 x i32> [[TMP52]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP53:%.*]] = insertelement <4 x i32> poison, i32 [[R12]], i32 0
+; AVX1-NEXT:    [[TMP54:%.*]] = insertelement <4 x i32> [[TMP53]], i32 [[R13]], i32 1
+; AVX1-NEXT:    [[TMP55:%.*]] = insertelement <4 x i32> [[TMP54]], i32 [[R14]], i32 2
+; AVX1-NEXT:    [[TMP56:%.*]] = insertelement <4 x i32> [[TMP55]], i32 [[R15]], i32 3
+; AVX1-NEXT:    store <4 x i32> [[TMP56]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; AVX1-NEXT:    ret void
 ;
 ; AVX2-LABEL: @smul_v16i32(
@@ -367,6 +386,13 @@
 ; AVX512-NEXT:    store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
 ; AVX512-NEXT:    ret void
 ;
+; SKX-LABEL: @smul_v16i32(
+; SKX-NEXT:    [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; SKX-NEXT:    [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; SKX-NEXT:    [[TMP3:%.*]] = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]], i32 3)
+; SKX-NEXT:    store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; SKX-NEXT:    ret void
+;
 ; AVX256BW-LABEL: @smul_v16i32(
 ; AVX256BW-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
 ; AVX256BW-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
@@ -501,6 +527,13 @@
 ; AVX512-NEXT:    [[TMP3:%.*]] = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]], i32 3)
 ; AVX512-NEXT:    store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
 ; AVX512-NEXT:    ret void
+;
+; SKX-LABEL: @smul_v32i16(
+; SKX-NEXT:    [[TMP1:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
+; SKX-NEXT:    [[TMP2:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
+; SKX-NEXT:    [[TMP3:%.*]] = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]], i32 3)
+; SKX-NEXT:    store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
+; SKX-NEXT:    ret void
 ;
   %a0  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
   %a1  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
@@ -689,6 +722,13 @@
 ; AVX512-NEXT:    [[TMP3:%.*]] = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]], i32 3)
 ; AVX512-NEXT:    store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
 ; AVX512-NEXT:    ret void
+;
+; SKX-LABEL: @smul_v64i8(
+; SKX-NEXT:    [[TMP1:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @a8 to <64 x i8>*), align 1
+; SKX-NEXT:    [[TMP2:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @b8 to <64 x i8>*), align 1
+; SKX-NEXT:    [[TMP3:%.*]] = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]], i32 3)
+; SKX-NEXT:    store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
+; SKX-NEXT:    ret void
 ;
   %a0  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
   %a1  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
@@ -1030,6 +1070,13 @@
 ; AVX512-NEXT:    store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
 ; AVX512-NEXT:    ret void
 ;
+; SKX-LABEL: @umul_v8i64(
+; SKX-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; SKX-NEXT:    [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; SKX-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]], i32 3)
+; SKX-NEXT:    store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; SKX-NEXT:    ret void
+;
 ; AVX256BW-LABEL: @umul_v8i64(
 ; AVX256BW-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
 ; AVX256BW-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
@@ -1212,70 +1259,82 @@
 ; SLM-NEXT:    ret void
 ;
 ; AVX1-LABEL: @umul_v16i32(
-; AVX1-NEXT:    [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4
-; AVX1-NEXT:    [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4
-; AVX1-NEXT:    [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4
-; AVX1-NEXT:    [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4
-; AVX1-NEXT:    [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4
-; AVX1-NEXT:    [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4
-; AVX1-NEXT:    [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4
-; AVX1-NEXT:    [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4
-; AVX1-NEXT:    [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4
-; AVX1-NEXT:    [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4
-; AVX1-NEXT:    [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
-; AVX1-NEXT:    [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
-; AVX1-NEXT:    [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
-; AVX1-NEXT:    [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
-; AVX1-NEXT:    [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
-; AVX1-NEXT:    [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
-; AVX1-NEXT:    [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4
-; AVX1-NEXT:    [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4
-; AVX1-NEXT:    [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4
-; AVX1-NEXT:    [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4
-; AVX1-NEXT:    [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4
-; AVX1-NEXT:    [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4
-; AVX1-NEXT:    [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4
-; AVX1-NEXT:    [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4
-; AVX1-NEXT:    [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4
-; AVX1-NEXT:    [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4
-; AVX1-NEXT:    [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
-; AVX1-NEXT:    [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
-; AVX1-NEXT:    [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
-; AVX1-NEXT:    [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
-; AVX1-NEXT:    [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
-; AVX1-NEXT:    [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
-; AVX1-NEXT:    [[R0:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A0]], i32 [[B0]], i32 3)
-; AVX1-NEXT:    [[R1:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A1]], i32 [[B1]], i32 3)
-; AVX1-NEXT:    [[R2:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A2]], i32 [[B2]], i32 3)
-; AVX1-NEXT:    [[R3:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A3]], i32 [[B3]], i32 3)
-; AVX1-NEXT:    [[R4:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A4]], i32 [[B4]], i32 3)
-; AVX1-NEXT:    [[R5:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A5]], i32 [[B5]], i32 3)
-; AVX1-NEXT:    [[R6:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A6]], i32 [[B6]], i32 3)
-; AVX1-NEXT:    [[R7:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A7]], i32 [[B7]], i32 3)
-; AVX1-NEXT:    [[R8:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A8]], i32 [[B8]], i32 3)
-; AVX1-NEXT:    [[R9:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A9]], i32 [[B9]], i32 3)
-; AVX1-NEXT:    [[R10:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A10]], i32 [[B10]], i32 3)
-; AVX1-NEXT:    [[R11:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A11]], i32 [[B11]], i32 3)
-; AVX1-NEXT:    [[R12:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A12]], i32 [[B12]], i32 3)
-; AVX1-NEXT:    [[R13:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A13]], i32 [[B13]], i32 3)
-; AVX1-NEXT:    [[R14:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A14]], i32 [[B14]], i32 3)
-; AVX1-NEXT:    [[R15:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A15]], i32 [[B15]], i32 3)
-; AVX1-NEXT:    store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4
-; AVX1-NEXT:    store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4
-; AVX1-NEXT:    store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4
-; AVX1-NEXT:    store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4
-; AVX1-NEXT:    store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4
-; AVX1-NEXT:    store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4
-; AVX1-NEXT:    store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4
-; AVX1-NEXT:    store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4
-; AVX1-NEXT:    store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4
-; AVX1-NEXT:    store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4
-; AVX1-NEXT:    store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
-; AVX1-NEXT:    store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
-; AVX1-NEXT:    store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
-; AVX1-NEXT:    store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
-; AVX1-NEXT:    store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
-; AVX1-NEXT:    store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+; AVX1-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; AVX1-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP5]], i32 0
+; AVX1-NEXT:    [[R0:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP9]], i32 [[TMP10]], i32 3)
+; AVX1-NEXT:    [[TMP11:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; AVX1-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1
+; AVX1-NEXT:    [[R1:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP11]], i32 [[TMP12]], i32 3)
+; AVX1-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; AVX1-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2
+; AVX1-NEXT:    [[R2:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP13]], i32 [[TMP14]], i32 3)
+; AVX1-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; AVX1-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3
+; AVX1-NEXT:    [[R3:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP15]], i32 [[TMP16]], i32 3)
+; AVX1-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; AVX1-NEXT:    [[TMP18:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
+; AVX1-NEXT:    [[R4:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP17]], i32 [[TMP18]], i32 3)
+; AVX1-NEXT:    [[TMP19:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
+; AVX1-NEXT:    [[TMP20:%.*]] = extractelement <4 x i32> [[TMP6]], i32 1
+; AVX1-NEXT:    [[R5:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP19]], i32 [[TMP20]], i32 3)
+; AVX1-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
+; AVX1-NEXT:    [[TMP22:%.*]] = extractelement <4 x i32> [[TMP6]], i32 2
+; AVX1-NEXT:    [[R6:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP21]], i32 [[TMP22]], i32 3)
+; AVX1-NEXT:    [[TMP23:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; AVX1-NEXT:    [[TMP24:%.*]] = extractelement <4 x i32> [[TMP6]], i32 3
+; AVX1-NEXT:    [[R7:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP23]], i32 [[TMP24]], i32 3)
+; AVX1-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; AVX1-NEXT:    [[TMP26:%.*]] = extractelement <4 x i32> [[TMP7]], i32 0
+; AVX1-NEXT:    [[R8:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP25]], i32 [[TMP26]], i32 3)
+; AVX1-NEXT:    [[TMP27:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; AVX1-NEXT:    [[TMP28:%.*]] = extractelement <4 x i32> [[TMP7]], i32 1
+; AVX1-NEXT:    [[R9:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP27]], i32 [[TMP28]], i32 3)
+; AVX1-NEXT:    [[TMP29:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; AVX1-NEXT:    [[TMP30:%.*]] = extractelement <4 x i32> [[TMP7]], i32 2
+; AVX1-NEXT:    [[R10:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP29]], i32 [[TMP30]], i32 3)
+; AVX1-NEXT:    [[TMP31:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; AVX1-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[TMP7]], i32 3
+; AVX1-NEXT:    [[R11:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP31]], i32 [[TMP32]], i32 3)
+; AVX1-NEXT:    [[TMP33:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; AVX1-NEXT:    [[TMP34:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0
+; AVX1-NEXT:    [[R12:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP33]], i32 [[TMP34]], i32 3)
+; AVX1-NEXT:    [[TMP35:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
+; AVX1-NEXT:    [[TMP36:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1
+; AVX1-NEXT:    [[R13:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP35]], i32 [[TMP36]], i32 3)
+; AVX1-NEXT:    [[TMP37:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; AVX1-NEXT:    [[TMP38:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2
+; AVX1-NEXT:    [[R14:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP37]], i32 [[TMP38]], i32 3)
+; AVX1-NEXT:    [[TMP39:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; AVX1-NEXT:    [[TMP40:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3
+; AVX1-NEXT:    [[R15:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP39]], i32 [[TMP40]], i32 3)
+; AVX1-NEXT:    [[TMP41:%.*]] = insertelement <4 x i32> poison, i32 [[R0]], i32 0
+; AVX1-NEXT:    [[TMP42:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[R1]], i32 1
+; AVX1-NEXT:    [[TMP43:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[R2]], i32 2
+; AVX1-NEXT:    [[TMP44:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[R3]], i32 3
+; AVX1-NEXT:    store <4 x i32> [[TMP44]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP45:%.*]] = insertelement <4 x i32> poison, i32 [[R4]], i32 0
+; AVX1-NEXT:    [[TMP46:%.*]] = insertelement <4 x i32> [[TMP45]], i32 [[R5]], i32 1
+; AVX1-NEXT:    [[TMP47:%.*]] = insertelement <4 x i32> [[TMP46]], i32 [[R6]], i32 2
+; AVX1-NEXT:    [[TMP48:%.*]] = insertelement <4 x i32> [[TMP47]], i32 [[R7]], i32 3
+; AVX1-NEXT:    store <4 x i32> [[TMP48]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP49:%.*]] = insertelement <4 x i32> poison, i32 [[R8]], i32 0
+; AVX1-NEXT:    [[TMP50:%.*]] = insertelement <4 x i32> [[TMP49]], i32 [[R9]], i32 1
+; AVX1-NEXT:    [[TMP51:%.*]] = insertelement <4 x i32> [[TMP50]], i32 [[R10]], i32 2
+; AVX1-NEXT:    [[TMP52:%.*]] = insertelement <4 x i32> [[TMP51]], i32 [[R11]], i32 3
+; AVX1-NEXT:    store <4 x i32> [[TMP52]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP53:%.*]] = insertelement <4 x i32> poison, i32 [[R12]], i32 0
+; AVX1-NEXT:    [[TMP54:%.*]] = insertelement <4 x i32> [[TMP53]], i32 [[R13]], i32 1
+; AVX1-NEXT:    [[TMP55:%.*]] = insertelement <4 x i32> [[TMP54]], i32 [[R14]], i32 2
+; AVX1-NEXT:    [[TMP56:%.*]] = insertelement <4 x i32> [[TMP55]], i32 [[R15]], i32 3
+; AVX1-NEXT:    store <4 x i32> [[TMP56]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; AVX1-NEXT:    ret void
 ;
 ; AVX2-LABEL: @umul_v16i32(
@@ -1296,6 +1355,13 @@
 ; AVX512-NEXT:    store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
 ; AVX512-NEXT:    ret void
 ;
+; SKX-LABEL: @umul_v16i32(
+; SKX-NEXT:    [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; SKX-NEXT:    [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; SKX-NEXT:    [[TMP3:%.*]] = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]], i32 3)
+; SKX-NEXT:    store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; SKX-NEXT:    ret void
+;
 ; AVX256BW-LABEL: @umul_v16i32(
 ; AVX256BW-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
 ; AVX256BW-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
@@ -1430,6 +1496,13 @@
 ; AVX512-NEXT:    [[TMP3:%.*]] = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]], i32 3)
 ; AVX512-NEXT:    store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
 ; AVX512-NEXT:    ret void
+;
+; SKX-LABEL: @umul_v32i16(
+; SKX-NEXT:    [[TMP1:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
+; SKX-NEXT:    [[TMP2:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
+; SKX-NEXT:    [[TMP3:%.*]] = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]], i32 3)
+; SKX-NEXT:    store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
+; SKX-NEXT:    ret void
 ;
   %a0  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
   %a1  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
@@ -1618,6 +1691,13 @@
 ; AVX512-NEXT:    [[TMP3:%.*]] = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]], i32 3)
 ; AVX512-NEXT:    store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
 ; AVX512-NEXT:    ret void
+;
+; SKX-LABEL: @umul_v64i8(
+; SKX-NEXT:    [[TMP1:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @a8 to <64 x i8>*), align 1
+; SKX-NEXT:    [[TMP2:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @b8 to <64 x i8>*), align 1
+; SKX-NEXT:    [[TMP3:%.*]] = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]], i32 3)
+; SKX-NEXT:    store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
+; SKX-NEXT:    ret void
 ;
   %a0  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
   %a1  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
Index: llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
@@ -103,18 +103,20 @@
 ; AVX1-NEXT:    ret void
 ;
 ; AVX2-LABEL: @ctlz_4i32(
-; AVX2-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
-; AVX2-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
-; AVX2-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
-; AVX2-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
-; AVX2-NEXT:    [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
-; AVX2-NEXT:    [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
-; AVX2-NEXT:    [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
-; AVX2-NEXT:    [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
-; AVX2-NEXT:    store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
-; AVX2-NEXT:    store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
-; AVX2-NEXT:    store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
-; AVX2-NEXT:    store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
+; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
+; AVX2-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; AVX2-NEXT:    [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP2]], i1 false)
+; AVX2-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; AVX2-NEXT:    [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP3]], i1 false)
+; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; AVX2-NEXT:    [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP4]], i1 false)
+; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; AVX2-NEXT:    [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP5]], i1 false)
+; AVX2-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[CTLZ0]], i32 0
+; AVX2-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTLZ1]], i32 1
+; AVX2-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTLZ2]], i32 2
+; AVX2-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[CTLZ3]], i32 3
+; AVX2-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
 ; AVX2-NEXT:    ret void
 ;
   %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
@@ -559,18 +561,20 @@
 ; AVX1-NEXT:    ret void
 ;
 ; AVX2-LABEL: @ctlz_undef_4i32(
-; AVX2-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
-; AVX2-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
-; AVX2-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
-; AVX2-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
-; AVX2-NEXT:    [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 true)
-; AVX2-NEXT:    [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 true)
-; AVX2-NEXT:    [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 true)
-; AVX2-NEXT:    [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 true)
-; AVX2-NEXT:    store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
-; AVX2-NEXT:    store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
-; AVX2-NEXT:    store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
-; AVX2-NEXT:    store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
+; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
+; AVX2-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; AVX2-NEXT:    [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP2]], i1 true)
+; AVX2-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; AVX2-NEXT:    [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP3]], i1 true)
+; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; AVX2-NEXT:    [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP4]], i1 true)
+; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; AVX2-NEXT:    [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP5]], i1 true)
+; AVX2-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[CTLZ0]], i32 0
+; AVX2-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTLZ1]], i32 1
+; AVX2-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTLZ2]], i32 2
+; AVX2-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[CTLZ3]], i32 3
+; AVX2-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
 ; AVX2-NEXT:    ret void
 ;
   %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
Index: llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll
@@ -123,33 +123,37 @@
 ; SSE2-NEXT:    ret void
 ;
 ; SSE42-LABEL: @ctpop_4i32(
-; SSE42-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
-; SSE42-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
-; SSE42-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
-; SSE42-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
-; SSE42-NEXT:    [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]])
-; SSE42-NEXT:    [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]])
-; SSE42-NEXT:    [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]])
-; SSE42-NEXT:    [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]])
-; SSE42-NEXT:    store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
-; SSE42-NEXT:    store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
-; SSE42-NEXT:    store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
-; SSE42-NEXT:    store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
+; SSE42-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
+; SSE42-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; SSE42-NEXT:    [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP2]])
+; SSE42-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; SSE42-NEXT:    [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP3]])
+; SSE42-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; SSE42-NEXT:    [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP4]])
+; SSE42-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; SSE42-NEXT:    [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP5]])
+; SSE42-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[CTPOP0]], i32 0
+; SSE42-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTPOP1]], i32 1
+; SSE42-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTPOP2]], i32 2
+; SSE42-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[CTPOP3]], i32 3
+; SSE42-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
 ; SSE42-NEXT:    ret void
 ;
 ; AVX-LABEL: @ctpop_4i32(
-; AVX-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
-; AVX-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
-; AVX-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
-; AVX-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
-; AVX-NEXT:    [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]])
-; AVX-NEXT:    [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]])
-; AVX-NEXT:    [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]])
-; AVX-NEXT:    [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]])
-; AVX-NEXT:    store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
-; AVX-NEXT:    store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
-; AVX-NEXT:    store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
-; AVX-NEXT:    store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
+; AVX-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; AVX-NEXT:    [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP2]])
+; AVX-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; AVX-NEXT:    [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP3]])
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; AVX-NEXT:    [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP4]])
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; AVX-NEXT:    [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP5]])
+; AVX-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[CTPOP0]], i32 0
+; AVX-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTPOP1]], i32 1
+; AVX-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTPOP2]], i32 2
+; AVX-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[CTPOP3]], i32 3
+; AVX-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
 ; AVX-NEXT:    ret void
 ;
   %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
@@ -178,57 +182,63 @@
 ; SSE2-NEXT:    ret void
 ;
 ; SSE42-LABEL: @ctpop_8i32(
-; SSE42-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
-; SSE42-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
-; SSE42-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
-; SSE42-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
-; SSE42-NEXT:    [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
-; SSE42-NEXT:    [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
-; SSE42-NEXT:    [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
-; SSE42-NEXT:    [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
-; SSE42-NEXT:    [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]])
-; SSE42-NEXT:    [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]])
-; SSE42-NEXT:    [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]])
-; SSE42-NEXT:    [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]])
-; SSE42-NEXT:    [[CTPOP4:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD4]])
-; SSE42-NEXT:    [[CTPOP5:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD5]])
-; SSE42-NEXT:    [[CTPOP6:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD6]])
-; SSE42-NEXT:    [[CTPOP7:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD7]])
-; SSE42-NEXT:    store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
-; SSE42-NEXT:    store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
-; SSE42-NEXT:    store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
-; SSE42-NEXT:    store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
-; SSE42-NEXT:    store i32 [[CTPOP4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
-; SSE42-NEXT:    store i32 [[CTPOP5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
-; SSE42-NEXT:    store i32 [[CTPOP6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
-; SSE42-NEXT:    store i32 [[CTPOP7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
+; SSE42-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
+; SSE42-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE42-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; SSE42-NEXT:    [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP3]])
+; SSE42-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; SSE42-NEXT:    [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP4]])
+; SSE42-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; SSE42-NEXT:    [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP5]])
+; SSE42-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; SSE42-NEXT:    [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP6]])
+; SSE42-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; SSE42-NEXT:    [[CTPOP4:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP7]])
+; SSE42-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
+; SSE42-NEXT:    [[CTPOP5:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP8]])
+; SSE42-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
+; SSE42-NEXT:    [[CTPOP6:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP9]])
+; SSE42-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; SSE42-NEXT:    [[CTPOP7:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP10]])
+; SSE42-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[CTPOP0]], i32 0
+; SSE42-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CTPOP1]], i32 1
+; SSE42-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CTPOP2]], i32 2
+; SSE42-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[CTPOP3]], i32 3
+; SSE42-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
+; SSE42-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> poison, i32 [[CTPOP4]], i32 0
+; SSE42-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[CTPOP5]], i32 1
+; SSE42-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[CTPOP6]], i32 2
+; SSE42-NEXT:    [[TMP18:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[CTPOP7]], i32 3
+; SSE42-NEXT:    store <4 x i32> [[TMP18]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
 ; SSE42-NEXT:    ret void
 ;
 ; AVX1-LABEL: @ctpop_8i32(
-; AVX1-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
-; AVX1-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
-; AVX1-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
-; AVX1-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
-; AVX1-NEXT:    [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
-; AVX1-NEXT:    [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
-; AVX1-NEXT:    [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
-; AVX1-NEXT:    [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
-; AVX1-NEXT:    [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]])
-; AVX1-NEXT:    [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]])
-; AVX1-NEXT:    [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]])
-; AVX1-NEXT:    [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]])
-; AVX1-NEXT:    [[CTPOP4:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD4]])
-; AVX1-NEXT:    [[CTPOP5:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD5]])
-; AVX1-NEXT:    [[CTPOP6:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD6]])
-; AVX1-NEXT:    [[CTPOP7:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD7]])
-; AVX1-NEXT:    store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
-; AVX1-NEXT:    store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
-; AVX1-NEXT:    store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
-; AVX1-NEXT:    store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
-; AVX1-NEXT:    store i32 [[CTPOP4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
-; AVX1-NEXT:    store i32 [[CTPOP5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
-; AVX1-NEXT:    store i32 [[CTPOP6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
-; AVX1-NEXT:    store i32 [[CTPOP7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
+; AVX1-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
+; AVX1-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0
+; AVX1-NEXT:    [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP2]])
+; AVX1-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1
+; AVX1-NEXT:    [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP3]])
+; AVX1-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2
+; AVX1-NEXT:    [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP4]])
+; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3
+; AVX1-NEXT:    [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP5]])
+; AVX1-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4
+; AVX1-NEXT:    [[CTPOP4:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP6]])
+; AVX1-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5
+; AVX1-NEXT:    [[CTPOP5:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP7]])
+; AVX1-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6
+; AVX1-NEXT:    [[CTPOP6:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP8]])
+; AVX1-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7
+; AVX1-NEXT:    [[CTPOP7:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP9]])
+; AVX1-NEXT:    [[TMP10:%.*]] = insertelement <8 x i32> poison, i32 [[CTPOP0]], i32 0
+; AVX1-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[CTPOP1]], i32 1
+; AVX1-NEXT:    [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[CTPOP2]], i32 2
+; AVX1-NEXT:    [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[CTPOP3]], i32 3
+; AVX1-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[CTPOP4]], i32 4
+; AVX1-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[CTPOP5]], i32 5
+; AVX1-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[CTPOP6]], i32 6
+; AVX1-NEXT:    [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[CTPOP7]], i32 7
+; AVX1-NEXT:    store <8 x i32> [[TMP17]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2
 ; AVX1-NEXT:    ret void
 ;
 ; AVX2-LABEL: @ctpop_8i32(
Index: llvm/test/Transforms/SLPVectorizer/X86/cttz.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/cttz.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/cttz.ll
@@ -103,18 +103,20 @@
 ; AVX1-NEXT:    ret void
 ;
 ; AVX2-LABEL: @cttz_4i32(
-; AVX2-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
-; AVX2-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
-; AVX2-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
-; AVX2-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
-; AVX2-NEXT:    [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 false)
-; AVX2-NEXT:    [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 false)
-; AVX2-NEXT:    [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 false)
-; AVX2-NEXT:    [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 false)
-; AVX2-NEXT:    store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
-; AVX2-NEXT:    store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
-; AVX2-NEXT:    store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
-; AVX2-NEXT:    store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
+; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
+; AVX2-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; AVX2-NEXT:    [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP2]], i1 false)
+; AVX2-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; AVX2-NEXT:    [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP3]], i1 false)
+; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; AVX2-NEXT:    [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP4]], i1 false)
+; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; AVX2-NEXT:    [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP5]], i1 false)
+; AVX2-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[CTTZ0]], i32 0
+; AVX2-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTTZ1]], i32 1
+; AVX2-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTTZ2]], i32 2
+; AVX2-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[CTTZ3]], i32 3
+; AVX2-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
 ; AVX2-NEXT:    ret void
 ;
   %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
@@ -559,18 +561,20 @@
 ; AVX1-NEXT:    ret void
 ;
 ; AVX2-LABEL: @cttz_undef_4i32(
-; AVX2-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
-; AVX2-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
-; AVX2-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
-; AVX2-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
-; AVX2-NEXT:    [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 true)
-; AVX2-NEXT:    [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 true)
-; AVX2-NEXT:    [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 true)
-; AVX2-NEXT:    [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 true)
-; AVX2-NEXT:    store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
-; AVX2-NEXT:    store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
-; AVX2-NEXT:    store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
-; AVX2-NEXT:    store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
+; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
+; AVX2-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; AVX2-NEXT:    [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP2]], i1 true)
+; AVX2-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; AVX2-NEXT:    [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP3]], i1 true)
+; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; AVX2-NEXT:    [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP4]], i1 true)
+; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; AVX2-NEXT:    [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP5]], i1 true)
+; AVX2-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[CTTZ0]], i32 0
+; AVX2-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTTZ1]], i32 1
+; AVX2-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTTZ2]], i32 2
+; AVX2-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[CTTZ3]], i32 3
+; AVX2-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
 ; AVX2-NEXT:    ret void
 ;
   %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
Index: llvm/test/Transforms/SLPVectorizer/X86/fma.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/fma.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/fma.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=NO-FMA
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=NO-FMA
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=NO-FMA-I7
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=FMA --check-prefix=FMA256
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=FMA --check-prefix=FMA256
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=FMA-AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=FMA --check-prefix=FMA512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=FMA --check-prefix=FMA256
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=FMA --check-prefix=FMA256AVX512
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -26,18 +26,39 @@
 
 define void @fma_2f64() #0 {
 ; NO-FMA-LABEL: @fma_2f64(
-; NO-FMA-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 8
-; NO-FMA-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 8
-; NO-FMA-NEXT:    [[B0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 8
-; NO-FMA-NEXT:    [[B1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 8
-; NO-FMA-NEXT:    [[C0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 0), align 8
-; NO-FMA-NEXT:    [[C1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 1), align 8
-; NO-FMA-NEXT:    [[FMA0:%.*]] = call double @llvm.fma.f64(double [[A0]], double [[B0]], double [[C0]])
-; NO-FMA-NEXT:    [[FMA1:%.*]] = call double @llvm.fma.f64(double [[A1]], double [[B1]], double [[C1]])
-; NO-FMA-NEXT:    store double [[FMA0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
-; NO-FMA-NEXT:    store double [[FMA1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; NO-FMA-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8
+; NO-FMA-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8
+; NO-FMA-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 8
+; NO-FMA-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; NO-FMA-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; NO-FMA-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; NO-FMA-NEXT:    [[FMA0:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[TMP5]], double [[TMP6]])
+; NO-FMA-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; NO-FMA-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; NO-FMA-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; NO-FMA-NEXT:    [[FMA1:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP9]])
+; NO-FMA-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> poison, double [[FMA0]], i32 0
+; NO-FMA-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[FMA1]], i32 1
+; NO-FMA-NEXT:    store <2 x double> [[TMP11]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
 ; NO-FMA-NEXT:    ret void
 ;
+; NO-FMA-I7-LABEL: @fma_2f64(
+; NO-FMA-I7-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8
+; NO-FMA-I7-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8
+; NO-FMA-I7-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 8
+; NO-FMA-I7-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; NO-FMA-I7-NEXT:    [[FMA0:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[TMP5]], double [[TMP6]])
+; NO-FMA-I7-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; NO-FMA-I7-NEXT:    [[FMA1:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP9]])
+; NO-FMA-I7-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> poison, double [[FMA0]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[FMA1]], i32 1
+; NO-FMA-I7-NEXT:    store <2 x double> [[TMP11]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; NO-FMA-I7-NEXT:    ret void
+;
 ; FMA-LABEL: @fma_2f64(
 ; FMA-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8
 ; FMA-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8
@@ -45,6 +66,14 @@
 ; FMA-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]])
 ; FMA-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
 ; FMA-NEXT:    ret void
+;
+; FMA-AVX2-LABEL: @fma_2f64(
+; FMA-AVX2-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8
+; FMA-AVX2-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8
+; FMA-AVX2-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 8
+; FMA-AVX2-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]])
+; FMA-AVX2-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; FMA-AVX2-NEXT:    ret void
 ;
   %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 8
   %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 8
@@ -61,28 +90,67 @@
 
 define void @fma_4f64() #0 {
 ; NO-FMA-LABEL: @fma_4f64(
-; NO-FMA-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 8
-; NO-FMA-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 8
-; NO-FMA-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 8
-; NO-FMA-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 3), align 8
-; NO-FMA-NEXT:    [[B0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 8
-; NO-FMA-NEXT:    [[B1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 8
-; NO-FMA-NEXT:    [[B2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2), align 8
-; NO-FMA-NEXT:    [[B3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 3), align 8
-; NO-FMA-NEXT:    [[C0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 0), align 8
-; NO-FMA-NEXT:    [[C1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 1), align 8
-; NO-FMA-NEXT:    [[C2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2), align 8
-; NO-FMA-NEXT:    [[C3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 3), align 8
-; NO-FMA-NEXT:    [[FMA0:%.*]] = call double @llvm.fma.f64(double [[A0]], double [[B0]], double [[C0]])
-; NO-FMA-NEXT:    [[FMA1:%.*]] = call double @llvm.fma.f64(double [[A1]], double [[B1]], double [[C1]])
-; NO-FMA-NEXT:    [[FMA2:%.*]] = call double @llvm.fma.f64(double [[A2]], double [[B2]], double [[C2]])
-; NO-FMA-NEXT:    [[FMA3:%.*]] = call double @llvm.fma.f64(double [[A3]], double [[B3]], double [[C3]])
-; NO-FMA-NEXT:    store double [[FMA0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
-; NO-FMA-NEXT:    store double [[FMA1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; NO-FMA-NEXT:    store double [[FMA2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
-; NO-FMA-NEXT:    store double [[FMA3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; NO-FMA-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8
+; NO-FMA-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8
+; NO-FMA-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8
+; NO-FMA-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8
+; NO-FMA-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 8
+; NO-FMA-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2) to <2 x double>*), align 8
+; NO-FMA-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; NO-FMA-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; NO-FMA-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+; NO-FMA-NEXT:    [[FMA0:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP9]])
+; NO-FMA-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; NO-FMA-NEXT:    [[TMP11:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; NO-FMA-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
+; NO-FMA-NEXT:    [[FMA1:%.*]] = call double @llvm.fma.f64(double [[TMP10]], double [[TMP11]], double [[TMP12]])
+; NO-FMA-NEXT:    [[TMP13:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; NO-FMA-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; NO-FMA-NEXT:    [[TMP15:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
+; NO-FMA-NEXT:    [[FMA2:%.*]] = call double @llvm.fma.f64(double [[TMP13]], double [[TMP14]], double [[TMP15]])
+; NO-FMA-NEXT:    [[TMP16:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; NO-FMA-NEXT:    [[TMP17:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; NO-FMA-NEXT:    [[TMP18:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+; NO-FMA-NEXT:    [[FMA3:%.*]] = call double @llvm.fma.f64(double [[TMP16]], double [[TMP17]], double [[TMP18]])
+; NO-FMA-NEXT:    [[TMP19:%.*]] = insertelement <2 x double> poison, double [[FMA0]], i32 0
+; NO-FMA-NEXT:    [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[FMA1]], i32 1
+; NO-FMA-NEXT:    store <2 x double> [[TMP20]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; NO-FMA-NEXT:    [[TMP21:%.*]] = insertelement <2 x double> poison, double [[FMA2]], i32 0
+; NO-FMA-NEXT:    [[TMP22:%.*]] = insertelement <2 x double> [[TMP21]], double [[FMA3]], i32 1
+; NO-FMA-NEXT:    store <2 x double> [[TMP22]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
 ; NO-FMA-NEXT:    ret void
 ;
+; NO-FMA-I7-LABEL: @fma_4f64(
+; NO-FMA-I7-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8
+; NO-FMA-I7-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8
+; NO-FMA-I7-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8
+; NO-FMA-I7-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8
+; NO-FMA-I7-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 8
+; NO-FMA-I7-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2) to <2 x double>*), align 8
+; NO-FMA-I7-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+; NO-FMA-I7-NEXT:    [[FMA0:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP9]])
+; NO-FMA-I7-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP11:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
+; NO-FMA-I7-NEXT:    [[FMA1:%.*]] = call double @llvm.fma.f64(double [[TMP10]], double [[TMP11]], double [[TMP12]])
+; NO-FMA-I7-NEXT:    [[TMP13:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP15:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
+; NO-FMA-I7-NEXT:    [[FMA2:%.*]] = call double @llvm.fma.f64(double [[TMP13]], double [[TMP14]], double [[TMP15]])
+; NO-FMA-I7-NEXT:    [[TMP16:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP17:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP18:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+; NO-FMA-I7-NEXT:    [[FMA3:%.*]] = call double @llvm.fma.f64(double [[TMP16]], double [[TMP17]], double [[TMP18]])
+; NO-FMA-I7-NEXT:    [[TMP19:%.*]] = insertelement <2 x double> poison, double [[FMA0]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[FMA1]], i32 1
+; NO-FMA-I7-NEXT:    store <2 x double> [[TMP20]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; NO-FMA-I7-NEXT:    [[TMP21:%.*]] = insertelement <2 x double> poison, double [[FMA2]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP22:%.*]] = insertelement <2 x double> [[TMP21]], double [[FMA3]], i32 1
+; NO-FMA-I7-NEXT:    store <2 x double> [[TMP22]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
+; NO-FMA-I7-NEXT:    ret void
+;
 ; FMA-LABEL: @fma_4f64(
 ; FMA-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 8
 ; FMA-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 8
@@ -90,6 +158,14 @@
 ; FMA-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]], <4 x double> [[TMP3]])
 ; FMA-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
 ; FMA-NEXT:    ret void
+;
+; FMA-AVX2-LABEL: @fma_4f64(
+; FMA-AVX2-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 8
+; FMA-AVX2-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 8
+; FMA-AVX2-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcC64 to <4 x double>*), align 8
+; FMA-AVX2-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]], <4 x double> [[TMP3]])
+; FMA-AVX2-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; FMA-AVX2-NEXT:    ret void
 ;
   %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 8
   %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 8
@@ -116,48 +192,123 @@
 
 define void @fma_8f64() #0 {
 ; NO-FMA-LABEL: @fma_8f64(
-; NO-FMA-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 4
-; NO-FMA-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 4
-; NO-FMA-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 4
-; NO-FMA-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 3), align 4
-; NO-FMA-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4), align 4
-; NO-FMA-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 5), align 4
-; NO-FMA-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6), align 4
-; NO-FMA-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 7), align 4
-; NO-FMA-NEXT:    [[B0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 4
-; NO-FMA-NEXT:    [[B1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 4
-; NO-FMA-NEXT:    [[B2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2), align 4
-; NO-FMA-NEXT:    [[B3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 3), align 4
-; NO-FMA-NEXT:    [[B4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4), align 4
-; NO-FMA-NEXT:    [[B5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 5), align 4
-; NO-FMA-NEXT:    [[B6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6), align 4
-; NO-FMA-NEXT:    [[B7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 7), align 4
-; NO-FMA-NEXT:    [[C0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 0), align 4
-; NO-FMA-NEXT:    [[C1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 1), align 4
-; NO-FMA-NEXT:    [[C2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2), align 4
-; NO-FMA-NEXT:    [[C3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 3), align 4
-; NO-FMA-NEXT:    [[C4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4), align 4
-; NO-FMA-NEXT:    [[C5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 5), align 4
-; NO-FMA-NEXT:    [[C6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 6), align 4
-; NO-FMA-NEXT:    [[C7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 7), align 4
-; NO-FMA-NEXT:    [[FMA0:%.*]] = call double @llvm.fma.f64(double [[A0]], double [[B0]], double [[C0]])
-; NO-FMA-NEXT:    [[FMA1:%.*]] = call double @llvm.fma.f64(double [[A1]], double [[B1]], double [[C1]])
-; NO-FMA-NEXT:    [[FMA2:%.*]] = call double @llvm.fma.f64(double [[A2]], double [[B2]], double [[C2]])
-; NO-FMA-NEXT:    [[FMA3:%.*]] = call double @llvm.fma.f64(double [[A3]], double [[B3]], double [[C3]])
-; NO-FMA-NEXT:    [[FMA4:%.*]] = call double @llvm.fma.f64(double [[A4]], double [[B4]], double [[C4]])
-; NO-FMA-NEXT:    [[FMA5:%.*]] = call double @llvm.fma.f64(double [[A5]], double [[B5]], double [[C5]])
-; NO-FMA-NEXT:    [[FMA6:%.*]] = call double @llvm.fma.f64(double [[A6]], double [[B6]], double [[C6]])
-; NO-FMA-NEXT:    [[FMA7:%.*]] = call double @llvm.fma.f64(double [[A7]], double [[B7]], double [[C7]])
-; NO-FMA-NEXT:    store double [[FMA0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 4
-; NO-FMA-NEXT:    store double [[FMA1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 4
-; NO-FMA-NEXT:    store double [[FMA2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 4
-; NO-FMA-NEXT:    store double [[FMA3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 4
-; NO-FMA-NEXT:    store double [[FMA4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 4
-; NO-FMA-NEXT:    store double [[FMA5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 4
-; NO-FMA-NEXT:    store double [[FMA6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 4
-; NO-FMA-NEXT:    store double [[FMA7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 4
+; NO-FMA-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 4
+; NO-FMA-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4
+; NO-FMA-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4
+; NO-FMA-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4
+; NO-FMA-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4
+; NO-FMA-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4
+; NO-FMA-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4
+; NO-FMA-NEXT:    [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4
+; NO-FMA-NEXT:    [[TMP9:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 4
+; NO-FMA-NEXT:    [[TMP10:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2) to <2 x double>*), align 4
+; NO-FMA-NEXT:    [[TMP11:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <2 x double>*), align 4
+; NO-FMA-NEXT:    [[TMP12:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 6) to <2 x double>*), align 4
+; NO-FMA-NEXT:    [[TMP13:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; NO-FMA-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+; NO-FMA-NEXT:    [[TMP15:%.*]] = extractelement <2 x double> [[TMP9]], i32 0
+; NO-FMA-NEXT:    [[FMA0:%.*]] = call double @llvm.fma.f64(double [[TMP13]], double [[TMP14]], double [[TMP15]])
+; NO-FMA-NEXT:    [[TMP16:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; NO-FMA-NEXT:    [[TMP17:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
+; NO-FMA-NEXT:    [[TMP18:%.*]] = extractelement <2 x double> [[TMP9]], i32 1
+; NO-FMA-NEXT:    [[FMA1:%.*]] = call double @llvm.fma.f64(double [[TMP16]], double [[TMP17]], double [[TMP18]])
+; NO-FMA-NEXT:    [[TMP19:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; NO-FMA-NEXT:    [[TMP20:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
+; NO-FMA-NEXT:    [[TMP21:%.*]] = extractelement <2 x double> [[TMP10]], i32 0
+; NO-FMA-NEXT:    [[FMA2:%.*]] = call double @llvm.fma.f64(double [[TMP19]], double [[TMP20]], double [[TMP21]])
+; NO-FMA-NEXT:    [[TMP22:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; NO-FMA-NEXT:    [[TMP23:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+; NO-FMA-NEXT:    [[TMP24:%.*]] = extractelement <2 x double> [[TMP10]], i32 1
+; NO-FMA-NEXT:    [[FMA3:%.*]] = call double @llvm.fma.f64(double [[TMP22]], double [[TMP23]], double [[TMP24]])
+; NO-FMA-NEXT:    [[TMP25:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; NO-FMA-NEXT:    [[TMP26:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
+; NO-FMA-NEXT:    [[TMP27:%.*]] = extractelement <2 x double> [[TMP11]], i32 0
+; NO-FMA-NEXT:    [[FMA4:%.*]] = call double @llvm.fma.f64(double [[TMP25]], double [[TMP26]], double [[TMP27]])
+; NO-FMA-NEXT:    [[TMP28:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; NO-FMA-NEXT:    [[TMP29:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
+; NO-FMA-NEXT:    [[TMP30:%.*]] = extractelement <2 x double> [[TMP11]], i32 1
+; NO-FMA-NEXT:    [[FMA5:%.*]] = call double @llvm.fma.f64(double [[TMP28]], double [[TMP29]], double [[TMP30]])
+; NO-FMA-NEXT:    [[TMP31:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; NO-FMA-NEXT:    [[TMP32:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
+; NO-FMA-NEXT:    [[TMP33:%.*]] = extractelement <2 x double> [[TMP12]], i32 0
+; NO-FMA-NEXT:    [[FMA6:%.*]] = call double @llvm.fma.f64(double [[TMP31]], double [[TMP32]], double [[TMP33]])
+; NO-FMA-NEXT:    [[TMP34:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; NO-FMA-NEXT:    [[TMP35:%.*]] = extractelement <2 x double> [[TMP8]], i32 1
+; NO-FMA-NEXT:    [[TMP36:%.*]] = extractelement <2 x double> [[TMP12]], i32 1
+; NO-FMA-NEXT:    [[FMA7:%.*]] = call double @llvm.fma.f64(double [[TMP34]], double [[TMP35]], double [[TMP36]])
+; NO-FMA-NEXT:    [[TMP37:%.*]] = insertelement <2 x double> poison, double [[FMA0]], i32 0
+; NO-FMA-NEXT:    [[TMP38:%.*]] = insertelement <2 x double> [[TMP37]], double [[FMA1]], i32 1
+; NO-FMA-NEXT:    store <2 x double> [[TMP38]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4
+; NO-FMA-NEXT:    [[TMP39:%.*]] = insertelement <2 x double> poison, double [[FMA2]], i32 0
+; NO-FMA-NEXT:    [[TMP40:%.*]] = insertelement <2 x double> [[TMP39]], double [[FMA3]], i32 1
+; NO-FMA-NEXT:    store <2 x double> [[TMP40]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4
+; NO-FMA-NEXT:    [[TMP41:%.*]] = insertelement <2 x double> poison, double [[FMA4]], i32 0
+; NO-FMA-NEXT:    [[TMP42:%.*]] = insertelement <2 x double> [[TMP41]], double [[FMA5]], i32 1
+; NO-FMA-NEXT:    store <2 x double> [[TMP42]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4
+; NO-FMA-NEXT:    [[TMP43:%.*]] = insertelement <2 x double> poison, double [[FMA6]], i32 0
+; NO-FMA-NEXT:    [[TMP44:%.*]] = insertelement <2 x double> [[TMP43]], double [[FMA7]], i32 1
+; NO-FMA-NEXT:    store <2 x double> [[TMP44]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4
 ; NO-FMA-NEXT:    ret void
 ;
+; NO-FMA-I7-LABEL: @fma_8f64(
+; NO-FMA-I7-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP9:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP10:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2) to <2 x double>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP11:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <2 x double>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP12:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 6) to <2 x double>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP13:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP15:%.*]] = extractelement <2 x double> [[TMP9]], i32 0
+; NO-FMA-I7-NEXT:    [[FMA0:%.*]] = call double @llvm.fma.f64(double [[TMP13]], double [[TMP14]], double [[TMP15]])
+; NO-FMA-I7-NEXT:    [[TMP16:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP17:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP18:%.*]] = extractelement <2 x double> [[TMP9]], i32 1
+; NO-FMA-I7-NEXT:    [[FMA1:%.*]] = call double @llvm.fma.f64(double [[TMP16]], double [[TMP17]], double [[TMP18]])
+; NO-FMA-I7-NEXT:    [[TMP19:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP20:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP21:%.*]] = extractelement <2 x double> [[TMP10]], i32 0
+; NO-FMA-I7-NEXT:    [[FMA2:%.*]] = call double @llvm.fma.f64(double [[TMP19]], double [[TMP20]], double [[TMP21]])
+; NO-FMA-I7-NEXT:    [[TMP22:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP23:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP24:%.*]] = extractelement <2 x double> [[TMP10]], i32 1
+; NO-FMA-I7-NEXT:    [[FMA3:%.*]] = call double @llvm.fma.f64(double [[TMP22]], double [[TMP23]], double [[TMP24]])
+; NO-FMA-I7-NEXT:    [[TMP25:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP26:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP27:%.*]] = extractelement <2 x double> [[TMP11]], i32 0
+; NO-FMA-I7-NEXT:    [[FMA4:%.*]] = call double @llvm.fma.f64(double [[TMP25]], double [[TMP26]], double [[TMP27]])
+; NO-FMA-I7-NEXT:    [[TMP28:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP29:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP30:%.*]] = extractelement <2 x double> [[TMP11]], i32 1
+; NO-FMA-I7-NEXT:    [[FMA5:%.*]] = call double @llvm.fma.f64(double [[TMP28]], double [[TMP29]], double [[TMP30]])
+; NO-FMA-I7-NEXT:    [[TMP31:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP32:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP33:%.*]] = extractelement <2 x double> [[TMP12]], i32 0
+; NO-FMA-I7-NEXT:    [[FMA6:%.*]] = call double @llvm.fma.f64(double [[TMP31]], double [[TMP32]], double [[TMP33]])
+; NO-FMA-I7-NEXT:    [[TMP34:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP35:%.*]] = extractelement <2 x double> [[TMP8]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP36:%.*]] = extractelement <2 x double> [[TMP12]], i32 1
+; NO-FMA-I7-NEXT:    [[FMA7:%.*]] = call double @llvm.fma.f64(double [[TMP34]], double [[TMP35]], double [[TMP36]])
+; NO-FMA-I7-NEXT:    [[TMP37:%.*]] = insertelement <2 x double> poison, double [[FMA0]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP38:%.*]] = insertelement <2 x double> [[TMP37]], double [[FMA1]], i32 1
+; NO-FMA-I7-NEXT:    store <2 x double> [[TMP38]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP39:%.*]] = insertelement <2 x double> poison, double [[FMA2]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP40:%.*]] = insertelement <2 x double> [[TMP39]], double [[FMA3]], i32 1
+; NO-FMA-I7-NEXT:    store <2 x double> [[TMP40]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP41:%.*]] = insertelement <2 x double> poison, double [[FMA4]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP42:%.*]] = insertelement <2 x double> [[TMP41]], double [[FMA5]], i32 1
+; NO-FMA-I7-NEXT:    store <2 x double> [[TMP42]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP43:%.*]] = insertelement <2 x double> poison, double [[FMA6]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP44:%.*]] = insertelement <2 x double> [[TMP43]], double [[FMA7]], i32 1
+; NO-FMA-I7-NEXT:    store <2 x double> [[TMP44]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4
+; NO-FMA-I7-NEXT:    ret void
+;
 ; FMA256-LABEL: @fma_8f64(
 ; FMA256-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4
 ; FMA256-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4
@@ -171,6 +322,19 @@
 ; FMA256-NEXT:    store <4 x double> [[TMP8]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4
 ; FMA256-NEXT:    ret void
 ;
+; FMA-AVX2-LABEL: @fma_8f64(
+; FMA-AVX2-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4
+; FMA-AVX2-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4
+; FMA-AVX2-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4
+; FMA-AVX2-NEXT:    [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4
+; FMA-AVX2-NEXT:    [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcC64 to <4 x double>*), align 4
+; FMA-AVX2-NEXT:    [[TMP6:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <4 x double>*), align 4
+; FMA-AVX2-NEXT:    [[TMP7:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP3]], <4 x double> [[TMP5]])
+; FMA-AVX2-NEXT:    [[TMP8:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP4]], <4 x double> [[TMP6]])
+; FMA-AVX2-NEXT:    store <4 x double> [[TMP7]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
+; FMA-AVX2-NEXT:    store <4 x double> [[TMP8]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4
+; FMA-AVX2-NEXT:    ret void
+;
 ; FMA512-LABEL: @fma_8f64(
 ; FMA512-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @srcA64 to <8 x double>*), align 4
 ; FMA512-NEXT:    [[TMP2:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @srcB64 to <8 x double>*), align 4
@@ -179,6 +343,31 @@
 ; FMA512-NEXT:    store <8 x double> [[TMP4]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 4
 ; FMA512-NEXT:    ret void
 ;
+; FMA256AVX512-LABEL: @fma_8f64(
+; FMA256AVX512-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4
+; FMA256AVX512-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4
+; FMA256AVX512-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4
+; FMA256AVX512-NEXT:    [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4
+; FMA256AVX512-NEXT:    [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcC64 to <4 x double>*), align 4
+; FMA256AVX512-NEXT:    [[TMP6:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <4 x double>*), align 4
+; FMA256AVX512-NEXT:    [[TMP7:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP3]], <4 x double> [[TMP5]])
+; FMA256AVX512-NEXT:    [[TMP8:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP4]], <4 x double> [[TMP6]])
+; FMA256AVX512-NEXT:    store <4 x double> [[TMP7]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
+; FMA256AVX512-NEXT:    store <4 x double> [[TMP8]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4
+; FMA256AVX512-NEXT:    ret void
+;
+; FMA256-BDVER1-LABEL: @fma_8f64(
+; FMA256-BDVER1-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4
+; FMA256-BDVER1-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4
+; FMA256-BDVER1-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4
+; FMA256-BDVER1-NEXT:    [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4
+; FMA256-BDVER1-NEXT:    [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcC64 to <4 x double>*), align 4
+; FMA256-BDVER1-NEXT:    [[TMP6:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <4 x double>*), align 4
+; FMA256-BDVER1-NEXT:    [[TMP7:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP3]], <4 x double> [[TMP5]])
+; FMA256-BDVER1-NEXT:    [[TMP8:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP4]], <4 x double> [[TMP6]])
+; FMA256-BDVER1-NEXT:    store <4 x double> [[TMP7]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
+; FMA256-BDVER1-NEXT:    store <4 x double> [[TMP8]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4
+; FMA256-BDVER1-NEXT:    ret void
   %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 4
   %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 4
   %a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 4
@@ -224,28 +413,59 @@
 
 define void @fma_4f32() #0 {
 ; NO-FMA-LABEL: @fma_4f32(
-; NO-FMA-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4
-; NO-FMA-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4
-; NO-FMA-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4
-; NO-FMA-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4
-; NO-FMA-NEXT:    [[B0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4
-; NO-FMA-NEXT:    [[B1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4
-; NO-FMA-NEXT:    [[B2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4
-; NO-FMA-NEXT:    [[B3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4
-; NO-FMA-NEXT:    [[C0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 0), align 4
-; NO-FMA-NEXT:    [[C1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 1), align 4
-; NO-FMA-NEXT:    [[C2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 2), align 4
-; NO-FMA-NEXT:    [[C3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 3), align 4
-; NO-FMA-NEXT:    [[FMA0:%.*]] = call float @llvm.fma.f32(float [[A0]], float [[B0]], float [[C0]])
-; NO-FMA-NEXT:    [[FMA1:%.*]] = call float @llvm.fma.f32(float [[A1]], float [[B1]], float [[C1]])
-; NO-FMA-NEXT:    [[FMA2:%.*]] = call float @llvm.fma.f32(float [[A2]], float [[B2]], float [[C2]])
-; NO-FMA-NEXT:    [[FMA3:%.*]] = call float @llvm.fma.f32(float [[A3]], float [[B3]], float [[C3]])
-; NO-FMA-NEXT:    store float [[FMA0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
-; NO-FMA-NEXT:    store float [[FMA1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; NO-FMA-NEXT:    store float [[FMA2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
-; NO-FMA-NEXT:    store float [[FMA3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; NO-FMA-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4
+; NO-FMA-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
+; NO-FMA-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4
+; NO-FMA-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; NO-FMA-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; NO-FMA-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; NO-FMA-NEXT:    [[FMA0:%.*]] = call float @llvm.fma.f32(float [[TMP4]], float [[TMP5]], float [[TMP6]])
+; NO-FMA-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; NO-FMA-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; NO-FMA-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; NO-FMA-NEXT:    [[FMA1:%.*]] = call float @llvm.fma.f32(float [[TMP7]], float [[TMP8]], float [[TMP9]])
+; NO-FMA-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; NO-FMA-NEXT:    [[TMP11:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; NO-FMA-NEXT:    [[TMP12:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; NO-FMA-NEXT:    [[FMA2:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]])
+; NO-FMA-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; NO-FMA-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; NO-FMA-NEXT:    [[TMP15:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; NO-FMA-NEXT:    [[FMA3:%.*]] = call float @llvm.fma.f32(float [[TMP13]], float [[TMP14]], float [[TMP15]])
+; NO-FMA-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> poison, float [[FMA0]], i32 0
+; NO-FMA-NEXT:    [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[FMA1]], i32 1
+; NO-FMA-NEXT:    [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[FMA2]], i32 2
+; NO-FMA-NEXT:    [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float [[FMA3]], i32 3
+; NO-FMA-NEXT:    store <4 x float> [[TMP19]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
 ; NO-FMA-NEXT:    ret void
 ;
+; NO-FMA-I7-LABEL: @fma_4f32(
+; NO-FMA-I7-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; NO-FMA-I7-NEXT:    [[FMA0:%.*]] = call float @llvm.fma.f32(float [[TMP4]], float [[TMP5]], float [[TMP6]])
+; NO-FMA-I7-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; NO-FMA-I7-NEXT:    [[FMA1:%.*]] = call float @llvm.fma.f32(float [[TMP7]], float [[TMP8]], float [[TMP9]])
+; NO-FMA-I7-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; NO-FMA-I7-NEXT:    [[TMP11:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; NO-FMA-I7-NEXT:    [[TMP12:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; NO-FMA-I7-NEXT:    [[FMA2:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]])
+; NO-FMA-I7-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; NO-FMA-I7-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; NO-FMA-I7-NEXT:    [[TMP15:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; NO-FMA-I7-NEXT:    [[FMA3:%.*]] = call float @llvm.fma.f32(float [[TMP13]], float [[TMP14]], float [[TMP15]])
+; NO-FMA-I7-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> poison, float [[FMA0]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[FMA1]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[FMA2]], i32 2
+; NO-FMA-I7-NEXT:    [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float [[FMA3]], i32 3
+; NO-FMA-I7-NEXT:    store <4 x float> [[TMP19]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    ret void
+;
 ; FMA-LABEL: @fma_4f32(
 ; FMA-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4
 ; FMA-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
@@ -253,6 +473,14 @@
 ; FMA-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x float> [[TMP3]])
 ; FMA-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
 ; FMA-NEXT:    ret void
+;
+; FMA-AVX2-LABEL: @fma_4f32(
+; FMA-AVX2-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4
+; FMA-AVX2-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
+; FMA-AVX2-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4
+; FMA-AVX2-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x float> [[TMP3]])
+; FMA-AVX2-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; FMA-AVX2-NEXT:    ret void
 ;
   %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4
   %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4
@@ -279,48 +507,107 @@
 
 define void @fma_8f32() #0 {
 ; NO-FMA-LABEL: @fma_8f32(
-; NO-FMA-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4
-; NO-FMA-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4
-; NO-FMA-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4
-; NO-FMA-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4
-; NO-FMA-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4), align 4
-; NO-FMA-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 5), align 4
-; NO-FMA-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 6), align 4
-; NO-FMA-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 7), align 4
-; NO-FMA-NEXT:    [[B0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4
-; NO-FMA-NEXT:    [[B1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4
-; NO-FMA-NEXT:    [[B2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4
-; NO-FMA-NEXT:    [[B3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4
-; NO-FMA-NEXT:    [[B4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4), align 4
-; NO-FMA-NEXT:    [[B5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 5), align 4
-; NO-FMA-NEXT:    [[B6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 6), align 4
-; NO-FMA-NEXT:    [[B7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 7), align 4
-; NO-FMA-NEXT:    [[C0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 0), align 4
-; NO-FMA-NEXT:    [[C1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 1), align 4
-; NO-FMA-NEXT:    [[C2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 2), align 4
-; NO-FMA-NEXT:    [[C3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 3), align 4
-; NO-FMA-NEXT:    [[C4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4), align 4
-; NO-FMA-NEXT:    [[C5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 5), align 4
-; NO-FMA-NEXT:    [[C6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 6), align 4
-; NO-FMA-NEXT:    [[C7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 7), align 4
-; NO-FMA-NEXT:    [[FMA0:%.*]] = call float @llvm.fma.f32(float [[A0]], float [[B0]], float [[C0]])
-; NO-FMA-NEXT:    [[FMA1:%.*]] = call float @llvm.fma.f32(float [[A1]], float [[B1]], float [[C1]])
-; NO-FMA-NEXT:    [[FMA2:%.*]] = call float @llvm.fma.f32(float [[A2]], float [[B2]], float [[C2]])
-; NO-FMA-NEXT:    [[FMA3:%.*]] = call float @llvm.fma.f32(float [[A3]], float [[B3]], float [[C3]])
-; NO-FMA-NEXT:    [[FMA4:%.*]] = call float @llvm.fma.f32(float [[A4]], float [[B4]], float [[C4]])
-; NO-FMA-NEXT:    [[FMA5:%.*]] = call float @llvm.fma.f32(float [[A5]], float [[B5]], float [[C5]])
-; NO-FMA-NEXT:    [[FMA6:%.*]] = call float @llvm.fma.f32(float [[A6]], float [[B6]], float [[C6]])
-; NO-FMA-NEXT:    [[FMA7:%.*]] = call float @llvm.fma.f32(float [[A7]], float [[B7]], float [[C7]])
-; NO-FMA-NEXT:    store float [[FMA0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
-; NO-FMA-NEXT:    store float [[FMA1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; NO-FMA-NEXT:    store float [[FMA2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
-; NO-FMA-NEXT:    store float [[FMA3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; NO-FMA-NEXT:    store float [[FMA4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
-; NO-FMA-NEXT:    store float [[FMA5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; NO-FMA-NEXT:    store float [[FMA6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
-; NO-FMA-NEXT:    store float [[FMA7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; NO-FMA-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4
+; NO-FMA-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4
+; NO-FMA-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
+; NO-FMA-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4
+; NO-FMA-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4
+; NO-FMA-NEXT:    [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4) to <4 x float>*), align 4
+; NO-FMA-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; NO-FMA-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; NO-FMA-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP5]], i32 0
+; NO-FMA-NEXT:    [[FMA0:%.*]] = call float @llvm.fma.f32(float [[TMP7]], float [[TMP8]], float [[TMP9]])
+; NO-FMA-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; NO-FMA-NEXT:    [[TMP11:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; NO-FMA-NEXT:    [[TMP12:%.*]] = extractelement <4 x float> [[TMP5]], i32 1
+; NO-FMA-NEXT:    [[FMA1:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]])
+; NO-FMA-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; NO-FMA-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; NO-FMA-NEXT:    [[TMP15:%.*]] = extractelement <4 x float> [[TMP5]], i32 2
+; NO-FMA-NEXT:    [[FMA2:%.*]] = call float @llvm.fma.f32(float [[TMP13]], float [[TMP14]], float [[TMP15]])
+; NO-FMA-NEXT:    [[TMP16:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; NO-FMA-NEXT:    [[TMP17:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; NO-FMA-NEXT:    [[TMP18:%.*]] = extractelement <4 x float> [[TMP5]], i32 3
+; NO-FMA-NEXT:    [[FMA3:%.*]] = call float @llvm.fma.f32(float [[TMP16]], float [[TMP17]], float [[TMP18]])
+; NO-FMA-NEXT:    [[TMP19:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; NO-FMA-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
+; NO-FMA-NEXT:    [[TMP21:%.*]] = extractelement <4 x float> [[TMP6]], i32 0
+; NO-FMA-NEXT:    [[FMA4:%.*]] = call float @llvm.fma.f32(float [[TMP19]], float [[TMP20]], float [[TMP21]])
+; NO-FMA-NEXT:    [[TMP22:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; NO-FMA-NEXT:    [[TMP23:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+; NO-FMA-NEXT:    [[TMP24:%.*]] = extractelement <4 x float> [[TMP6]], i32 1
+; NO-FMA-NEXT:    [[FMA5:%.*]] = call float @llvm.fma.f32(float [[TMP22]], float [[TMP23]], float [[TMP24]])
+; NO-FMA-NEXT:    [[TMP25:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; NO-FMA-NEXT:    [[TMP26:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
+; NO-FMA-NEXT:    [[TMP27:%.*]] = extractelement <4 x float> [[TMP6]], i32 2
+; NO-FMA-NEXT:    [[FMA6:%.*]] = call float @llvm.fma.f32(float [[TMP25]], float [[TMP26]], float [[TMP27]])
+; NO-FMA-NEXT:    [[TMP28:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; NO-FMA-NEXT:    [[TMP29:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
+; NO-FMA-NEXT:    [[TMP30:%.*]] = extractelement <4 x float> [[TMP6]], i32 3
+; NO-FMA-NEXT:    [[FMA7:%.*]] = call float @llvm.fma.f32(float [[TMP28]], float [[TMP29]], float [[TMP30]])
+; NO-FMA-NEXT:    [[TMP31:%.*]] = insertelement <4 x float> poison, float [[FMA0]], i32 0
+; NO-FMA-NEXT:    [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[FMA1]], i32 1
+; NO-FMA-NEXT:    [[TMP33:%.*]] = insertelement <4 x float> [[TMP32]], float [[FMA2]], i32 2
+; NO-FMA-NEXT:    [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[FMA3]], i32 3
+; NO-FMA-NEXT:    store <4 x float> [[TMP34]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; NO-FMA-NEXT:    [[TMP35:%.*]] = insertelement <4 x float> poison, float [[FMA4]], i32 0
+; NO-FMA-NEXT:    [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[FMA5]], i32 1
+; NO-FMA-NEXT:    [[TMP37:%.*]] = insertelement <4 x float> [[TMP36]], float [[FMA6]], i32 2
+; NO-FMA-NEXT:    [[TMP38:%.*]] = insertelement <4 x float> [[TMP37]], float [[FMA7]], i32 3
+; NO-FMA-NEXT:    store <4 x float> [[TMP38]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
 ; NO-FMA-NEXT:    ret void
 ;
+; NO-FMA-I7-LABEL: @fma_8f32(
+; NO-FMA-I7-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4) to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP5]], i32 0
+; NO-FMA-I7-NEXT:    [[FMA0:%.*]] = call float @llvm.fma.f32(float [[TMP7]], float [[TMP8]], float [[TMP9]])
+; NO-FMA-I7-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP11:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP12:%.*]] = extractelement <4 x float> [[TMP5]], i32 1
+; NO-FMA-I7-NEXT:    [[FMA1:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]])
+; NO-FMA-I7-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; NO-FMA-I7-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; NO-FMA-I7-NEXT:    [[TMP15:%.*]] = extractelement <4 x float> [[TMP5]], i32 2
+; NO-FMA-I7-NEXT:    [[FMA2:%.*]] = call float @llvm.fma.f32(float [[TMP13]], float [[TMP14]], float [[TMP15]])
+; NO-FMA-I7-NEXT:    [[TMP16:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; NO-FMA-I7-NEXT:    [[TMP17:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; NO-FMA-I7-NEXT:    [[TMP18:%.*]] = extractelement <4 x float> [[TMP5]], i32 3
+; NO-FMA-I7-NEXT:    [[FMA3:%.*]] = call float @llvm.fma.f32(float [[TMP16]], float [[TMP17]], float [[TMP18]])
+; NO-FMA-I7-NEXT:    [[TMP19:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP21:%.*]] = extractelement <4 x float> [[TMP6]], i32 0
+; NO-FMA-I7-NEXT:    [[FMA4:%.*]] = call float @llvm.fma.f32(float [[TMP19]], float [[TMP20]], float [[TMP21]])
+; NO-FMA-I7-NEXT:    [[TMP22:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP23:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP24:%.*]] = extractelement <4 x float> [[TMP6]], i32 1
+; NO-FMA-I7-NEXT:    [[FMA5:%.*]] = call float @llvm.fma.f32(float [[TMP22]], float [[TMP23]], float [[TMP24]])
+; NO-FMA-I7-NEXT:    [[TMP25:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; NO-FMA-I7-NEXT:    [[TMP26:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
+; NO-FMA-I7-NEXT:    [[TMP27:%.*]] = extractelement <4 x float> [[TMP6]], i32 2
+; NO-FMA-I7-NEXT:    [[FMA6:%.*]] = call float @llvm.fma.f32(float [[TMP25]], float [[TMP26]], float [[TMP27]])
+; NO-FMA-I7-NEXT:    [[TMP28:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; NO-FMA-I7-NEXT:    [[TMP29:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
+; NO-FMA-I7-NEXT:    [[TMP30:%.*]] = extractelement <4 x float> [[TMP6]], i32 3
+; NO-FMA-I7-NEXT:    [[FMA7:%.*]] = call float @llvm.fma.f32(float [[TMP28]], float [[TMP29]], float [[TMP30]])
+; NO-FMA-I7-NEXT:    [[TMP31:%.*]] = insertelement <4 x float> poison, float [[FMA0]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[FMA1]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP33:%.*]] = insertelement <4 x float> [[TMP32]], float [[FMA2]], i32 2
+; NO-FMA-I7-NEXT:    [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[FMA3]], i32 3
+; NO-FMA-I7-NEXT:    store <4 x float> [[TMP34]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP35:%.*]] = insertelement <4 x float> poison, float [[FMA4]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[FMA5]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP37:%.*]] = insertelement <4 x float> [[TMP36]], float [[FMA6]], i32 2
+; NO-FMA-I7-NEXT:    [[TMP38:%.*]] = insertelement <4 x float> [[TMP37]], float [[FMA7]], i32 3
+; NO-FMA-I7-NEXT:    store <4 x float> [[TMP38]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    ret void
+;
 ; FMA-LABEL: @fma_8f32(
 ; FMA-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4
 ; FMA-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4
@@ -328,6 +615,14 @@
 ; FMA-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x float> [[TMP3]])
 ; FMA-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
 ; FMA-NEXT:    ret void
+;
+; FMA-AVX2-LABEL: @fma_8f32(
+; FMA-AVX2-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4
+; FMA-AVX2-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4
+; FMA-AVX2-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcC32 to <8 x float>*), align 4
+; FMA-AVX2-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x float> [[TMP3]])
+; FMA-AVX2-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; FMA-AVX2-NEXT:    ret void
 ;
   %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4
   %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4
@@ -374,88 +669,203 @@
 
 define void @fma_16f32() #0 {
 ; NO-FMA-LABEL: @fma_16f32(
-; NO-FMA-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4
-; NO-FMA-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4
-; NO-FMA-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4
-; NO-FMA-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4
-; NO-FMA-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4), align 4
-; NO-FMA-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 5), align 4
-; NO-FMA-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 6), align 4
-; NO-FMA-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 7), align 4
-; NO-FMA-NEXT:    [[A8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8), align 4
-; NO-FMA-NEXT:    [[A9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 9), align 4
-; NO-FMA-NEXT:    [[A10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 10), align 4
-; NO-FMA-NEXT:    [[A11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 11), align 4
-; NO-FMA-NEXT:    [[A12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12), align 4
-; NO-FMA-NEXT:    [[A13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 13), align 4
-; NO-FMA-NEXT:    [[A14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 14), align 4
-; NO-FMA-NEXT:    [[A15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 15), align 4
-; NO-FMA-NEXT:    [[B0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4
-; NO-FMA-NEXT:    [[B1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4
-; NO-FMA-NEXT:    [[B2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4
-; NO-FMA-NEXT:    [[B3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4
-; NO-FMA-NEXT:    [[B4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4), align 4
-; NO-FMA-NEXT:    [[B5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 5), align 4
-; NO-FMA-NEXT:    [[B6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 6), align 4
-; NO-FMA-NEXT:    [[B7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 7), align 4
-; NO-FMA-NEXT:    [[B8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8), align 4
-; NO-FMA-NEXT:    [[B9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 9), align 4
-; NO-FMA-NEXT:    [[B10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 10), align 4
-; NO-FMA-NEXT:    [[B11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 11), align 4
-; NO-FMA-NEXT:    [[B12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12), align 4
-; NO-FMA-NEXT:    [[B13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 13), align 4
-; NO-FMA-NEXT:    [[B14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 14), align 4
-; NO-FMA-NEXT:    [[B15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 15), align 4
-; NO-FMA-NEXT:    [[C0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 0), align 4
-; NO-FMA-NEXT:    [[C1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 1), align 4
-; NO-FMA-NEXT:    [[C2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 2), align 4
-; NO-FMA-NEXT:    [[C3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 3), align 4
-; NO-FMA-NEXT:    [[C4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4), align 4
-; NO-FMA-NEXT:    [[C5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 5), align 4
-; NO-FMA-NEXT:    [[C6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 6), align 4
-; NO-FMA-NEXT:    [[C7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 7), align 4
-; NO-FMA-NEXT:    [[C8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8), align 4
-; NO-FMA-NEXT:    [[C9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 9), align 4
-; NO-FMA-NEXT:    [[C10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 10), align 4
-; NO-FMA-NEXT:    [[C11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 11), align 4
-; NO-FMA-NEXT:    [[C12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 12), align 4
-; NO-FMA-NEXT:    [[C13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 13), align 4
-; NO-FMA-NEXT:    [[C14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 14), align 4
-; NO-FMA-NEXT:    [[C15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 15), align 4
-; NO-FMA-NEXT:    [[FMA0:%.*]] = call float @llvm.fma.f32(float [[A0]], float [[B0]], float [[C0]])
-; NO-FMA-NEXT:    [[FMA1:%.*]] = call float @llvm.fma.f32(float [[A1]], float [[B1]], float [[C1]])
-; NO-FMA-NEXT:    [[FMA2:%.*]] = call float @llvm.fma.f32(float [[A2]], float [[B2]], float [[C2]])
-; NO-FMA-NEXT:    [[FMA3:%.*]] = call float @llvm.fma.f32(float [[A3]], float [[B3]], float [[C3]])
-; NO-FMA-NEXT:    [[FMA4:%.*]] = call float @llvm.fma.f32(float [[A4]], float [[B4]], float [[C4]])
-; NO-FMA-NEXT:    [[FMA5:%.*]] = call float @llvm.fma.f32(float [[A5]], float [[B5]], float [[C5]])
-; NO-FMA-NEXT:    [[FMA6:%.*]] = call float @llvm.fma.f32(float [[A6]], float [[B6]], float [[C6]])
-; NO-FMA-NEXT:    [[FMA7:%.*]] = call float @llvm.fma.f32(float [[A7]], float [[B7]], float [[C7]])
-; NO-FMA-NEXT:    [[FMA8:%.*]] = call float @llvm.fma.f32(float [[A8]], float [[B8]], float [[C8]])
-; NO-FMA-NEXT:    [[FMA9:%.*]] = call float @llvm.fma.f32(float [[A9]], float [[B9]], float [[C9]])
-; NO-FMA-NEXT:    [[FMA10:%.*]] = call float @llvm.fma.f32(float [[A10]], float [[B10]], float [[C10]])
-; NO-FMA-NEXT:    [[FMA11:%.*]] = call float @llvm.fma.f32(float [[A11]], float [[B11]], float [[C11]])
-; NO-FMA-NEXT:    [[FMA12:%.*]] = call float @llvm.fma.f32(float [[A12]], float [[B12]], float [[C12]])
-; NO-FMA-NEXT:    [[FMA13:%.*]] = call float @llvm.fma.f32(float [[A13]], float [[B13]], float [[C13]])
-; NO-FMA-NEXT:    [[FMA14:%.*]] = call float @llvm.fma.f32(float [[A14]], float [[B14]], float [[C14]])
-; NO-FMA-NEXT:    [[FMA15:%.*]] = call float @llvm.fma.f32(float [[A15]], float [[B15]], float [[C15]])
-; NO-FMA-NEXT:    store float [[FMA0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
-; NO-FMA-NEXT:    store float [[FMA1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; NO-FMA-NEXT:    store float [[FMA2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
-; NO-FMA-NEXT:    store float [[FMA3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; NO-FMA-NEXT:    store float [[FMA4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
-; NO-FMA-NEXT:    store float [[FMA5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; NO-FMA-NEXT:    store float [[FMA6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
-; NO-FMA-NEXT:    store float [[FMA7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
-; NO-FMA-NEXT:    store float [[FMA8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4
-; NO-FMA-NEXT:    store float [[FMA9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
-; NO-FMA-NEXT:    store float [[FMA10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
-; NO-FMA-NEXT:    store float [[FMA11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
-; NO-FMA-NEXT:    store float [[FMA12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
-; NO-FMA-NEXT:    store float [[FMA13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
-; NO-FMA-NEXT:    store float [[FMA14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
-; NO-FMA-NEXT:    store float [[FMA15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+; NO-FMA-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4
+; NO-FMA-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4
+; NO-FMA-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4
+; NO-FMA-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4
+; NO-FMA-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
+; NO-FMA-NEXT:    [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4
+; NO-FMA-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4
+; NO-FMA-NEXT:    [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4
+; NO-FMA-NEXT:    [[TMP9:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4
+; NO-FMA-NEXT:    [[TMP10:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4) to <4 x float>*), align 4
+; NO-FMA-NEXT:    [[TMP11:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <4 x float>*), align 4
+; NO-FMA-NEXT:    [[TMP12:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 12) to <4 x float>*), align 4
+; NO-FMA-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; NO-FMA-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[TMP5]], i32 0
+; NO-FMA-NEXT:    [[TMP15:%.*]] = extractelement <4 x float> [[TMP9]], i32 0
+; NO-FMA-NEXT:    [[FMA0:%.*]] = call float @llvm.fma.f32(float [[TMP13]], float [[TMP14]], float [[TMP15]])
+; NO-FMA-NEXT:    [[TMP16:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; NO-FMA-NEXT:    [[TMP17:%.*]] = extractelement <4 x float> [[TMP5]], i32 1
+; NO-FMA-NEXT:    [[TMP18:%.*]] = extractelement <4 x float> [[TMP9]], i32 1
+; NO-FMA-NEXT:    [[FMA1:%.*]] = call float @llvm.fma.f32(float [[TMP16]], float [[TMP17]], float [[TMP18]])
+; NO-FMA-NEXT:    [[TMP19:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; NO-FMA-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[TMP5]], i32 2
+; NO-FMA-NEXT:    [[TMP21:%.*]] = extractelement <4 x float> [[TMP9]], i32 2
+; NO-FMA-NEXT:    [[FMA2:%.*]] = call float @llvm.fma.f32(float [[TMP19]], float [[TMP20]], float [[TMP21]])
+; NO-FMA-NEXT:    [[TMP22:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; NO-FMA-NEXT:    [[TMP23:%.*]] = extractelement <4 x float> [[TMP5]], i32 3
+; NO-FMA-NEXT:    [[TMP24:%.*]] = extractelement <4 x float> [[TMP9]], i32 3
+; NO-FMA-NEXT:    [[FMA3:%.*]] = call float @llvm.fma.f32(float [[TMP22]], float [[TMP23]], float [[TMP24]])
+; NO-FMA-NEXT:    [[TMP25:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; NO-FMA-NEXT:    [[TMP26:%.*]] = extractelement <4 x float> [[TMP6]], i32 0
+; NO-FMA-NEXT:    [[TMP27:%.*]] = extractelement <4 x float> [[TMP10]], i32 0
+; NO-FMA-NEXT:    [[FMA4:%.*]] = call float @llvm.fma.f32(float [[TMP25]], float [[TMP26]], float [[TMP27]])
+; NO-FMA-NEXT:    [[TMP28:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; NO-FMA-NEXT:    [[TMP29:%.*]] = extractelement <4 x float> [[TMP6]], i32 1
+; NO-FMA-NEXT:    [[TMP30:%.*]] = extractelement <4 x float> [[TMP10]], i32 1
+; NO-FMA-NEXT:    [[FMA5:%.*]] = call float @llvm.fma.f32(float [[TMP28]], float [[TMP29]], float [[TMP30]])
+; NO-FMA-NEXT:    [[TMP31:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; NO-FMA-NEXT:    [[TMP32:%.*]] = extractelement <4 x float> [[TMP6]], i32 2
+; NO-FMA-NEXT:    [[TMP33:%.*]] = extractelement <4 x float> [[TMP10]], i32 2
+; NO-FMA-NEXT:    [[FMA6:%.*]] = call float @llvm.fma.f32(float [[TMP31]], float [[TMP32]], float [[TMP33]])
+; NO-FMA-NEXT:    [[TMP34:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; NO-FMA-NEXT:    [[TMP35:%.*]] = extractelement <4 x float> [[TMP6]], i32 3
+; NO-FMA-NEXT:    [[TMP36:%.*]] = extractelement <4 x float> [[TMP10]], i32 3
+; NO-FMA-NEXT:    [[FMA7:%.*]] = call float @llvm.fma.f32(float [[TMP34]], float [[TMP35]], float [[TMP36]])
+; NO-FMA-NEXT:    [[TMP37:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; NO-FMA-NEXT:    [[TMP38:%.*]] = extractelement <4 x float> [[TMP7]], i32 0
+; NO-FMA-NEXT:    [[TMP39:%.*]] = extractelement <4 x float> [[TMP11]], i32 0
+; NO-FMA-NEXT:    [[FMA8:%.*]] = call float @llvm.fma.f32(float [[TMP37]], float [[TMP38]], float [[TMP39]])
+; NO-FMA-NEXT:    [[TMP40:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; NO-FMA-NEXT:    [[TMP41:%.*]] = extractelement <4 x float> [[TMP7]], i32 1
+; NO-FMA-NEXT:    [[TMP42:%.*]] = extractelement <4 x float> [[TMP11]], i32 1
+; NO-FMA-NEXT:    [[FMA9:%.*]] = call float @llvm.fma.f32(float [[TMP40]], float [[TMP41]], float [[TMP42]])
+; NO-FMA-NEXT:    [[TMP43:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; NO-FMA-NEXT:    [[TMP44:%.*]] = extractelement <4 x float> [[TMP7]], i32 2
+; NO-FMA-NEXT:    [[TMP45:%.*]] = extractelement <4 x float> [[TMP11]], i32 2
+; NO-FMA-NEXT:    [[FMA10:%.*]] = call float @llvm.fma.f32(float [[TMP43]], float [[TMP44]], float [[TMP45]])
+; NO-FMA-NEXT:    [[TMP46:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; NO-FMA-NEXT:    [[TMP47:%.*]] = extractelement <4 x float> [[TMP7]], i32 3
+; NO-FMA-NEXT:    [[TMP48:%.*]] = extractelement <4 x float> [[TMP11]], i32 3
+; NO-FMA-NEXT:    [[FMA11:%.*]] = call float @llvm.fma.f32(float [[TMP46]], float [[TMP47]], float [[TMP48]])
+; NO-FMA-NEXT:    [[TMP49:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
+; NO-FMA-NEXT:    [[TMP50:%.*]] = extractelement <4 x float> [[TMP8]], i32 0
+; NO-FMA-NEXT:    [[TMP51:%.*]] = extractelement <4 x float> [[TMP12]], i32 0
+; NO-FMA-NEXT:    [[FMA12:%.*]] = call float @llvm.fma.f32(float [[TMP49]], float [[TMP50]], float [[TMP51]])
+; NO-FMA-NEXT:    [[TMP52:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+; NO-FMA-NEXT:    [[TMP53:%.*]] = extractelement <4 x float> [[TMP8]], i32 1
+; NO-FMA-NEXT:    [[TMP54:%.*]] = extractelement <4 x float> [[TMP12]], i32 1
+; NO-FMA-NEXT:    [[FMA13:%.*]] = call float @llvm.fma.f32(float [[TMP52]], float [[TMP53]], float [[TMP54]])
+; NO-FMA-NEXT:    [[TMP55:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
+; NO-FMA-NEXT:    [[TMP56:%.*]] = extractelement <4 x float> [[TMP8]], i32 2
+; NO-FMA-NEXT:    [[TMP57:%.*]] = extractelement <4 x float> [[TMP12]], i32 2
+; NO-FMA-NEXT:    [[FMA14:%.*]] = call float @llvm.fma.f32(float [[TMP55]], float [[TMP56]], float [[TMP57]])
+; NO-FMA-NEXT:    [[TMP58:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
+; NO-FMA-NEXT:    [[TMP59:%.*]] = extractelement <4 x float> [[TMP8]], i32 3
+; NO-FMA-NEXT:    [[TMP60:%.*]] = extractelement <4 x float> [[TMP12]], i32 3
+; NO-FMA-NEXT:    [[FMA15:%.*]] = call float @llvm.fma.f32(float [[TMP58]], float [[TMP59]], float [[TMP60]])
+; NO-FMA-NEXT:    [[TMP61:%.*]] = insertelement <4 x float> poison, float [[FMA0]], i32 0
+; NO-FMA-NEXT:    [[TMP62:%.*]] = insertelement <4 x float> [[TMP61]], float [[FMA1]], i32 1
+; NO-FMA-NEXT:    [[TMP63:%.*]] = insertelement <4 x float> [[TMP62]], float [[FMA2]], i32 2
+; NO-FMA-NEXT:    [[TMP64:%.*]] = insertelement <4 x float> [[TMP63]], float [[FMA3]], i32 3
+; NO-FMA-NEXT:    store <4 x float> [[TMP64]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; NO-FMA-NEXT:    [[TMP65:%.*]] = insertelement <4 x float> poison, float [[FMA4]], i32 0
+; NO-FMA-NEXT:    [[TMP66:%.*]] = insertelement <4 x float> [[TMP65]], float [[FMA5]], i32 1
+; NO-FMA-NEXT:    [[TMP67:%.*]] = insertelement <4 x float> [[TMP66]], float [[FMA6]], i32 2
+; NO-FMA-NEXT:    [[TMP68:%.*]] = insertelement <4 x float> [[TMP67]], float [[FMA7]], i32 3
+; NO-FMA-NEXT:    store <4 x float> [[TMP68]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; NO-FMA-NEXT:    [[TMP69:%.*]] = insertelement <4 x float> poison, float [[FMA8]], i32 0
+; NO-FMA-NEXT:    [[TMP70:%.*]] = insertelement <4 x float> [[TMP69]], float [[FMA9]], i32 1
+; NO-FMA-NEXT:    [[TMP71:%.*]] = insertelement <4 x float> [[TMP70]], float [[FMA10]], i32 2
+; NO-FMA-NEXT:    [[TMP72:%.*]] = insertelement <4 x float> [[TMP71]], float [[FMA11]], i32 3
+; NO-FMA-NEXT:    store <4 x float> [[TMP72]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
+; NO-FMA-NEXT:    [[TMP73:%.*]] = insertelement <4 x float> poison, float [[FMA12]], i32 0
+; NO-FMA-NEXT:    [[TMP74:%.*]] = insertelement <4 x float> [[TMP73]], float [[FMA13]], i32 1
+; NO-FMA-NEXT:    [[TMP75:%.*]] = insertelement <4 x float> [[TMP74]], float [[FMA14]], i32 2
+; NO-FMA-NEXT:    [[TMP76:%.*]] = insertelement <4 x float> [[TMP75]], float [[FMA15]], i32 3
+; NO-FMA-NEXT:    store <4 x float> [[TMP76]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
 ; NO-FMA-NEXT:    ret void
 ;
+; NO-FMA-I7-LABEL: @fma_16f32(
+; NO-FMA-I7-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP9:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP10:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4) to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP11:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP12:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 12) to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[TMP5]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP15:%.*]] = extractelement <4 x float> [[TMP9]], i32 0
+; NO-FMA-I7-NEXT:    [[FMA0:%.*]] = call float @llvm.fma.f32(float [[TMP13]], float [[TMP14]], float [[TMP15]])
+; NO-FMA-I7-NEXT:    [[TMP16:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP17:%.*]] = extractelement <4 x float> [[TMP5]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP18:%.*]] = extractelement <4 x float> [[TMP9]], i32 1
+; NO-FMA-I7-NEXT:    [[FMA1:%.*]] = call float @llvm.fma.f32(float [[TMP16]], float [[TMP17]], float [[TMP18]])
+; NO-FMA-I7-NEXT:    [[TMP19:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; NO-FMA-I7-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[TMP5]], i32 2
+; NO-FMA-I7-NEXT:    [[TMP21:%.*]] = extractelement <4 x float> [[TMP9]], i32 2
+; NO-FMA-I7-NEXT:    [[FMA2:%.*]] = call float @llvm.fma.f32(float [[TMP19]], float [[TMP20]], float [[TMP21]])
+; NO-FMA-I7-NEXT:    [[TMP22:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; NO-FMA-I7-NEXT:    [[TMP23:%.*]] = extractelement <4 x float> [[TMP5]], i32 3
+; NO-FMA-I7-NEXT:    [[TMP24:%.*]] = extractelement <4 x float> [[TMP9]], i32 3
+; NO-FMA-I7-NEXT:    [[FMA3:%.*]] = call float @llvm.fma.f32(float [[TMP22]], float [[TMP23]], float [[TMP24]])
+; NO-FMA-I7-NEXT:    [[TMP25:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP26:%.*]] = extractelement <4 x float> [[TMP6]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP27:%.*]] = extractelement <4 x float> [[TMP10]], i32 0
+; NO-FMA-I7-NEXT:    [[FMA4:%.*]] = call float @llvm.fma.f32(float [[TMP25]], float [[TMP26]], float [[TMP27]])
+; NO-FMA-I7-NEXT:    [[TMP28:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP29:%.*]] = extractelement <4 x float> [[TMP6]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP30:%.*]] = extractelement <4 x float> [[TMP10]], i32 1
+; NO-FMA-I7-NEXT:    [[FMA5:%.*]] = call float @llvm.fma.f32(float [[TMP28]], float [[TMP29]], float [[TMP30]])
+; NO-FMA-I7-NEXT:    [[TMP31:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; NO-FMA-I7-NEXT:    [[TMP32:%.*]] = extractelement <4 x float> [[TMP6]], i32 2
+; NO-FMA-I7-NEXT:    [[TMP33:%.*]] = extractelement <4 x float> [[TMP10]], i32 2
+; NO-FMA-I7-NEXT:    [[FMA6:%.*]] = call float @llvm.fma.f32(float [[TMP31]], float [[TMP32]], float [[TMP33]])
+; NO-FMA-I7-NEXT:    [[TMP34:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; NO-FMA-I7-NEXT:    [[TMP35:%.*]] = extractelement <4 x float> [[TMP6]], i32 3
+; NO-FMA-I7-NEXT:    [[TMP36:%.*]] = extractelement <4 x float> [[TMP10]], i32 3
+; NO-FMA-I7-NEXT:    [[FMA7:%.*]] = call float @llvm.fma.f32(float [[TMP34]], float [[TMP35]], float [[TMP36]])
+; NO-FMA-I7-NEXT:    [[TMP37:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP38:%.*]] = extractelement <4 x float> [[TMP7]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP39:%.*]] = extractelement <4 x float> [[TMP11]], i32 0
+; NO-FMA-I7-NEXT:    [[FMA8:%.*]] = call float @llvm.fma.f32(float [[TMP37]], float [[TMP38]], float [[TMP39]])
+; NO-FMA-I7-NEXT:    [[TMP40:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP41:%.*]] = extractelement <4 x float> [[TMP7]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP42:%.*]] = extractelement <4 x float> [[TMP11]], i32 1
+; NO-FMA-I7-NEXT:    [[FMA9:%.*]] = call float @llvm.fma.f32(float [[TMP40]], float [[TMP41]], float [[TMP42]])
+; NO-FMA-I7-NEXT:    [[TMP43:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; NO-FMA-I7-NEXT:    [[TMP44:%.*]] = extractelement <4 x float> [[TMP7]], i32 2
+; NO-FMA-I7-NEXT:    [[TMP45:%.*]] = extractelement <4 x float> [[TMP11]], i32 2
+; NO-FMA-I7-NEXT:    [[FMA10:%.*]] = call float @llvm.fma.f32(float [[TMP43]], float [[TMP44]], float [[TMP45]])
+; NO-FMA-I7-NEXT:    [[TMP46:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; NO-FMA-I7-NEXT:    [[TMP47:%.*]] = extractelement <4 x float> [[TMP7]], i32 3
+; NO-FMA-I7-NEXT:    [[TMP48:%.*]] = extractelement <4 x float> [[TMP11]], i32 3
+; NO-FMA-I7-NEXT:    [[FMA11:%.*]] = call float @llvm.fma.f32(float [[TMP46]], float [[TMP47]], float [[TMP48]])
+; NO-FMA-I7-NEXT:    [[TMP49:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP50:%.*]] = extractelement <4 x float> [[TMP8]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP51:%.*]] = extractelement <4 x float> [[TMP12]], i32 0
+; NO-FMA-I7-NEXT:    [[FMA12:%.*]] = call float @llvm.fma.f32(float [[TMP49]], float [[TMP50]], float [[TMP51]])
+; NO-FMA-I7-NEXT:    [[TMP52:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP53:%.*]] = extractelement <4 x float> [[TMP8]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP54:%.*]] = extractelement <4 x float> [[TMP12]], i32 1
+; NO-FMA-I7-NEXT:    [[FMA13:%.*]] = call float @llvm.fma.f32(float [[TMP52]], float [[TMP53]], float [[TMP54]])
+; NO-FMA-I7-NEXT:    [[TMP55:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
+; NO-FMA-I7-NEXT:    [[TMP56:%.*]] = extractelement <4 x float> [[TMP8]], i32 2
+; NO-FMA-I7-NEXT:    [[TMP57:%.*]] = extractelement <4 x float> [[TMP12]], i32 2
+; NO-FMA-I7-NEXT:    [[FMA14:%.*]] = call float @llvm.fma.f32(float [[TMP55]], float [[TMP56]], float [[TMP57]])
+; NO-FMA-I7-NEXT:    [[TMP58:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
+; NO-FMA-I7-NEXT:    [[TMP59:%.*]] = extractelement <4 x float> [[TMP8]], i32 3
+; NO-FMA-I7-NEXT:    [[TMP60:%.*]] = extractelement <4 x float> [[TMP12]], i32 3
+; NO-FMA-I7-NEXT:    [[FMA15:%.*]] = call float @llvm.fma.f32(float [[TMP58]], float [[TMP59]], float [[TMP60]])
+; NO-FMA-I7-NEXT:    [[TMP61:%.*]] = insertelement <4 x float> poison, float [[FMA0]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP62:%.*]] = insertelement <4 x float> [[TMP61]], float [[FMA1]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP63:%.*]] = insertelement <4 x float> [[TMP62]], float [[FMA2]], i32 2
+; NO-FMA-I7-NEXT:    [[TMP64:%.*]] = insertelement <4 x float> [[TMP63]], float [[FMA3]], i32 3
+; NO-FMA-I7-NEXT:    store <4 x float> [[TMP64]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP65:%.*]] = insertelement <4 x float> poison, float [[FMA4]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP66:%.*]] = insertelement <4 x float> [[TMP65]], float [[FMA5]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP67:%.*]] = insertelement <4 x float> [[TMP66]], float [[FMA6]], i32 2
+; NO-FMA-I7-NEXT:    [[TMP68:%.*]] = insertelement <4 x float> [[TMP67]], float [[FMA7]], i32 3
+; NO-FMA-I7-NEXT:    store <4 x float> [[TMP68]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP69:%.*]] = insertelement <4 x float> poison, float [[FMA8]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP70:%.*]] = insertelement <4 x float> [[TMP69]], float [[FMA9]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP71:%.*]] = insertelement <4 x float> [[TMP70]], float [[FMA10]], i32 2
+; NO-FMA-I7-NEXT:    [[TMP72:%.*]] = insertelement <4 x float> [[TMP71]], float [[FMA11]], i32 3
+; NO-FMA-I7-NEXT:    store <4 x float> [[TMP72]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    [[TMP73:%.*]] = insertelement <4 x float> poison, float [[FMA12]], i32 0
+; NO-FMA-I7-NEXT:    [[TMP74:%.*]] = insertelement <4 x float> [[TMP73]], float [[FMA13]], i32 1
+; NO-FMA-I7-NEXT:    [[TMP75:%.*]] = insertelement <4 x float> [[TMP74]], float [[FMA14]], i32 2
+; NO-FMA-I7-NEXT:    [[TMP76:%.*]] = insertelement <4 x float> [[TMP75]], float [[FMA15]], i32 3
+; NO-FMA-I7-NEXT:    store <4 x float> [[TMP76]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
+; NO-FMA-I7-NEXT:    ret void
+;
 ; FMA256-LABEL: @fma_16f32(
 ; FMA256-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4
 ; FMA256-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4
@@ -469,6 +879,19 @@
 ; FMA256-NEXT:    store <8 x float> [[TMP8]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
 ; FMA256-NEXT:    ret void
 ;
+; FMA-AVX2-LABEL: @fma_16f32(
+; FMA-AVX2-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4
+; FMA-AVX2-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4
+; FMA-AVX2-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4
+; FMA-AVX2-NEXT:    [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4
+; FMA-AVX2-NEXT:    [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcC32 to <8 x float>*), align 4
+; FMA-AVX2-NEXT:    [[TMP6:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <8 x float>*), align 4
+; FMA-AVX2-NEXT:    [[TMP7:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP3]], <8 x float> [[TMP5]])
+; FMA-AVX2-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP4]], <8 x float> [[TMP6]])
+; FMA-AVX2-NEXT:    store <8 x float> [[TMP7]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; FMA-AVX2-NEXT:    store <8 x float> [[TMP8]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
+; FMA-AVX2-NEXT:    ret void
+;
 ; FMA512-LABEL: @fma_16f32(
 ; FMA512-NEXT:    [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @srcA32 to <16 x float>*), align 4
 ; FMA512-NEXT:    [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @srcB32 to <16 x float>*), align 4
@@ -477,6 +900,31 @@
 ; FMA512-NEXT:    store <16 x float> [[TMP4]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
 ; FMA512-NEXT:    ret void
 ;
+; FMA256AVX512-LABEL: @fma_16f32(
+; FMA256AVX512-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4
+; FMA256AVX512-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4
+; FMA256AVX512-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4
+; FMA256AVX512-NEXT:    [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4
+; FMA256AVX512-NEXT:    [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcC32 to <8 x float>*), align 4
+; FMA256AVX512-NEXT:    [[TMP6:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <8 x float>*), align 4
+; FMA256AVX512-NEXT:    [[TMP7:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP3]], <8 x float> [[TMP5]])
+; FMA256AVX512-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP4]], <8 x float> [[TMP6]])
+; FMA256AVX512-NEXT:    store <8 x float> [[TMP7]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; FMA256AVX512-NEXT:    store <8 x float> [[TMP8]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
+; FMA256AVX512-NEXT:    ret void
+;
+; FMA256-BDVER1-LABEL: @fma_16f32(
+; FMA256-BDVER1-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4
+; FMA256-BDVER1-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4
+; FMA256-BDVER1-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4
+; FMA256-BDVER1-NEXT:    [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4
+; FMA256-BDVER1-NEXT:    [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcC32 to <8 x float>*), align 4
+; FMA256-BDVER1-NEXT:    [[TMP6:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <8 x float>*), align 4
+; FMA256-BDVER1-NEXT:    [[TMP7:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP3]], <8 x float> [[TMP5]])
+; FMA256-BDVER1-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP4]], <8 x float> [[TMP6]])
+; FMA256-BDVER1-NEXT:    store <8 x float> [[TMP7]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; FMA256-BDVER1-NEXT:    store <8 x float> [[TMP8]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
+; FMA256-BDVER1-NEXT:    ret void
   %a0  = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64  0), align 4
   %a1  = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64  1), align 4
   %a2  = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64  2), align 4
Index: llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256NODQ-BDVER1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256NODQ-AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256DQ
 
@@ -21,59 +21,137 @@
 
 define void @fptosi_8f64_8i64() #0 {
 ; SSE-LABEL: @fptosi_8f64_8i64(
-; SSE-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; SSE-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; SSE-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; SSE-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; SSE-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; SSE-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; SSE-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; SSE-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; SSE-NEXT:    [[CVT0:%.*]] = fptosi double [[A0]] to i64
-; SSE-NEXT:    [[CVT1:%.*]] = fptosi double [[A1]] to i64
-; SSE-NEXT:    [[CVT2:%.*]] = fptosi double [[A2]] to i64
-; SSE-NEXT:    [[CVT3:%.*]] = fptosi double [[A3]] to i64
-; SSE-NEXT:    [[CVT4:%.*]] = fptosi double [[A4]] to i64
-; SSE-NEXT:    [[CVT5:%.*]] = fptosi double [[A5]] to i64
-; SSE-NEXT:    [[CVT6:%.*]] = fptosi double [[A6]] to i64
-; SSE-NEXT:    [[CVT7:%.*]] = fptosi double [[A7]] to i64
-; SSE-NEXT:    store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
-; SSE-NEXT:    store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT:    store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
-; SSE-NEXT:    store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
-; SSE-NEXT:    store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
-; SSE-NEXT:    store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
-; SSE-NEXT:    store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
-; SSE-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; SSE-NEXT:    [[CVT0:%.*]] = fptosi double [[TMP5]] to i64
+; SSE-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; SSE-NEXT:    [[CVT1:%.*]] = fptosi double [[TMP6]] to i64
+; SSE-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; SSE-NEXT:    [[CVT2:%.*]] = fptosi double [[TMP7]] to i64
+; SSE-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; SSE-NEXT:    [[CVT3:%.*]] = fptosi double [[TMP8]] to i64
+; SSE-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; SSE-NEXT:    [[CVT4:%.*]] = fptosi double [[TMP9]] to i64
+; SSE-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; SSE-NEXT:    [[CVT5:%.*]] = fptosi double [[TMP10]] to i64
+; SSE-NEXT:    [[TMP11:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; SSE-NEXT:    [[CVT6:%.*]] = fptosi double [[TMP11]] to i64
+; SSE-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; SSE-NEXT:    [[CVT7:%.*]] = fptosi double [[TMP12]] to i64
+; SSE-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> poison, i64 [[CVT0]], i32 0
+; SSE-NEXT:    [[TMP14:%.*]] = insertelement <2 x i64> [[TMP13]], i64 [[CVT1]], i32 1
+; SSE-NEXT:    store <2 x i64> [[TMP14]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP15:%.*]] = insertelement <2 x i64> poison, i64 [[CVT2]], i32 0
+; SSE-NEXT:    [[TMP16:%.*]] = insertelement <2 x i64> [[TMP15]], i64 [[CVT3]], i32 1
+; SSE-NEXT:    store <2 x i64> [[TMP16]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP17:%.*]] = insertelement <2 x i64> poison, i64 [[CVT4]], i32 0
+; SSE-NEXT:    [[TMP18:%.*]] = insertelement <2 x i64> [[TMP17]], i64 [[CVT5]], i32 1
+; SSE-NEXT:    store <2 x i64> [[TMP18]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[CVT6]], i32 0
+; SSE-NEXT:    [[TMP20:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[CVT7]], i32 1
+; SSE-NEXT:    store <2 x i64> [[TMP20]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8
 ; SSE-NEXT:    ret void
 ;
 ; AVX256NODQ-LABEL: @fptosi_8f64_8i64(
-; AVX256NODQ-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; AVX256NODQ-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; AVX256NODQ-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; AVX256NODQ-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; AVX256NODQ-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; AVX256NODQ-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; AVX256NODQ-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; AVX256NODQ-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptosi double [[A0]] to i64
-; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptosi double [[A1]] to i64
-; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptosi double [[A2]] to i64
-; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptosi double [[A3]] to i64
-; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptosi double [[A4]] to i64
-; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptosi double [[A5]] to i64
-; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptosi double [[A6]] to i64
-; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptosi double [[A7]] to i64
-; AVX256NODQ-NEXT:    store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; AVX256NODQ-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; AVX256NODQ-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
+; AVX256NODQ-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
+; AVX256NODQ-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptosi double [[TMP5]] to i64
+; AVX256NODQ-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptosi double [[TMP6]] to i64
+; AVX256NODQ-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptosi double [[TMP7]] to i64
+; AVX256NODQ-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptosi double [[TMP8]] to i64
+; AVX256NODQ-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptosi double [[TMP9]] to i64
+; AVX256NODQ-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptosi double [[TMP10]] to i64
+; AVX256NODQ-NEXT:    [[TMP11:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptosi double [[TMP11]] to i64
+; AVX256NODQ-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptosi double [[TMP12]] to i64
+; AVX256NODQ-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> poison, i64 [[CVT0]], i32 0
+; AVX256NODQ-NEXT:    [[TMP14:%.*]] = insertelement <2 x i64> [[TMP13]], i64 [[CVT1]], i32 1
+; AVX256NODQ-NEXT:    store <2 x i64> [[TMP14]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8
+; AVX256NODQ-NEXT:    [[TMP15:%.*]] = insertelement <2 x i64> poison, i64 [[CVT2]], i32 0
+; AVX256NODQ-NEXT:    [[TMP16:%.*]] = insertelement <2 x i64> [[TMP15]], i64 [[CVT3]], i32 1
+; AVX256NODQ-NEXT:    store <2 x i64> [[TMP16]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX256NODQ-NEXT:    [[TMP17:%.*]] = insertelement <2 x i64> poison, i64 [[CVT4]], i32 0
+; AVX256NODQ-NEXT:    [[TMP18:%.*]] = insertelement <2 x i64> [[TMP17]], i64 [[CVT5]], i32 1
+; AVX256NODQ-NEXT:    store <2 x i64> [[TMP18]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX256NODQ-NEXT:    [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[CVT6]], i32 0
+; AVX256NODQ-NEXT:    [[TMP20:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[CVT7]], i32 1
+; AVX256NODQ-NEXT:    store <2 x i64> [[TMP20]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8
 ; AVX256NODQ-NEXT:    ret void
 ;
+; AVX256NODQ-BDVER1-LABEL: @fptosi_8f64_8i64(
+; AVX256NODQ-BDVER1-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[CVT0:%.*]] = fptosi double [[TMP3]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[CVT1:%.*]] = fptosi double [[TMP4]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[CVT2:%.*]] = fptosi double [[TMP5]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    [[CVT3:%.*]] = fptosi double [[TMP6]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP2]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[CVT4:%.*]] = fptosi double [[TMP7]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP8:%.*]] = extractelement <4 x double> [[TMP2]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[CVT5:%.*]] = fptosi double [[TMP8]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP9:%.*]] = extractelement <4 x double> [[TMP2]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[CVT6:%.*]] = fptosi double [[TMP9]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP10:%.*]] = extractelement <4 x double> [[TMP2]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    [[CVT7:%.*]] = fptosi double [[TMP10]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[CVT0]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[TMP12:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[CVT1]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[TMP13:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[CVT2]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[CVT3]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    store <4 x i64> [[TMP14]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[CVT4]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[CVT5]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[CVT6]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[CVT7]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    store <4 x i64> [[TMP18]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256NODQ-BDVER1-NEXT:    ret void
+;
+; AVX256NODQ-AVX2-LABEL: @fptosi_8f64_8i64(
+; AVX256NODQ-AVX2-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX256NODQ-AVX2-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX256NODQ-AVX2-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[CVT0:%.*]] = fptosi double [[TMP3]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[CVT1:%.*]] = fptosi double [[TMP4]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[CVT2:%.*]] = fptosi double [[TMP5]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
+; AVX256NODQ-AVX2-NEXT:    [[CVT3:%.*]] = fptosi double [[TMP6]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP2]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[CVT4:%.*]] = fptosi double [[TMP7]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP8:%.*]] = extractelement <4 x double> [[TMP2]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[CVT5:%.*]] = fptosi double [[TMP8]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP9:%.*]] = extractelement <4 x double> [[TMP2]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[CVT6:%.*]] = fptosi double [[TMP9]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP10:%.*]] = extractelement <4 x double> [[TMP2]], i32 3
+; AVX256NODQ-AVX2-NEXT:    [[CVT7:%.*]] = fptosi double [[TMP10]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[CVT0]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[CVT1]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[CVT2]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[CVT3]], i32 3
+; AVX256NODQ-AVX2-NEXT:    store <4 x i64> [[TMP14]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
+; AVX256NODQ-AVX2-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[CVT4]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[CVT5]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[CVT6]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[CVT7]], i32 3
+; AVX256NODQ-AVX2-NEXT:    store <4 x i64> [[TMP18]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256NODQ-AVX2-NEXT:    ret void
+;
 ; AVX512-LABEL: @fptosi_8f64_8i64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
 ; AVX512-NEXT:    [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i64>
@@ -254,59 +332,133 @@
 
 define void @fptosi_8f32_8i64() #0 {
 ; SSE-LABEL: @fptosi_8f32_8i64(
-; SSE-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; SSE-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; SSE-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; SSE-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; SSE-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; SSE-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; SSE-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; SSE-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; SSE-NEXT:    [[CVT0:%.*]] = fptosi float [[A0]] to i64
-; SSE-NEXT:    [[CVT1:%.*]] = fptosi float [[A1]] to i64
-; SSE-NEXT:    [[CVT2:%.*]] = fptosi float [[A2]] to i64
-; SSE-NEXT:    [[CVT3:%.*]] = fptosi float [[A3]] to i64
-; SSE-NEXT:    [[CVT4:%.*]] = fptosi float [[A4]] to i64
-; SSE-NEXT:    [[CVT5:%.*]] = fptosi float [[A5]] to i64
-; SSE-NEXT:    [[CVT6:%.*]] = fptosi float [[A6]] to i64
-; SSE-NEXT:    [[CVT7:%.*]] = fptosi float [[A7]] to i64
-; SSE-NEXT:    store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
-; SSE-NEXT:    store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT:    store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
-; SSE-NEXT:    store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
-; SSE-NEXT:    store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
-; SSE-NEXT:    store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
-; SSE-NEXT:    store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
-; SSE-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([16 x float]* @src32 to <2 x float>*), align 4
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2) to <2 x float>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <2 x float>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6) to <2 x float>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; SSE-NEXT:    [[CVT0:%.*]] = fptosi float [[TMP5]] to i64
+; SSE-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; SSE-NEXT:    [[CVT1:%.*]] = fptosi float [[TMP6]] to i64
+; SSE-NEXT:    [[TMP7:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; SSE-NEXT:    [[CVT2:%.*]] = fptosi float [[TMP7]] to i64
+; SSE-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; SSE-NEXT:    [[CVT3:%.*]] = fptosi float [[TMP8]] to i64
+; SSE-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; SSE-NEXT:    [[CVT4:%.*]] = fptosi float [[TMP9]] to i64
+; SSE-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; SSE-NEXT:    [[CVT5:%.*]] = fptosi float [[TMP10]] to i64
+; SSE-NEXT:    [[TMP11:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; SSE-NEXT:    [[CVT6:%.*]] = fptosi float [[TMP11]] to i64
+; SSE-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; SSE-NEXT:    [[CVT7:%.*]] = fptosi float [[TMP12]] to i64
+; SSE-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> poison, i64 [[CVT0]], i32 0
+; SSE-NEXT:    [[TMP14:%.*]] = insertelement <2 x i64> [[TMP13]], i64 [[CVT1]], i32 1
+; SSE-NEXT:    store <2 x i64> [[TMP14]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP15:%.*]] = insertelement <2 x i64> poison, i64 [[CVT2]], i32 0
+; SSE-NEXT:    [[TMP16:%.*]] = insertelement <2 x i64> [[TMP15]], i64 [[CVT3]], i32 1
+; SSE-NEXT:    store <2 x i64> [[TMP16]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP17:%.*]] = insertelement <2 x i64> poison, i64 [[CVT4]], i32 0
+; SSE-NEXT:    [[TMP18:%.*]] = insertelement <2 x i64> [[TMP17]], i64 [[CVT5]], i32 1
+; SSE-NEXT:    store <2 x i64> [[TMP18]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[CVT6]], i32 0
+; SSE-NEXT:    [[TMP20:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[CVT7]], i32 1
+; SSE-NEXT:    store <2 x i64> [[TMP20]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8
 ; SSE-NEXT:    ret void
 ;
 ; AVX256NODQ-LABEL: @fptosi_8f32_8i64(
-; AVX256NODQ-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; AVX256NODQ-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; AVX256NODQ-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; AVX256NODQ-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; AVX256NODQ-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; AVX256NODQ-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; AVX256NODQ-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; AVX256NODQ-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptosi float [[A0]] to i64
-; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptosi float [[A1]] to i64
-; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptosi float [[A2]] to i64
-; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptosi float [[A3]] to i64
-; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptosi float [[A4]] to i64
-; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptosi float [[A5]] to i64
-; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptosi float [[A6]] to i64
-; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptosi float [[A7]] to i64
-; AVX256NODQ-NEXT:    store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; AVX256NODQ-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; AVX256NODQ-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptosi float [[TMP3]] to i64
+; AVX256NODQ-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptosi float [[TMP4]] to i64
+; AVX256NODQ-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptosi float [[TMP5]] to i64
+; AVX256NODQ-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptosi float [[TMP6]] to i64
+; AVX256NODQ-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptosi float [[TMP7]] to i64
+; AVX256NODQ-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptosi float [[TMP8]] to i64
+; AVX256NODQ-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptosi float [[TMP9]] to i64
+; AVX256NODQ-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptosi float [[TMP10]] to i64
+; AVX256NODQ-NEXT:    [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[CVT0]], i32 0
+; AVX256NODQ-NEXT:    [[TMP12:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[CVT1]], i32 1
+; AVX256NODQ-NEXT:    [[TMP13:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[CVT2]], i32 2
+; AVX256NODQ-NEXT:    [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[CVT3]], i32 3
+; AVX256NODQ-NEXT:    store <4 x i64> [[TMP14]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
+; AVX256NODQ-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[CVT4]], i32 0
+; AVX256NODQ-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[CVT5]], i32 1
+; AVX256NODQ-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[CVT6]], i32 2
+; AVX256NODQ-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[CVT7]], i32 3
+; AVX256NODQ-NEXT:    store <4 x i64> [[TMP18]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX256NODQ-NEXT:    ret void
 ;
+; AVX256NODQ-BDVER1-LABEL: @fptosi_8f32_8i64(
+; AVX256NODQ-BDVER1-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; AVX256NODQ-BDVER1-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; AVX256NODQ-BDVER1-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[CVT0:%.*]] = fptosi float [[TMP3]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[CVT1:%.*]] = fptosi float [[TMP4]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[CVT2:%.*]] = fptosi float [[TMP5]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    [[CVT3:%.*]] = fptosi float [[TMP6]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[CVT4:%.*]] = fptosi float [[TMP7]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[CVT5:%.*]] = fptosi float [[TMP8]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[CVT6:%.*]] = fptosi float [[TMP9]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    [[CVT7:%.*]] = fptosi float [[TMP10]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[CVT0]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[TMP12:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[CVT1]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[TMP13:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[CVT2]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[CVT3]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    store <4 x i64> [[TMP14]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[CVT4]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[CVT5]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[CVT6]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[CVT7]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    store <4 x i64> [[TMP18]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256NODQ-BDVER1-NEXT:    ret void
+;
+; AVX256NODQ-AVX2-LABEL: @fptosi_8f32_8i64(
+; AVX256NODQ-AVX2-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; AVX256NODQ-AVX2-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; AVX256NODQ-AVX2-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[CVT0:%.*]] = fptosi float [[TMP3]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[CVT1:%.*]] = fptosi float [[TMP4]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[CVT2:%.*]] = fptosi float [[TMP5]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; AVX256NODQ-AVX2-NEXT:    [[CVT3:%.*]] = fptosi float [[TMP6]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[CVT4:%.*]] = fptosi float [[TMP7]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[CVT5:%.*]] = fptosi float [[TMP8]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[CVT6:%.*]] = fptosi float [[TMP9]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; AVX256NODQ-AVX2-NEXT:    [[CVT7:%.*]] = fptosi float [[TMP10]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[CVT0]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[CVT1]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[CVT2]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[CVT3]], i32 3
+; AVX256NODQ-AVX2-NEXT:    store <4 x i64> [[TMP14]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
+; AVX256NODQ-AVX2-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[CVT4]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[CVT5]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[CVT6]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[CVT7]], i32 3
+; AVX256NODQ-AVX2-NEXT:    store <4 x i64> [[TMP18]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256NODQ-AVX2-NEXT:    ret void
+;
 ; AVX512-LABEL: @fptosi_8f32_8i64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
 ; AVX512-NEXT:    [[TMP2:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i64>
Index: llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256NODQ-BDVER1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256NODQ-AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256DQ
 
@@ -21,59 +21,137 @@
 
 define void @fptosi_8f64_8i64() #0 {
 ; SSE-LABEL: @fptosi_8f64_8i64(
-; SSE-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; SSE-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; SSE-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; SSE-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; SSE-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; SSE-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; SSE-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; SSE-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; SSE-NEXT:    [[CVT0:%.*]] = fptosi double [[A0]] to i64
-; SSE-NEXT:    [[CVT1:%.*]] = fptosi double [[A1]] to i64
-; SSE-NEXT:    [[CVT2:%.*]] = fptosi double [[A2]] to i64
-; SSE-NEXT:    [[CVT3:%.*]] = fptosi double [[A3]] to i64
-; SSE-NEXT:    [[CVT4:%.*]] = fptosi double [[A4]] to i64
-; SSE-NEXT:    [[CVT5:%.*]] = fptosi double [[A5]] to i64
-; SSE-NEXT:    [[CVT6:%.*]] = fptosi double [[A6]] to i64
-; SSE-NEXT:    [[CVT7:%.*]] = fptosi double [[A7]] to i64
-; SSE-NEXT:    store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
-; SSE-NEXT:    store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT:    store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
-; SSE-NEXT:    store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
-; SSE-NEXT:    store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
-; SSE-NEXT:    store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
-; SSE-NEXT:    store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
-; SSE-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; SSE-NEXT:    [[CVT0:%.*]] = fptosi double [[TMP5]] to i64
+; SSE-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; SSE-NEXT:    [[CVT1:%.*]] = fptosi double [[TMP6]] to i64
+; SSE-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; SSE-NEXT:    [[CVT2:%.*]] = fptosi double [[TMP7]] to i64
+; SSE-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; SSE-NEXT:    [[CVT3:%.*]] = fptosi double [[TMP8]] to i64
+; SSE-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; SSE-NEXT:    [[CVT4:%.*]] = fptosi double [[TMP9]] to i64
+; SSE-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; SSE-NEXT:    [[CVT5:%.*]] = fptosi double [[TMP10]] to i64
+; SSE-NEXT:    [[TMP11:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; SSE-NEXT:    [[CVT6:%.*]] = fptosi double [[TMP11]] to i64
+; SSE-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; SSE-NEXT:    [[CVT7:%.*]] = fptosi double [[TMP12]] to i64
+; SSE-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> poison, i64 [[CVT0]], i32 0
+; SSE-NEXT:    [[TMP14:%.*]] = insertelement <2 x i64> [[TMP13]], i64 [[CVT1]], i32 1
+; SSE-NEXT:    store <2 x i64> [[TMP14]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP15:%.*]] = insertelement <2 x i64> poison, i64 [[CVT2]], i32 0
+; SSE-NEXT:    [[TMP16:%.*]] = insertelement <2 x i64> [[TMP15]], i64 [[CVT3]], i32 1
+; SSE-NEXT:    store <2 x i64> [[TMP16]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP17:%.*]] = insertelement <2 x i64> poison, i64 [[CVT4]], i32 0
+; SSE-NEXT:    [[TMP18:%.*]] = insertelement <2 x i64> [[TMP17]], i64 [[CVT5]], i32 1
+; SSE-NEXT:    store <2 x i64> [[TMP18]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[CVT6]], i32 0
+; SSE-NEXT:    [[TMP20:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[CVT7]], i32 1
+; SSE-NEXT:    store <2 x i64> [[TMP20]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8
 ; SSE-NEXT:    ret void
 ;
 ; AVX256NODQ-LABEL: @fptosi_8f64_8i64(
-; AVX256NODQ-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; AVX256NODQ-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; AVX256NODQ-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; AVX256NODQ-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; AVX256NODQ-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; AVX256NODQ-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; AVX256NODQ-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; AVX256NODQ-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptosi double [[A0]] to i64
-; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptosi double [[A1]] to i64
-; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptosi double [[A2]] to i64
-; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptosi double [[A3]] to i64
-; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptosi double [[A4]] to i64
-; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptosi double [[A5]] to i64
-; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptosi double [[A6]] to i64
-; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptosi double [[A7]] to i64
-; AVX256NODQ-NEXT:    store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; AVX256NODQ-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; AVX256NODQ-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
+; AVX256NODQ-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
+; AVX256NODQ-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptosi double [[TMP5]] to i64
+; AVX256NODQ-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptosi double [[TMP6]] to i64
+; AVX256NODQ-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptosi double [[TMP7]] to i64
+; AVX256NODQ-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptosi double [[TMP8]] to i64
+; AVX256NODQ-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptosi double [[TMP9]] to i64
+; AVX256NODQ-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptosi double [[TMP10]] to i64
+; AVX256NODQ-NEXT:    [[TMP11:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptosi double [[TMP11]] to i64
+; AVX256NODQ-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptosi double [[TMP12]] to i64
+; AVX256NODQ-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> poison, i64 [[CVT0]], i32 0
+; AVX256NODQ-NEXT:    [[TMP14:%.*]] = insertelement <2 x i64> [[TMP13]], i64 [[CVT1]], i32 1
+; AVX256NODQ-NEXT:    store <2 x i64> [[TMP14]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8
+; AVX256NODQ-NEXT:    [[TMP15:%.*]] = insertelement <2 x i64> poison, i64 [[CVT2]], i32 0
+; AVX256NODQ-NEXT:    [[TMP16:%.*]] = insertelement <2 x i64> [[TMP15]], i64 [[CVT3]], i32 1
+; AVX256NODQ-NEXT:    store <2 x i64> [[TMP16]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX256NODQ-NEXT:    [[TMP17:%.*]] = insertelement <2 x i64> poison, i64 [[CVT4]], i32 0
+; AVX256NODQ-NEXT:    [[TMP18:%.*]] = insertelement <2 x i64> [[TMP17]], i64 [[CVT5]], i32 1
+; AVX256NODQ-NEXT:    store <2 x i64> [[TMP18]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX256NODQ-NEXT:    [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[CVT6]], i32 0
+; AVX256NODQ-NEXT:    [[TMP20:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[CVT7]], i32 1
+; AVX256NODQ-NEXT:    store <2 x i64> [[TMP20]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8
 ; AVX256NODQ-NEXT:    ret void
 ;
+; AVX256NODQ-BDVER1-LABEL: @fptosi_8f64_8i64(
+; AVX256NODQ-BDVER1-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[CVT0:%.*]] = fptosi double [[TMP3]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[CVT1:%.*]] = fptosi double [[TMP4]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[CVT2:%.*]] = fptosi double [[TMP5]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    [[CVT3:%.*]] = fptosi double [[TMP6]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP2]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[CVT4:%.*]] = fptosi double [[TMP7]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP8:%.*]] = extractelement <4 x double> [[TMP2]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[CVT5:%.*]] = fptosi double [[TMP8]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP9:%.*]] = extractelement <4 x double> [[TMP2]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[CVT6:%.*]] = fptosi double [[TMP9]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP10:%.*]] = extractelement <4 x double> [[TMP2]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    [[CVT7:%.*]] = fptosi double [[TMP10]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[CVT0]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[TMP12:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[CVT1]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[TMP13:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[CVT2]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[CVT3]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    store <4 x i64> [[TMP14]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[CVT4]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[CVT5]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[CVT6]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[CVT7]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    store <4 x i64> [[TMP18]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256NODQ-BDVER1-NEXT:    ret void
+;
+; AVX256NODQ-AVX2-LABEL: @fptosi_8f64_8i64(
+; AVX256NODQ-AVX2-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX256NODQ-AVX2-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX256NODQ-AVX2-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[CVT0:%.*]] = fptosi double [[TMP3]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[CVT1:%.*]] = fptosi double [[TMP4]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[CVT2:%.*]] = fptosi double [[TMP5]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
+; AVX256NODQ-AVX2-NEXT:    [[CVT3:%.*]] = fptosi double [[TMP6]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP2]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[CVT4:%.*]] = fptosi double [[TMP7]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP8:%.*]] = extractelement <4 x double> [[TMP2]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[CVT5:%.*]] = fptosi double [[TMP8]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP9:%.*]] = extractelement <4 x double> [[TMP2]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[CVT6:%.*]] = fptosi double [[TMP9]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP10:%.*]] = extractelement <4 x double> [[TMP2]], i32 3
+; AVX256NODQ-AVX2-NEXT:    [[CVT7:%.*]] = fptosi double [[TMP10]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[CVT0]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[CVT1]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[CVT2]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[CVT3]], i32 3
+; AVX256NODQ-AVX2-NEXT:    store <4 x i64> [[TMP14]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
+; AVX256NODQ-AVX2-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[CVT4]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[CVT5]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[CVT6]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[CVT7]], i32 3
+; AVX256NODQ-AVX2-NEXT:    store <4 x i64> [[TMP18]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256NODQ-AVX2-NEXT:    ret void
+;
 ; AVX512-LABEL: @fptosi_8f64_8i64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
 ; AVX512-NEXT:    [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i64>
@@ -254,59 +332,133 @@
 
 define void @fptosi_8f32_8i64() #0 {
 ; SSE-LABEL: @fptosi_8f32_8i64(
-; SSE-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; SSE-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; SSE-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; SSE-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; SSE-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; SSE-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; SSE-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; SSE-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; SSE-NEXT:    [[CVT0:%.*]] = fptosi float [[A0]] to i64
-; SSE-NEXT:    [[CVT1:%.*]] = fptosi float [[A1]] to i64
-; SSE-NEXT:    [[CVT2:%.*]] = fptosi float [[A2]] to i64
-; SSE-NEXT:    [[CVT3:%.*]] = fptosi float [[A3]] to i64
-; SSE-NEXT:    [[CVT4:%.*]] = fptosi float [[A4]] to i64
-; SSE-NEXT:    [[CVT5:%.*]] = fptosi float [[A5]] to i64
-; SSE-NEXT:    [[CVT6:%.*]] = fptosi float [[A6]] to i64
-; SSE-NEXT:    [[CVT7:%.*]] = fptosi float [[A7]] to i64
-; SSE-NEXT:    store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
-; SSE-NEXT:    store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT:    store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
-; SSE-NEXT:    store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
-; SSE-NEXT:    store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
-; SSE-NEXT:    store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
-; SSE-NEXT:    store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
-; SSE-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([16 x float]* @src32 to <2 x float>*), align 4
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2) to <2 x float>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <2 x float>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6) to <2 x float>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; SSE-NEXT:    [[CVT0:%.*]] = fptosi float [[TMP5]] to i64
+; SSE-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; SSE-NEXT:    [[CVT1:%.*]] = fptosi float [[TMP6]] to i64
+; SSE-NEXT:    [[TMP7:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; SSE-NEXT:    [[CVT2:%.*]] = fptosi float [[TMP7]] to i64
+; SSE-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; SSE-NEXT:    [[CVT3:%.*]] = fptosi float [[TMP8]] to i64
+; SSE-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; SSE-NEXT:    [[CVT4:%.*]] = fptosi float [[TMP9]] to i64
+; SSE-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; SSE-NEXT:    [[CVT5:%.*]] = fptosi float [[TMP10]] to i64
+; SSE-NEXT:    [[TMP11:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; SSE-NEXT:    [[CVT6:%.*]] = fptosi float [[TMP11]] to i64
+; SSE-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; SSE-NEXT:    [[CVT7:%.*]] = fptosi float [[TMP12]] to i64
+; SSE-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> poison, i64 [[CVT0]], i32 0
+; SSE-NEXT:    [[TMP14:%.*]] = insertelement <2 x i64> [[TMP13]], i64 [[CVT1]], i32 1
+; SSE-NEXT:    store <2 x i64> [[TMP14]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP15:%.*]] = insertelement <2 x i64> poison, i64 [[CVT2]], i32 0
+; SSE-NEXT:    [[TMP16:%.*]] = insertelement <2 x i64> [[TMP15]], i64 [[CVT3]], i32 1
+; SSE-NEXT:    store <2 x i64> [[TMP16]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP17:%.*]] = insertelement <2 x i64> poison, i64 [[CVT4]], i32 0
+; SSE-NEXT:    [[TMP18:%.*]] = insertelement <2 x i64> [[TMP17]], i64 [[CVT5]], i32 1
+; SSE-NEXT:    store <2 x i64> [[TMP18]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[CVT6]], i32 0
+; SSE-NEXT:    [[TMP20:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[CVT7]], i32 1
+; SSE-NEXT:    store <2 x i64> [[TMP20]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8
 ; SSE-NEXT:    ret void
 ;
 ; AVX256NODQ-LABEL: @fptosi_8f32_8i64(
-; AVX256NODQ-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; AVX256NODQ-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; AVX256NODQ-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; AVX256NODQ-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; AVX256NODQ-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; AVX256NODQ-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; AVX256NODQ-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; AVX256NODQ-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptosi float [[A0]] to i64
-; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptosi float [[A1]] to i64
-; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptosi float [[A2]] to i64
-; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptosi float [[A3]] to i64
-; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptosi float [[A4]] to i64
-; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptosi float [[A5]] to i64
-; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptosi float [[A6]] to i64
-; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptosi float [[A7]] to i64
-; AVX256NODQ-NEXT:    store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; AVX256NODQ-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; AVX256NODQ-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptosi float [[TMP3]] to i64
+; AVX256NODQ-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptosi float [[TMP4]] to i64
+; AVX256NODQ-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptosi float [[TMP5]] to i64
+; AVX256NODQ-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptosi float [[TMP6]] to i64
+; AVX256NODQ-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptosi float [[TMP7]] to i64
+; AVX256NODQ-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptosi float [[TMP8]] to i64
+; AVX256NODQ-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptosi float [[TMP9]] to i64
+; AVX256NODQ-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptosi float [[TMP10]] to i64
+; AVX256NODQ-NEXT:    [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[CVT0]], i32 0
+; AVX256NODQ-NEXT:    [[TMP12:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[CVT1]], i32 1
+; AVX256NODQ-NEXT:    [[TMP13:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[CVT2]], i32 2
+; AVX256NODQ-NEXT:    [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[CVT3]], i32 3
+; AVX256NODQ-NEXT:    store <4 x i64> [[TMP14]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
+; AVX256NODQ-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[CVT4]], i32 0
+; AVX256NODQ-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[CVT5]], i32 1
+; AVX256NODQ-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[CVT6]], i32 2
+; AVX256NODQ-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[CVT7]], i32 3
+; AVX256NODQ-NEXT:    store <4 x i64> [[TMP18]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX256NODQ-NEXT:    ret void
 ;
+; AVX256NODQ-BDVER1-LABEL: @fptosi_8f32_8i64(
+; AVX256NODQ-BDVER1-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; AVX256NODQ-BDVER1-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; AVX256NODQ-BDVER1-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[CVT0:%.*]] = fptosi float [[TMP3]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[CVT1:%.*]] = fptosi float [[TMP4]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[CVT2:%.*]] = fptosi float [[TMP5]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    [[CVT3:%.*]] = fptosi float [[TMP6]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[CVT4:%.*]] = fptosi float [[TMP7]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[CVT5:%.*]] = fptosi float [[TMP8]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[CVT6:%.*]] = fptosi float [[TMP9]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    [[CVT7:%.*]] = fptosi float [[TMP10]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[CVT0]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[TMP12:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[CVT1]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[TMP13:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[CVT2]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[CVT3]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    store <4 x i64> [[TMP14]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[CVT4]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[CVT5]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[CVT6]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[CVT7]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    store <4 x i64> [[TMP18]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256NODQ-BDVER1-NEXT:    ret void
+;
+; AVX256NODQ-AVX2-LABEL: @fptosi_8f32_8i64(
+; AVX256NODQ-AVX2-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; AVX256NODQ-AVX2-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; AVX256NODQ-AVX2-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[CVT0:%.*]] = fptosi float [[TMP3]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[CVT1:%.*]] = fptosi float [[TMP4]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[CVT2:%.*]] = fptosi float [[TMP5]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; AVX256NODQ-AVX2-NEXT:    [[CVT3:%.*]] = fptosi float [[TMP6]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[CVT4:%.*]] = fptosi float [[TMP7]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[CVT5:%.*]] = fptosi float [[TMP8]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[CVT6:%.*]] = fptosi float [[TMP9]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; AVX256NODQ-AVX2-NEXT:    [[CVT7:%.*]] = fptosi float [[TMP10]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[CVT0]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[CVT1]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[CVT2]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[CVT3]], i32 3
+; AVX256NODQ-AVX2-NEXT:    store <4 x i64> [[TMP14]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
+; AVX256NODQ-AVX2-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[CVT4]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[CVT5]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[CVT6]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[CVT7]], i32 3
+; AVX256NODQ-AVX2-NEXT:    store <4 x i64> [[TMP18]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256NODQ-AVX2-NEXT:    ret void
+;
 ; AVX512-LABEL: @fptosi_8f32_8i64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
 ; AVX512-NEXT:    [[TMP2:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i64>
Index: llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX256NODQ-BDVER1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX256NODQ-AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256DQ
 
@@ -21,59 +21,137 @@
 
 define void @fptoui_8f64_8i64() #0 {
 ; SSE-LABEL: @fptoui_8f64_8i64(
-; SSE-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; SSE-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; SSE-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; SSE-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; SSE-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; SSE-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; SSE-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; SSE-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; SSE-NEXT:    [[CVT0:%.*]] = fptoui double [[A0]] to i64
-; SSE-NEXT:    [[CVT1:%.*]] = fptoui double [[A1]] to i64
-; SSE-NEXT:    [[CVT2:%.*]] = fptoui double [[A2]] to i64
-; SSE-NEXT:    [[CVT3:%.*]] = fptoui double [[A3]] to i64
-; SSE-NEXT:    [[CVT4:%.*]] = fptoui double [[A4]] to i64
-; SSE-NEXT:    [[CVT5:%.*]] = fptoui double [[A5]] to i64
-; SSE-NEXT:    [[CVT6:%.*]] = fptoui double [[A6]] to i64
-; SSE-NEXT:    [[CVT7:%.*]] = fptoui double [[A7]] to i64
-; SSE-NEXT:    store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
-; SSE-NEXT:    store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT:    store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
-; SSE-NEXT:    store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
-; SSE-NEXT:    store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
-; SSE-NEXT:    store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
-; SSE-NEXT:    store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
-; SSE-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; SSE-NEXT:    [[CVT0:%.*]] = fptoui double [[TMP5]] to i64
+; SSE-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; SSE-NEXT:    [[CVT1:%.*]] = fptoui double [[TMP6]] to i64
+; SSE-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; SSE-NEXT:    [[CVT2:%.*]] = fptoui double [[TMP7]] to i64
+; SSE-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; SSE-NEXT:    [[CVT3:%.*]] = fptoui double [[TMP8]] to i64
+; SSE-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; SSE-NEXT:    [[CVT4:%.*]] = fptoui double [[TMP9]] to i64
+; SSE-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; SSE-NEXT:    [[CVT5:%.*]] = fptoui double [[TMP10]] to i64
+; SSE-NEXT:    [[TMP11:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; SSE-NEXT:    [[CVT6:%.*]] = fptoui double [[TMP11]] to i64
+; SSE-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; SSE-NEXT:    [[CVT7:%.*]] = fptoui double [[TMP12]] to i64
+; SSE-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> poison, i64 [[CVT0]], i32 0
+; SSE-NEXT:    [[TMP14:%.*]] = insertelement <2 x i64> [[TMP13]], i64 [[CVT1]], i32 1
+; SSE-NEXT:    store <2 x i64> [[TMP14]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP15:%.*]] = insertelement <2 x i64> poison, i64 [[CVT2]], i32 0
+; SSE-NEXT:    [[TMP16:%.*]] = insertelement <2 x i64> [[TMP15]], i64 [[CVT3]], i32 1
+; SSE-NEXT:    store <2 x i64> [[TMP16]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP17:%.*]] = insertelement <2 x i64> poison, i64 [[CVT4]], i32 0
+; SSE-NEXT:    [[TMP18:%.*]] = insertelement <2 x i64> [[TMP17]], i64 [[CVT5]], i32 1
+; SSE-NEXT:    store <2 x i64> [[TMP18]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[CVT6]], i32 0
+; SSE-NEXT:    [[TMP20:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[CVT7]], i32 1
+; SSE-NEXT:    store <2 x i64> [[TMP20]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8
 ; SSE-NEXT:    ret void
 ;
 ; AVX256NODQ-LABEL: @fptoui_8f64_8i64(
-; AVX256NODQ-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; AVX256NODQ-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; AVX256NODQ-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; AVX256NODQ-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; AVX256NODQ-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; AVX256NODQ-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; AVX256NODQ-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; AVX256NODQ-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptoui double [[A0]] to i64
-; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptoui double [[A1]] to i64
-; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptoui double [[A2]] to i64
-; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptoui double [[A3]] to i64
-; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptoui double [[A4]] to i64
-; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptoui double [[A5]] to i64
-; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptoui double [[A6]] to i64
-; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptoui double [[A7]] to i64
-; AVX256NODQ-NEXT:    store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; AVX256NODQ-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; AVX256NODQ-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
+; AVX256NODQ-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
+; AVX256NODQ-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptoui double [[TMP5]] to i64
+; AVX256NODQ-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptoui double [[TMP6]] to i64
+; AVX256NODQ-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptoui double [[TMP7]] to i64
+; AVX256NODQ-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptoui double [[TMP8]] to i64
+; AVX256NODQ-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptoui double [[TMP9]] to i64
+; AVX256NODQ-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptoui double [[TMP10]] to i64
+; AVX256NODQ-NEXT:    [[TMP11:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptoui double [[TMP11]] to i64
+; AVX256NODQ-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptoui double [[TMP12]] to i64
+; AVX256NODQ-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> poison, i64 [[CVT0]], i32 0
+; AVX256NODQ-NEXT:    [[TMP14:%.*]] = insertelement <2 x i64> [[TMP13]], i64 [[CVT1]], i32 1
+; AVX256NODQ-NEXT:    store <2 x i64> [[TMP14]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8
+; AVX256NODQ-NEXT:    [[TMP15:%.*]] = insertelement <2 x i64> poison, i64 [[CVT2]], i32 0
+; AVX256NODQ-NEXT:    [[TMP16:%.*]] = insertelement <2 x i64> [[TMP15]], i64 [[CVT3]], i32 1
+; AVX256NODQ-NEXT:    store <2 x i64> [[TMP16]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX256NODQ-NEXT:    [[TMP17:%.*]] = insertelement <2 x i64> poison, i64 [[CVT4]], i32 0
+; AVX256NODQ-NEXT:    [[TMP18:%.*]] = insertelement <2 x i64> [[TMP17]], i64 [[CVT5]], i32 1
+; AVX256NODQ-NEXT:    store <2 x i64> [[TMP18]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX256NODQ-NEXT:    [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[CVT6]], i32 0
+; AVX256NODQ-NEXT:    [[TMP20:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[CVT7]], i32 1
+; AVX256NODQ-NEXT:    store <2 x i64> [[TMP20]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8
 ; AVX256NODQ-NEXT:    ret void
 ;
+; AVX256NODQ-BDVER1-LABEL: @fptoui_8f64_8i64(
+; AVX256NODQ-BDVER1-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[CVT0:%.*]] = fptoui double [[TMP3]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[CVT1:%.*]] = fptoui double [[TMP4]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[CVT2:%.*]] = fptoui double [[TMP5]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    [[CVT3:%.*]] = fptoui double [[TMP6]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP2]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[CVT4:%.*]] = fptoui double [[TMP7]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP8:%.*]] = extractelement <4 x double> [[TMP2]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[CVT5:%.*]] = fptoui double [[TMP8]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP9:%.*]] = extractelement <4 x double> [[TMP2]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[CVT6:%.*]] = fptoui double [[TMP9]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP10:%.*]] = extractelement <4 x double> [[TMP2]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    [[CVT7:%.*]] = fptoui double [[TMP10]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[CVT0]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[TMP12:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[CVT1]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[TMP13:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[CVT2]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[CVT3]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    store <4 x i64> [[TMP14]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[CVT4]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[CVT5]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[CVT6]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[CVT7]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    store <4 x i64> [[TMP18]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256NODQ-BDVER1-NEXT:    ret void
+;
+; AVX256NODQ-AVX2-LABEL: @fptoui_8f64_8i64(
+; AVX256NODQ-AVX2-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX256NODQ-AVX2-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX256NODQ-AVX2-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[CVT0:%.*]] = fptoui double [[TMP3]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[CVT1:%.*]] = fptoui double [[TMP4]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[CVT2:%.*]] = fptoui double [[TMP5]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
+; AVX256NODQ-AVX2-NEXT:    [[CVT3:%.*]] = fptoui double [[TMP6]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP2]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[CVT4:%.*]] = fptoui double [[TMP7]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP8:%.*]] = extractelement <4 x double> [[TMP2]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[CVT5:%.*]] = fptoui double [[TMP8]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP9:%.*]] = extractelement <4 x double> [[TMP2]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[CVT6:%.*]] = fptoui double [[TMP9]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP10:%.*]] = extractelement <4 x double> [[TMP2]], i32 3
+; AVX256NODQ-AVX2-NEXT:    [[CVT7:%.*]] = fptoui double [[TMP10]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[CVT0]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[CVT1]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[CVT2]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[CVT3]], i32 3
+; AVX256NODQ-AVX2-NEXT:    store <4 x i64> [[TMP14]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
+; AVX256NODQ-AVX2-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[CVT4]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[CVT5]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[CVT6]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[CVT7]], i32 3
+; AVX256NODQ-AVX2-NEXT:    store <4 x i64> [[TMP18]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256NODQ-AVX2-NEXT:    ret void
+;
 ; AVX512-LABEL: @fptoui_8f64_8i64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
 ; AVX512-NEXT:    [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i64>
@@ -118,59 +196,123 @@
 
 define void @fptoui_8f64_8i32() #0 {
 ; SSE-LABEL: @fptoui_8f64_8i32(
-; SSE-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; SSE-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; SSE-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; SSE-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; SSE-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; SSE-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; SSE-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; SSE-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; SSE-NEXT:    [[CVT0:%.*]] = fptoui double [[A0]] to i32
-; SSE-NEXT:    [[CVT1:%.*]] = fptoui double [[A1]] to i32
-; SSE-NEXT:    [[CVT2:%.*]] = fptoui double [[A2]] to i32
-; SSE-NEXT:    [[CVT3:%.*]] = fptoui double [[A3]] to i32
-; SSE-NEXT:    [[CVT4:%.*]] = fptoui double [[A4]] to i32
-; SSE-NEXT:    [[CVT5:%.*]] = fptoui double [[A5]] to i32
-; SSE-NEXT:    [[CVT6:%.*]] = fptoui double [[A6]] to i32
-; SSE-NEXT:    [[CVT7:%.*]] = fptoui double [[A7]] to i32
-; SSE-NEXT:    store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4
-; SSE-NEXT:    store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT:    store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4
-; SSE-NEXT:    store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4
-; SSE-NEXT:    store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4
-; SSE-NEXT:    store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4
-; SSE-NEXT:    store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4
-; SSE-NEXT:    store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
+; SSE-NEXT:    [[CVT0:%.*]] = fptoui double [[TMP3]] to i32
+; SSE-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
+; SSE-NEXT:    [[CVT1:%.*]] = fptoui double [[TMP4]] to i32
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
+; SSE-NEXT:    [[CVT2:%.*]] = fptoui double [[TMP5]] to i32
+; SSE-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
+; SSE-NEXT:    [[CVT3:%.*]] = fptoui double [[TMP6]] to i32
+; SSE-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP2]], i32 0
+; SSE-NEXT:    [[CVT4:%.*]] = fptoui double [[TMP7]] to i32
+; SSE-NEXT:    [[TMP8:%.*]] = extractelement <4 x double> [[TMP2]], i32 1
+; SSE-NEXT:    [[CVT5:%.*]] = fptoui double [[TMP8]] to i32
+; SSE-NEXT:    [[TMP9:%.*]] = extractelement <4 x double> [[TMP2]], i32 2
+; SSE-NEXT:    [[CVT6:%.*]] = fptoui double [[TMP9]] to i32
+; SSE-NEXT:    [[TMP10:%.*]] = extractelement <4 x double> [[TMP2]], i32 3
+; SSE-NEXT:    [[CVT7:%.*]] = fptoui double [[TMP10]] to i32
+; SSE-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[CVT0]], i32 0
+; SSE-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CVT1]], i32 1
+; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CVT2]], i32 2
+; SSE-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[CVT3]], i32 3
+; SSE-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> poison, i32 [[CVT4]], i32 0
+; SSE-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[CVT5]], i32 1
+; SSE-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[CVT6]], i32 2
+; SSE-NEXT:    [[TMP18:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[CVT7]], i32 3
+; SSE-NEXT:    store <4 x i32> [[TMP18]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; AVX256NODQ-LABEL: @fptoui_8f64_8i32(
-; AVX256NODQ-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; AVX256NODQ-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; AVX256NODQ-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; AVX256NODQ-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; AVX256NODQ-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; AVX256NODQ-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; AVX256NODQ-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; AVX256NODQ-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptoui double [[A0]] to i32
-; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptoui double [[A1]] to i32
-; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptoui double [[A2]] to i32
-; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptoui double [[A3]] to i32
-; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptoui double [[A4]] to i32
-; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptoui double [[A5]] to i32
-; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptoui double [[A6]] to i32
-; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptoui double [[A7]] to i32
-; AVX256NODQ-NEXT:    store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
+; AVX256NODQ-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX256NODQ-NEXT:    [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptoui double [[TMP2]] to i32
+; AVX256NODQ-NEXT:    [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptoui double [[TMP3]] to i32
+; AVX256NODQ-NEXT:    [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptoui double [[TMP4]] to i32
+; AVX256NODQ-NEXT:    [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptoui double [[TMP5]] to i32
+; AVX256NODQ-NEXT:    [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
+; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptoui double [[TMP6]] to i32
+; AVX256NODQ-NEXT:    [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
+; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptoui double [[TMP7]] to i32
+; AVX256NODQ-NEXT:    [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
+; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptoui double [[TMP8]] to i32
+; AVX256NODQ-NEXT:    [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
+; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptoui double [[TMP9]] to i32
+; AVX256NODQ-NEXT:    [[TMP10:%.*]] = insertelement <8 x i32> poison, i32 [[CVT0]], i32 0
+; AVX256NODQ-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[CVT1]], i32 1
+; AVX256NODQ-NEXT:    [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[CVT2]], i32 2
+; AVX256NODQ-NEXT:    [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[CVT3]], i32 3
+; AVX256NODQ-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[CVT4]], i32 4
+; AVX256NODQ-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[CVT5]], i32 5
+; AVX256NODQ-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[CVT6]], i32 6
+; AVX256NODQ-NEXT:    [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[CVT7]], i32 7
+; AVX256NODQ-NEXT:    store <8 x i32> [[TMP17]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
 ; AVX256NODQ-NEXT:    ret void
 ;
+; AVX256NODQ-BDVER1-LABEL: @fptoui_8f64_8i32(
+; AVX256NODQ-BDVER1-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[CVT0:%.*]] = fptoui double [[TMP2]] to i32
+; AVX256NODQ-BDVER1-NEXT:    [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[CVT1:%.*]] = fptoui double [[TMP3]] to i32
+; AVX256NODQ-BDVER1-NEXT:    [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[CVT2:%.*]] = fptoui double [[TMP4]] to i32
+; AVX256NODQ-BDVER1-NEXT:    [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    [[CVT3:%.*]] = fptoui double [[TMP5]] to i32
+; AVX256NODQ-BDVER1-NEXT:    [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
+; AVX256NODQ-BDVER1-NEXT:    [[CVT4:%.*]] = fptoui double [[TMP6]] to i32
+; AVX256NODQ-BDVER1-NEXT:    [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
+; AVX256NODQ-BDVER1-NEXT:    [[CVT5:%.*]] = fptoui double [[TMP7]] to i32
+; AVX256NODQ-BDVER1-NEXT:    [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
+; AVX256NODQ-BDVER1-NEXT:    [[CVT6:%.*]] = fptoui double [[TMP8]] to i32
+; AVX256NODQ-BDVER1-NEXT:    [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
+; AVX256NODQ-BDVER1-NEXT:    [[CVT7:%.*]] = fptoui double [[TMP9]] to i32
+; AVX256NODQ-BDVER1-NEXT:    [[TMP10:%.*]] = insertelement <8 x i32> poison, i32 [[CVT0]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[CVT1]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[CVT2]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[CVT3]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[CVT4]], i32 4
+; AVX256NODQ-BDVER1-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[CVT5]], i32 5
+; AVX256NODQ-BDVER1-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[CVT6]], i32 6
+; AVX256NODQ-BDVER1-NEXT:    [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[CVT7]], i32 7
+; AVX256NODQ-BDVER1-NEXT:    store <8 x i32> [[TMP17]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
+; AVX256NODQ-BDVER1-NEXT:    ret void
+;
+; AVX256NODQ-AVX2-LABEL: @fptoui_8f64_8i32(
+; AVX256NODQ-AVX2-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX256NODQ-AVX2-NEXT:    [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[CVT0:%.*]] = fptoui double [[TMP2]] to i32
+; AVX256NODQ-AVX2-NEXT:    [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[CVT1:%.*]] = fptoui double [[TMP3]] to i32
+; AVX256NODQ-AVX2-NEXT:    [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[CVT2:%.*]] = fptoui double [[TMP4]] to i32
+; AVX256NODQ-AVX2-NEXT:    [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
+; AVX256NODQ-AVX2-NEXT:    [[CVT3:%.*]] = fptoui double [[TMP5]] to i32
+; AVX256NODQ-AVX2-NEXT:    [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
+; AVX256NODQ-AVX2-NEXT:    [[CVT4:%.*]] = fptoui double [[TMP6]] to i32
+; AVX256NODQ-AVX2-NEXT:    [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
+; AVX256NODQ-AVX2-NEXT:    [[CVT5:%.*]] = fptoui double [[TMP7]] to i32
+; AVX256NODQ-AVX2-NEXT:    [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
+; AVX256NODQ-AVX2-NEXT:    [[CVT6:%.*]] = fptoui double [[TMP8]] to i32
+; AVX256NODQ-AVX2-NEXT:    [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
+; AVX256NODQ-AVX2-NEXT:    [[CVT7:%.*]] = fptoui double [[TMP9]] to i32
+; AVX256NODQ-AVX2-NEXT:    [[TMP10:%.*]] = insertelement <8 x i32> poison, i32 [[CVT0]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[CVT1]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[CVT2]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[CVT3]], i32 3
+; AVX256NODQ-AVX2-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[CVT4]], i32 4
+; AVX256NODQ-AVX2-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[CVT5]], i32 5
+; AVX256NODQ-AVX2-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[CVT6]], i32 6
+; AVX256NODQ-AVX2-NEXT:    [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[CVT7]], i32 7
+; AVX256NODQ-AVX2-NEXT:    store <8 x i32> [[TMP17]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
+; AVX256NODQ-AVX2-NEXT:    ret void
+;
 ; AVX-LABEL: @fptoui_8f64_8i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
 ; AVX-NEXT:    [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i32>
@@ -299,59 +441,133 @@
 
 define void @fptoui_8f32_8i64() #0 {
 ; SSE-LABEL: @fptoui_8f32_8i64(
-; SSE-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; SSE-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; SSE-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; SSE-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; SSE-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; SSE-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; SSE-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; SSE-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; SSE-NEXT:    [[CVT0:%.*]] = fptoui float [[A0]] to i64
-; SSE-NEXT:    [[CVT1:%.*]] = fptoui float [[A1]] to i64
-; SSE-NEXT:    [[CVT2:%.*]] = fptoui float [[A2]] to i64
-; SSE-NEXT:    [[CVT3:%.*]] = fptoui float [[A3]] to i64
-; SSE-NEXT:    [[CVT4:%.*]] = fptoui float [[A4]] to i64
-; SSE-NEXT:    [[CVT5:%.*]] = fptoui float [[A5]] to i64
-; SSE-NEXT:    [[CVT6:%.*]] = fptoui float [[A6]] to i64
-; SSE-NEXT:    [[CVT7:%.*]] = fptoui float [[A7]] to i64
-; SSE-NEXT:    store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
-; SSE-NEXT:    store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT:    store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
-; SSE-NEXT:    store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
-; SSE-NEXT:    store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
-; SSE-NEXT:    store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
-; SSE-NEXT:    store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
-; SSE-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([16 x float]* @src32 to <2 x float>*), align 4
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2) to <2 x float>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <2 x float>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6) to <2 x float>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; SSE-NEXT:    [[CVT0:%.*]] = fptoui float [[TMP5]] to i64
+; SSE-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; SSE-NEXT:    [[CVT1:%.*]] = fptoui float [[TMP6]] to i64
+; SSE-NEXT:    [[TMP7:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; SSE-NEXT:    [[CVT2:%.*]] = fptoui float [[TMP7]] to i64
+; SSE-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; SSE-NEXT:    [[CVT3:%.*]] = fptoui float [[TMP8]] to i64
+; SSE-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; SSE-NEXT:    [[CVT4:%.*]] = fptoui float [[TMP9]] to i64
+; SSE-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; SSE-NEXT:    [[CVT5:%.*]] = fptoui float [[TMP10]] to i64
+; SSE-NEXT:    [[TMP11:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; SSE-NEXT:    [[CVT6:%.*]] = fptoui float [[TMP11]] to i64
+; SSE-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; SSE-NEXT:    [[CVT7:%.*]] = fptoui float [[TMP12]] to i64
+; SSE-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> poison, i64 [[CVT0]], i32 0
+; SSE-NEXT:    [[TMP14:%.*]] = insertelement <2 x i64> [[TMP13]], i64 [[CVT1]], i32 1
+; SSE-NEXT:    store <2 x i64> [[TMP14]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP15:%.*]] = insertelement <2 x i64> poison, i64 [[CVT2]], i32 0
+; SSE-NEXT:    [[TMP16:%.*]] = insertelement <2 x i64> [[TMP15]], i64 [[CVT3]], i32 1
+; SSE-NEXT:    store <2 x i64> [[TMP16]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP17:%.*]] = insertelement <2 x i64> poison, i64 [[CVT4]], i32 0
+; SSE-NEXT:    [[TMP18:%.*]] = insertelement <2 x i64> [[TMP17]], i64 [[CVT5]], i32 1
+; SSE-NEXT:    store <2 x i64> [[TMP18]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[CVT6]], i32 0
+; SSE-NEXT:    [[TMP20:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[CVT7]], i32 1
+; SSE-NEXT:    store <2 x i64> [[TMP20]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8
 ; SSE-NEXT:    ret void
 ;
 ; AVX256NODQ-LABEL: @fptoui_8f32_8i64(
-; AVX256NODQ-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; AVX256NODQ-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; AVX256NODQ-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; AVX256NODQ-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; AVX256NODQ-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; AVX256NODQ-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; AVX256NODQ-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; AVX256NODQ-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptoui float [[A0]] to i64
-; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptoui float [[A1]] to i64
-; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptoui float [[A2]] to i64
-; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptoui float [[A3]] to i64
-; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptoui float [[A4]] to i64
-; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptoui float [[A5]] to i64
-; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptoui float [[A6]] to i64
-; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptoui float [[A7]] to i64
-; AVX256NODQ-NEXT:    store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
-; AVX256NODQ-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; AVX256NODQ-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; AVX256NODQ-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptoui float [[TMP3]] to i64
+; AVX256NODQ-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptoui float [[TMP4]] to i64
+; AVX256NODQ-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptoui float [[TMP5]] to i64
+; AVX256NODQ-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptoui float [[TMP6]] to i64
+; AVX256NODQ-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptoui float [[TMP7]] to i64
+; AVX256NODQ-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptoui float [[TMP8]] to i64
+; AVX256NODQ-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptoui float [[TMP9]] to i64
+; AVX256NODQ-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptoui float [[TMP10]] to i64
+; AVX256NODQ-NEXT:    [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[CVT0]], i32 0
+; AVX256NODQ-NEXT:    [[TMP12:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[CVT1]], i32 1
+; AVX256NODQ-NEXT:    [[TMP13:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[CVT2]], i32 2
+; AVX256NODQ-NEXT:    [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[CVT3]], i32 3
+; AVX256NODQ-NEXT:    store <4 x i64> [[TMP14]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
+; AVX256NODQ-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[CVT4]], i32 0
+; AVX256NODQ-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[CVT5]], i32 1
+; AVX256NODQ-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[CVT6]], i32 2
+; AVX256NODQ-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[CVT7]], i32 3
+; AVX256NODQ-NEXT:    store <4 x i64> [[TMP18]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX256NODQ-NEXT:    ret void
 ;
+; AVX256NODQ-BDVER1-LABEL: @fptoui_8f32_8i64(
+; AVX256NODQ-BDVER1-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; AVX256NODQ-BDVER1-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; AVX256NODQ-BDVER1-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[CVT0:%.*]] = fptoui float [[TMP3]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[CVT1:%.*]] = fptoui float [[TMP4]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[CVT2:%.*]] = fptoui float [[TMP5]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    [[CVT3:%.*]] = fptoui float [[TMP6]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[CVT4:%.*]] = fptoui float [[TMP7]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[CVT5:%.*]] = fptoui float [[TMP8]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[CVT6:%.*]] = fptoui float [[TMP9]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    [[CVT7:%.*]] = fptoui float [[TMP10]] to i64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[CVT0]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[TMP12:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[CVT1]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[TMP13:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[CVT2]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[CVT3]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    store <4 x i64> [[TMP14]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[CVT4]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[CVT5]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[CVT6]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[CVT7]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    store <4 x i64> [[TMP18]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256NODQ-BDVER1-NEXT:    ret void
+;
+; AVX256NODQ-AVX2-LABEL: @fptoui_8f32_8i64(
+; AVX256NODQ-AVX2-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; AVX256NODQ-AVX2-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; AVX256NODQ-AVX2-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[CVT0:%.*]] = fptoui float [[TMP3]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[CVT1:%.*]] = fptoui float [[TMP4]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[CVT2:%.*]] = fptoui float [[TMP5]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; AVX256NODQ-AVX2-NEXT:    [[CVT3:%.*]] = fptoui float [[TMP6]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[CVT4:%.*]] = fptoui float [[TMP7]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[CVT5:%.*]] = fptoui float [[TMP8]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[CVT6:%.*]] = fptoui float [[TMP9]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; AVX256NODQ-AVX2-NEXT:    [[CVT7:%.*]] = fptoui float [[TMP10]] to i64
+; AVX256NODQ-AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[CVT0]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[CVT1]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[CVT2]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[CVT3]], i32 3
+; AVX256NODQ-AVX2-NEXT:    store <4 x i64> [[TMP14]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
+; AVX256NODQ-AVX2-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[CVT4]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[CVT5]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[CVT6]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[CVT7]], i32 3
+; AVX256NODQ-AVX2-NEXT:    store <4 x i64> [[TMP18]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256NODQ-AVX2-NEXT:    ret void
+;
 ; AVX512-LABEL: @fptoui_8f32_8i64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
 ; AVX512-NEXT:    [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i64>
@@ -396,59 +612,123 @@
 
 define void @fptoui_8f32_8i32() #0 {
 ; SSE-LABEL: @fptoui_8f32_8i32(
-; SSE-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; SSE-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; SSE-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; SSE-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; SSE-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; SSE-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; SSE-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; SSE-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; SSE-NEXT:    [[CVT0:%.*]] = fptoui float [[A0]] to i32
-; SSE-NEXT:    [[CVT1:%.*]] = fptoui float [[A1]] to i32
-; SSE-NEXT:    [[CVT2:%.*]] = fptoui float [[A2]] to i32
-; SSE-NEXT:    [[CVT3:%.*]] = fptoui float [[A3]] to i32
-; SSE-NEXT:    [[CVT4:%.*]] = fptoui float [[A4]] to i32
-; SSE-NEXT:    [[CVT5:%.*]] = fptoui float [[A5]] to i32
-; SSE-NEXT:    [[CVT6:%.*]] = fptoui float [[A6]] to i32
-; SSE-NEXT:    [[CVT7:%.*]] = fptoui float [[A7]] to i32
-; SSE-NEXT:    store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4
-; SSE-NEXT:    store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT:    store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4
-; SSE-NEXT:    store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4
-; SSE-NEXT:    store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4
-; SSE-NEXT:    store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4
-; SSE-NEXT:    store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4
-; SSE-NEXT:    store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; SSE-NEXT:    [[CVT0:%.*]] = fptoui float [[TMP3]] to i32
+; SSE-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; SSE-NEXT:    [[CVT1:%.*]] = fptoui float [[TMP4]] to i32
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; SSE-NEXT:    [[CVT2:%.*]] = fptoui float [[TMP5]] to i32
+; SSE-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; SSE-NEXT:    [[CVT3:%.*]] = fptoui float [[TMP6]] to i32
+; SSE-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; SSE-NEXT:    [[CVT4:%.*]] = fptoui float [[TMP7]] to i32
+; SSE-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; SSE-NEXT:    [[CVT5:%.*]] = fptoui float [[TMP8]] to i32
+; SSE-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; SSE-NEXT:    [[CVT6:%.*]] = fptoui float [[TMP9]] to i32
+; SSE-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; SSE-NEXT:    [[CVT7:%.*]] = fptoui float [[TMP10]] to i32
+; SSE-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[CVT0]], i32 0
+; SSE-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CVT1]], i32 1
+; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CVT2]], i32 2
+; SSE-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[CVT3]], i32 3
+; SSE-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> poison, i32 [[CVT4]], i32 0
+; SSE-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[CVT5]], i32 1
+; SSE-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[CVT6]], i32 2
+; SSE-NEXT:    [[TMP18:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[CVT7]], i32 3
+; SSE-NEXT:    store <4 x i32> [[TMP18]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; AVX256NODQ-LABEL: @fptoui_8f32_8i32(
-; AVX256NODQ-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; AVX256NODQ-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; AVX256NODQ-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; AVX256NODQ-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; AVX256NODQ-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; AVX256NODQ-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; AVX256NODQ-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; AVX256NODQ-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptoui float [[A0]] to i32
-; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptoui float [[A1]] to i32
-; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptoui float [[A2]] to i32
-; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptoui float [[A3]] to i32
-; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptoui float [[A4]] to i32
-; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptoui float [[A5]] to i32
-; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptoui float [[A6]] to i32
-; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptoui float [[A7]] to i32
-; AVX256NODQ-NEXT:    store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
+; AVX256NODQ-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX256NODQ-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptoui float [[TMP2]] to i32
+; AVX256NODQ-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptoui float [[TMP3]] to i32
+; AVX256NODQ-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptoui float [[TMP4]] to i32
+; AVX256NODQ-NEXT:    [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptoui float [[TMP5]] to i32
+; AVX256NODQ-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4
+; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptoui float [[TMP6]] to i32
+; AVX256NODQ-NEXT:    [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5
+; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptoui float [[TMP7]] to i32
+; AVX256NODQ-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6
+; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptoui float [[TMP8]] to i32
+; AVX256NODQ-NEXT:    [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7
+; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptoui float [[TMP9]] to i32
+; AVX256NODQ-NEXT:    [[TMP10:%.*]] = insertelement <8 x i32> poison, i32 [[CVT0]], i32 0
+; AVX256NODQ-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[CVT1]], i32 1
+; AVX256NODQ-NEXT:    [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[CVT2]], i32 2
+; AVX256NODQ-NEXT:    [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[CVT3]], i32 3
+; AVX256NODQ-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[CVT4]], i32 4
+; AVX256NODQ-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[CVT5]], i32 5
+; AVX256NODQ-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[CVT6]], i32 6
+; AVX256NODQ-NEXT:    [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[CVT7]], i32 7
+; AVX256NODQ-NEXT:    store <8 x i32> [[TMP17]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
 ; AVX256NODQ-NEXT:    ret void
 ;
+; AVX256NODQ-BDVER1-LABEL: @fptoui_8f32_8i32(
+; AVX256NODQ-BDVER1-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX256NODQ-BDVER1-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[CVT0:%.*]] = fptoui float [[TMP2]] to i32
+; AVX256NODQ-BDVER1-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[CVT1:%.*]] = fptoui float [[TMP3]] to i32
+; AVX256NODQ-BDVER1-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[CVT2:%.*]] = fptoui float [[TMP4]] to i32
+; AVX256NODQ-BDVER1-NEXT:    [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    [[CVT3:%.*]] = fptoui float [[TMP5]] to i32
+; AVX256NODQ-BDVER1-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4
+; AVX256NODQ-BDVER1-NEXT:    [[CVT4:%.*]] = fptoui float [[TMP6]] to i32
+; AVX256NODQ-BDVER1-NEXT:    [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5
+; AVX256NODQ-BDVER1-NEXT:    [[CVT5:%.*]] = fptoui float [[TMP7]] to i32
+; AVX256NODQ-BDVER1-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6
+; AVX256NODQ-BDVER1-NEXT:    [[CVT6:%.*]] = fptoui float [[TMP8]] to i32
+; AVX256NODQ-BDVER1-NEXT:    [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7
+; AVX256NODQ-BDVER1-NEXT:    [[CVT7:%.*]] = fptoui float [[TMP9]] to i32
+; AVX256NODQ-BDVER1-NEXT:    [[TMP10:%.*]] = insertelement <8 x i32> poison, i32 [[CVT0]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[CVT1]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[CVT2]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[CVT3]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[CVT4]], i32 4
+; AVX256NODQ-BDVER1-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[CVT5]], i32 5
+; AVX256NODQ-BDVER1-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[CVT6]], i32 6
+; AVX256NODQ-BDVER1-NEXT:    [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[CVT7]], i32 7
+; AVX256NODQ-BDVER1-NEXT:    store <8 x i32> [[TMP17]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
+; AVX256NODQ-BDVER1-NEXT:    ret void
+;
+; AVX256NODQ-AVX2-LABEL: @fptoui_8f32_8i32(
+; AVX256NODQ-AVX2-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX256NODQ-AVX2-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[CVT0:%.*]] = fptoui float [[TMP2]] to i32
+; AVX256NODQ-AVX2-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[CVT1:%.*]] = fptoui float [[TMP3]] to i32
+; AVX256NODQ-AVX2-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[CVT2:%.*]] = fptoui float [[TMP4]] to i32
+; AVX256NODQ-AVX2-NEXT:    [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
+; AVX256NODQ-AVX2-NEXT:    [[CVT3:%.*]] = fptoui float [[TMP5]] to i32
+; AVX256NODQ-AVX2-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4
+; AVX256NODQ-AVX2-NEXT:    [[CVT4:%.*]] = fptoui float [[TMP6]] to i32
+; AVX256NODQ-AVX2-NEXT:    [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5
+; AVX256NODQ-AVX2-NEXT:    [[CVT5:%.*]] = fptoui float [[TMP7]] to i32
+; AVX256NODQ-AVX2-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6
+; AVX256NODQ-AVX2-NEXT:    [[CVT6:%.*]] = fptoui float [[TMP8]] to i32
+; AVX256NODQ-AVX2-NEXT:    [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7
+; AVX256NODQ-AVX2-NEXT:    [[CVT7:%.*]] = fptoui float [[TMP9]] to i32
+; AVX256NODQ-AVX2-NEXT:    [[TMP10:%.*]] = insertelement <8 x i32> poison, i32 [[CVT0]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[CVT1]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[CVT2]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[CVT3]], i32 3
+; AVX256NODQ-AVX2-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[CVT4]], i32 4
+; AVX256NODQ-AVX2-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[CVT5]], i32 5
+; AVX256NODQ-AVX2-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[CVT6]], i32 6
+; AVX256NODQ-AVX2-NEXT:    [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[CVT7]], i32 7
+; AVX256NODQ-AVX2-NEXT:    store <8 x i32> [[TMP17]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
+; AVX256NODQ-AVX2-NEXT:    ret void
+;
 ; AVX-LABEL: @fptoui_8f32_8i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
 ; AVX-NEXT:    [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i32>
Index: llvm/test/Transforms/SLPVectorizer/X86/fround.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/fround.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/fround.ll
@@ -4,7 +4,7 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX --check-prefix=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX --check-prefix=AVX2-SKYLAKE
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -27,12 +27,14 @@
 
 define void @ceil_2f64() #0 {
 ; SSE2-LABEL: @ceil_2f64(
-; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; SSE2-NEXT:    [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[LD0]])
-; SSE2-NEXT:    [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[LD1]])
-; SSE2-NEXT:    store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
-; SSE2-NEXT:    store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; SSE2-NEXT:    [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[TMP2]])
+; SSE2-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; SSE2-NEXT:    [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[TMP3]])
+; SSE2-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[CEIL0]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[CEIL1]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @ceil_2f64(
@@ -58,18 +60,22 @@
 
 define void @ceil_4f64() #0 {
 ; SSE2-LABEL: @ceil_4f64(
-; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; SSE2-NEXT:    [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; SSE2-NEXT:    [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; SSE2-NEXT:    [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[LD0]])
-; SSE2-NEXT:    [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[LD1]])
-; SSE2-NEXT:    [[CEIL2:%.*]] = call double @llvm.ceil.f64(double [[LD2]])
-; SSE2-NEXT:    [[CEIL3:%.*]] = call double @llvm.ceil.f64(double [[LD3]])
-; SSE2-NEXT:    store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
-; SSE2-NEXT:    store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE2-NEXT:    store double [[CEIL2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
-; SSE2-NEXT:    store double [[CEIL3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; SSE2-NEXT:    [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[TMP3]])
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; SSE2-NEXT:    [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[TMP4]])
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; SSE2-NEXT:    [[CEIL2:%.*]] = call double @llvm.ceil.f64(double [[TMP5]])
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; SSE2-NEXT:    [[CEIL3:%.*]] = call double @llvm.ceil.f64(double [[TMP6]])
+; SSE2-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[CEIL0]], i32 0
+; SSE2-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[CEIL1]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> poison, double [[CEIL2]], i32 0
+; SSE2-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[CEIL3]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @ceil_4f64(
@@ -104,30 +110,38 @@
 
 define void @ceil_8f64() #0 {
 ; SSE2-LABEL: @ceil_8f64(
-; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; SSE2-NEXT:    [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; SSE2-NEXT:    [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; SSE2-NEXT:    [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; SSE2-NEXT:    [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; SSE2-NEXT:    [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; SSE2-NEXT:    [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; SSE2-NEXT:    [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[LD0]])
-; SSE2-NEXT:    [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[LD1]])
-; SSE2-NEXT:    [[CEIL2:%.*]] = call double @llvm.ceil.f64(double [[LD2]])
-; SSE2-NEXT:    [[CEIL3:%.*]] = call double @llvm.ceil.f64(double [[LD3]])
-; SSE2-NEXT:    [[CEIL4:%.*]] = call double @llvm.ceil.f64(double [[LD4]])
-; SSE2-NEXT:    [[CEIL5:%.*]] = call double @llvm.ceil.f64(double [[LD5]])
-; SSE2-NEXT:    [[CEIL6:%.*]] = call double @llvm.ceil.f64(double [[LD6]])
-; SSE2-NEXT:    [[CEIL7:%.*]] = call double @llvm.ceil.f64(double [[LD7]])
-; SSE2-NEXT:    store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
-; SSE2-NEXT:    store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE2-NEXT:    store double [[CEIL2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
-; SSE2-NEXT:    store double [[CEIL3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
-; SSE2-NEXT:    store double [[CEIL4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
-; SSE2-NEXT:    store double [[CEIL5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
-; SSE2-NEXT:    store double [[CEIL6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
-; SSE2-NEXT:    store double [[CEIL7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; SSE2-NEXT:    [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[TMP5]])
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; SSE2-NEXT:    [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[TMP6]])
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; SSE2-NEXT:    [[CEIL2:%.*]] = call double @llvm.ceil.f64(double [[TMP7]])
+; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; SSE2-NEXT:    [[CEIL3:%.*]] = call double @llvm.ceil.f64(double [[TMP8]])
+; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; SSE2-NEXT:    [[CEIL4:%.*]] = call double @llvm.ceil.f64(double [[TMP9]])
+; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; SSE2-NEXT:    [[CEIL5:%.*]] = call double @llvm.ceil.f64(double [[TMP10]])
+; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; SSE2-NEXT:    [[CEIL6:%.*]] = call double @llvm.ceil.f64(double [[TMP11]])
+; SSE2-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; SSE2-NEXT:    [[CEIL7:%.*]] = call double @llvm.ceil.f64(double [[TMP12]])
+; SSE2-NEXT:    [[TMP13:%.*]] = insertelement <2 x double> poison, double [[CEIL0]], i32 0
+; SSE2-NEXT:    [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[CEIL1]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP14]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP15:%.*]] = insertelement <2 x double> poison, double [[CEIL2]], i32 0
+; SSE2-NEXT:    [[TMP16:%.*]] = insertelement <2 x double> [[TMP15]], double [[CEIL3]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP16]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP17:%.*]] = insertelement <2 x double> poison, double [[CEIL4]], i32 0
+; SSE2-NEXT:    [[TMP18:%.*]] = insertelement <2 x double> [[TMP17]], double [[CEIL5]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP18]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP19:%.*]] = insertelement <2 x double> poison, double [[CEIL6]], i32 0
+; SSE2-NEXT:    [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[CEIL7]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP20]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @ceil_8f64(
@@ -168,6 +182,15 @@
 ; AVX512-NEXT:    [[TMP2:%.*]] = call <8 x double> @llvm.ceil.v8f64(<8 x double> [[TMP1]])
 ; AVX512-NEXT:    store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
 ; AVX512-NEXT:    ret void
+;
+; AVX2-SKYLAKE-LABEL: @ceil_8f64(
+; AVX2-SKYLAKE-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX2-SKYLAKE-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX2-SKYLAKE-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]])
+; AVX2-SKYLAKE-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP2]])
+; AVX2-SKYLAKE-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; AVX2-SKYLAKE-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX2-SKYLAKE-NEXT:    ret void
 ;
   %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
   %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
@@ -198,12 +221,14 @@
 
 define void @floor_2f64() #0 {
 ; SSE2-LABEL: @floor_2f64(
-; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; SSE2-NEXT:    [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[LD0]])
-; SSE2-NEXT:    [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[LD1]])
-; SSE2-NEXT:    store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
-; SSE2-NEXT:    store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; SSE2-NEXT:    [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[TMP2]])
+; SSE2-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; SSE2-NEXT:    [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[TMP3]])
+; SSE2-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[FLOOR0]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[FLOOR1]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @floor_2f64(
@@ -229,18 +254,22 @@
 
 define void @floor_4f64() #0 {
 ; SSE2-LABEL: @floor_4f64(
-; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; SSE2-NEXT:    [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; SSE2-NEXT:    [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; SSE2-NEXT:    [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[LD0]])
-; SSE2-NEXT:    [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[LD1]])
-; SSE2-NEXT:    [[FLOOR2:%.*]] = call double @llvm.floor.f64(double [[LD2]])
-; SSE2-NEXT:    [[FLOOR3:%.*]] = call double @llvm.floor.f64(double [[LD3]])
-; SSE2-NEXT:    store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
-; SSE2-NEXT:    store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE2-NEXT:    store double [[FLOOR2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
-; SSE2-NEXT:    store double [[FLOOR3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; SSE2-NEXT:    [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[TMP3]])
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; SSE2-NEXT:    [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[TMP4]])
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; SSE2-NEXT:    [[FLOOR2:%.*]] = call double @llvm.floor.f64(double [[TMP5]])
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; SSE2-NEXT:    [[FLOOR3:%.*]] = call double @llvm.floor.f64(double [[TMP6]])
+; SSE2-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[FLOOR0]], i32 0
+; SSE2-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[FLOOR1]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> poison, double [[FLOOR2]], i32 0
+; SSE2-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[FLOOR3]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @floor_4f64(
@@ -275,30 +304,38 @@
 
 define void @floor_8f64() #0 {
 ; SSE2-LABEL: @floor_8f64(
-; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; SSE2-NEXT:    [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; SSE2-NEXT:    [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; SSE2-NEXT:    [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; SSE2-NEXT:    [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; SSE2-NEXT:    [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; SSE2-NEXT:    [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; SSE2-NEXT:    [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[LD0]])
-; SSE2-NEXT:    [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[LD1]])
-; SSE2-NEXT:    [[FLOOR2:%.*]] = call double @llvm.floor.f64(double [[LD2]])
-; SSE2-NEXT:    [[FLOOR3:%.*]] = call double @llvm.floor.f64(double [[LD3]])
-; SSE2-NEXT:    [[FLOOR4:%.*]] = call double @llvm.floor.f64(double [[LD4]])
-; SSE2-NEXT:    [[FLOOR5:%.*]] = call double @llvm.floor.f64(double [[LD5]])
-; SSE2-NEXT:    [[FLOOR6:%.*]] = call double @llvm.floor.f64(double [[LD6]])
-; SSE2-NEXT:    [[FLOOR7:%.*]] = call double @llvm.floor.f64(double [[LD7]])
-; SSE2-NEXT:    store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
-; SSE2-NEXT:    store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE2-NEXT:    store double [[FLOOR2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
-; SSE2-NEXT:    store double [[FLOOR3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
-; SSE2-NEXT:    store double [[FLOOR4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
-; SSE2-NEXT:    store double [[FLOOR5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
-; SSE2-NEXT:    store double [[FLOOR6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
-; SSE2-NEXT:    store double [[FLOOR7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; SSE2-NEXT:    [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[TMP5]])
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; SSE2-NEXT:    [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[TMP6]])
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; SSE2-NEXT:    [[FLOOR2:%.*]] = call double @llvm.floor.f64(double [[TMP7]])
+; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; SSE2-NEXT:    [[FLOOR3:%.*]] = call double @llvm.floor.f64(double [[TMP8]])
+; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; SSE2-NEXT:    [[FLOOR4:%.*]] = call double @llvm.floor.f64(double [[TMP9]])
+; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; SSE2-NEXT:    [[FLOOR5:%.*]] = call double @llvm.floor.f64(double [[TMP10]])
+; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; SSE2-NEXT:    [[FLOOR6:%.*]] = call double @llvm.floor.f64(double [[TMP11]])
+; SSE2-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; SSE2-NEXT:    [[FLOOR7:%.*]] = call double @llvm.floor.f64(double [[TMP12]])
+; SSE2-NEXT:    [[TMP13:%.*]] = insertelement <2 x double> poison, double [[FLOOR0]], i32 0
+; SSE2-NEXT:    [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[FLOOR1]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP14]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP15:%.*]] = insertelement <2 x double> poison, double [[FLOOR2]], i32 0
+; SSE2-NEXT:    [[TMP16:%.*]] = insertelement <2 x double> [[TMP15]], double [[FLOOR3]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP16]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP17:%.*]] = insertelement <2 x double> poison, double [[FLOOR4]], i32 0
+; SSE2-NEXT:    [[TMP18:%.*]] = insertelement <2 x double> [[TMP17]], double [[FLOOR5]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP18]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP19:%.*]] = insertelement <2 x double> poison, double [[FLOOR6]], i32 0
+; SSE2-NEXT:    [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[FLOOR7]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP20]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @floor_8f64(
@@ -339,6 +376,15 @@
 ; AVX512-NEXT:    [[TMP2:%.*]] = call <8 x double> @llvm.floor.v8f64(<8 x double> [[TMP1]])
 ; AVX512-NEXT:    store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
 ; AVX512-NEXT:    ret void
+;
+; AVX2-SKYLAKE-LABEL: @floor_8f64(
+; AVX2-SKYLAKE-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX2-SKYLAKE-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX2-SKYLAKE-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]])
+; AVX2-SKYLAKE-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP2]])
+; AVX2-SKYLAKE-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; AVX2-SKYLAKE-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX2-SKYLAKE-NEXT:    ret void
 ;
   %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
   %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
@@ -369,12 +415,14 @@
 
 define void @nearbyint_2f64() #0 {
 ; SSE2-LABEL: @nearbyint_2f64(
-; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; SSE2-NEXT:    [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[LD0]])
-; SSE2-NEXT:    [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[LD1]])
-; SSE2-NEXT:    store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
-; SSE2-NEXT:    store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; SSE2-NEXT:    [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[TMP2]])
+; SSE2-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; SSE2-NEXT:    [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[TMP3]])
+; SSE2-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[NEARBYINT0]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[NEARBYINT1]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @nearbyint_2f64(
@@ -400,18 +448,22 @@
 
 define void @nearbyint_4f64() #0 {
 ; SSE2-LABEL: @nearbyint_4f64(
-; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; SSE2-NEXT:    [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; SSE2-NEXT:    [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; SSE2-NEXT:    [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[LD0]])
-; SSE2-NEXT:    [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[LD1]])
-; SSE2-NEXT:    [[NEARBYINT2:%.*]] = call double @llvm.nearbyint.f64(double [[LD2]])
-; SSE2-NEXT:    [[NEARBYINT3:%.*]] = call double @llvm.nearbyint.f64(double [[LD3]])
-; SSE2-NEXT:    store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
-; SSE2-NEXT:    store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE2-NEXT:    store double [[NEARBYINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
-; SSE2-NEXT:    store double [[NEARBYINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; SSE2-NEXT:    [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[TMP3]])
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; SSE2-NEXT:    [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[TMP4]])
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; SSE2-NEXT:    [[NEARBYINT2:%.*]] = call double @llvm.nearbyint.f64(double [[TMP5]])
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; SSE2-NEXT:    [[NEARBYINT3:%.*]] = call double @llvm.nearbyint.f64(double [[TMP6]])
+; SSE2-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[NEARBYINT0]], i32 0
+; SSE2-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[NEARBYINT1]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> poison, double [[NEARBYINT2]], i32 0
+; SSE2-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[NEARBYINT3]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @nearbyint_4f64(
@@ -446,30 +498,38 @@
 
 define void @nearbyint_8f64() #0 {
 ; SSE2-LABEL: @nearbyint_8f64(
-; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; SSE2-NEXT:    [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; SSE2-NEXT:    [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; SSE2-NEXT:    [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; SSE2-NEXT:    [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; SSE2-NEXT:    [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; SSE2-NEXT:    [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; SSE2-NEXT:    [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[LD0]])
-; SSE2-NEXT:    [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[LD1]])
-; SSE2-NEXT:    [[NEARBYINT2:%.*]] = call double @llvm.nearbyint.f64(double [[LD2]])
-; SSE2-NEXT:    [[NEARBYINT3:%.*]] = call double @llvm.nearbyint.f64(double [[LD3]])
-; SSE2-NEXT:    [[NEARBYINT4:%.*]] = call double @llvm.nearbyint.f64(double [[LD4]])
-; SSE2-NEXT:    [[NEARBYINT5:%.*]] = call double @llvm.nearbyint.f64(double [[LD5]])
-; SSE2-NEXT:    [[NEARBYINT6:%.*]] = call double @llvm.nearbyint.f64(double [[LD6]])
-; SSE2-NEXT:    [[NEARBYINT7:%.*]] = call double @llvm.nearbyint.f64(double [[LD7]])
-; SSE2-NEXT:    store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
-; SSE2-NEXT:    store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE2-NEXT:    store double [[NEARBYINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
-; SSE2-NEXT:    store double [[NEARBYINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
-; SSE2-NEXT:    store double [[NEARBYINT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
-; SSE2-NEXT:    store double [[NEARBYINT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
-; SSE2-NEXT:    store double [[NEARBYINT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
-; SSE2-NEXT:    store double [[NEARBYINT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; SSE2-NEXT:    [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[TMP5]])
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; SSE2-NEXT:    [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[TMP6]])
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; SSE2-NEXT:    [[NEARBYINT2:%.*]] = call double @llvm.nearbyint.f64(double [[TMP7]])
+; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; SSE2-NEXT:    [[NEARBYINT3:%.*]] = call double @llvm.nearbyint.f64(double [[TMP8]])
+; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; SSE2-NEXT:    [[NEARBYINT4:%.*]] = call double @llvm.nearbyint.f64(double [[TMP9]])
+; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; SSE2-NEXT:    [[NEARBYINT5:%.*]] = call double @llvm.nearbyint.f64(double [[TMP10]])
+; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; SSE2-NEXT:    [[NEARBYINT6:%.*]] = call double @llvm.nearbyint.f64(double [[TMP11]])
+; SSE2-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; SSE2-NEXT:    [[NEARBYINT7:%.*]] = call double @llvm.nearbyint.f64(double [[TMP12]])
+; SSE2-NEXT:    [[TMP13:%.*]] = insertelement <2 x double> poison, double [[NEARBYINT0]], i32 0
+; SSE2-NEXT:    [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[NEARBYINT1]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP14]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP15:%.*]] = insertelement <2 x double> poison, double [[NEARBYINT2]], i32 0
+; SSE2-NEXT:    [[TMP16:%.*]] = insertelement <2 x double> [[TMP15]], double [[NEARBYINT3]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP16]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP17:%.*]] = insertelement <2 x double> poison, double [[NEARBYINT4]], i32 0
+; SSE2-NEXT:    [[TMP18:%.*]] = insertelement <2 x double> [[TMP17]], double [[NEARBYINT5]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP18]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP19:%.*]] = insertelement <2 x double> poison, double [[NEARBYINT6]], i32 0
+; SSE2-NEXT:    [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[NEARBYINT7]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP20]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @nearbyint_8f64(
@@ -510,6 +570,15 @@
 ; AVX512-NEXT:    [[TMP2:%.*]] = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> [[TMP1]])
 ; AVX512-NEXT:    store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
 ; AVX512-NEXT:    ret void
+;
+; AVX2-SKYLAKE-LABEL: @nearbyint_8f64(
+; AVX2-SKYLAKE-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX2-SKYLAKE-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX2-SKYLAKE-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]])
+; AVX2-SKYLAKE-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP2]])
+; AVX2-SKYLAKE-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; AVX2-SKYLAKE-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX2-SKYLAKE-NEXT:    ret void
 ;
   %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
   %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
@@ -540,12 +609,14 @@
 
 define void @rint_2f64() #0 {
 ; SSE2-LABEL: @rint_2f64(
-; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; SSE2-NEXT:    [[RINT0:%.*]] = call double @llvm.rint.f64(double [[LD0]])
-; SSE2-NEXT:    [[RINT1:%.*]] = call double @llvm.rint.f64(double [[LD1]])
-; SSE2-NEXT:    store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
-; SSE2-NEXT:    store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; SSE2-NEXT:    [[RINT0:%.*]] = call double @llvm.rint.f64(double [[TMP2]])
+; SSE2-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; SSE2-NEXT:    [[RINT1:%.*]] = call double @llvm.rint.f64(double [[TMP3]])
+; SSE2-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[RINT0]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[RINT1]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @rint_2f64(
@@ -571,18 +642,22 @@
 
 define void @rint_4f64() #0 {
 ; SSE2-LABEL: @rint_4f64(
-; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; SSE2-NEXT:    [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; SSE2-NEXT:    [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; SSE2-NEXT:    [[RINT0:%.*]] = call double @llvm.rint.f64(double [[LD0]])
-; SSE2-NEXT:    [[RINT1:%.*]] = call double @llvm.rint.f64(double [[LD1]])
-; SSE2-NEXT:    [[RINT2:%.*]] = call double @llvm.rint.f64(double [[LD2]])
-; SSE2-NEXT:    [[RINT3:%.*]] = call double @llvm.rint.f64(double [[LD3]])
-; SSE2-NEXT:    store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
-; SSE2-NEXT:    store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE2-NEXT:    store double [[RINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
-; SSE2-NEXT:    store double [[RINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; SSE2-NEXT:    [[RINT0:%.*]] = call double @llvm.rint.f64(double [[TMP3]])
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; SSE2-NEXT:    [[RINT1:%.*]] = call double @llvm.rint.f64(double [[TMP4]])
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; SSE2-NEXT:    [[RINT2:%.*]] = call double @llvm.rint.f64(double [[TMP5]])
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; SSE2-NEXT:    [[RINT3:%.*]] = call double @llvm.rint.f64(double [[TMP6]])
+; SSE2-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[RINT0]], i32 0
+; SSE2-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[RINT1]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> poison, double [[RINT2]], i32 0
+; SSE2-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[RINT3]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @rint_4f64(
@@ -617,30 +692,38 @@
 
 define void @rint_8f64() #0 {
 ; SSE2-LABEL: @rint_8f64(
-; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; SSE2-NEXT:    [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; SSE2-NEXT:    [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; SSE2-NEXT:    [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; SSE2-NEXT:    [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; SSE2-NEXT:    [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; SSE2-NEXT:    [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; SSE2-NEXT:    [[RINT0:%.*]] = call double @llvm.rint.f64(double [[LD0]])
-; SSE2-NEXT:    [[RINT1:%.*]] = call double @llvm.rint.f64(double [[LD1]])
-; SSE2-NEXT:    [[RINT2:%.*]] = call double @llvm.rint.f64(double [[LD2]])
-; SSE2-NEXT:    [[RINT3:%.*]] = call double @llvm.rint.f64(double [[LD3]])
-; SSE2-NEXT:    [[RINT4:%.*]] = call double @llvm.rint.f64(double [[LD4]])
-; SSE2-NEXT:    [[RINT5:%.*]] = call double @llvm.rint.f64(double [[LD5]])
-; SSE2-NEXT:    [[RINT6:%.*]] = call double @llvm.rint.f64(double [[LD6]])
-; SSE2-NEXT:    [[RINT7:%.*]] = call double @llvm.rint.f64(double [[LD7]])
-; SSE2-NEXT:    store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
-; SSE2-NEXT:    store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE2-NEXT:    store double [[RINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
-; SSE2-NEXT:    store double [[RINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
-; SSE2-NEXT:    store double [[RINT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
-; SSE2-NEXT:    store double [[RINT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
-; SSE2-NEXT:    store double [[RINT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
-; SSE2-NEXT:    store double [[RINT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; SSE2-NEXT:    [[RINT0:%.*]] = call double @llvm.rint.f64(double [[TMP5]])
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; SSE2-NEXT:    [[RINT1:%.*]] = call double @llvm.rint.f64(double [[TMP6]])
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; SSE2-NEXT:    [[RINT2:%.*]] = call double @llvm.rint.f64(double [[TMP7]])
+; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; SSE2-NEXT:    [[RINT3:%.*]] = call double @llvm.rint.f64(double [[TMP8]])
+; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; SSE2-NEXT:    [[RINT4:%.*]] = call double @llvm.rint.f64(double [[TMP9]])
+; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; SSE2-NEXT:    [[RINT5:%.*]] = call double @llvm.rint.f64(double [[TMP10]])
+; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; SSE2-NEXT:    [[RINT6:%.*]] = call double @llvm.rint.f64(double [[TMP11]])
+; SSE2-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; SSE2-NEXT:    [[RINT7:%.*]] = call double @llvm.rint.f64(double [[TMP12]])
+; SSE2-NEXT:    [[TMP13:%.*]] = insertelement <2 x double> poison, double [[RINT0]], i32 0
+; SSE2-NEXT:    [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[RINT1]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP14]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP15:%.*]] = insertelement <2 x double> poison, double [[RINT2]], i32 0
+; SSE2-NEXT:    [[TMP16:%.*]] = insertelement <2 x double> [[TMP15]], double [[RINT3]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP16]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP17:%.*]] = insertelement <2 x double> poison, double [[RINT4]], i32 0
+; SSE2-NEXT:    [[TMP18:%.*]] = insertelement <2 x double> [[TMP17]], double [[RINT5]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP18]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP19:%.*]] = insertelement <2 x double> poison, double [[RINT6]], i32 0
+; SSE2-NEXT:    [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[RINT7]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP20]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @rint_8f64(
@@ -681,6 +764,15 @@
 ; AVX512-NEXT:    [[TMP2:%.*]] = call <8 x double> @llvm.rint.v8f64(<8 x double> [[TMP1]])
 ; AVX512-NEXT:    store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
 ; AVX512-NEXT:    ret void
+;
+; AVX2-SKYLAKE-LABEL: @rint_8f64(
+; AVX2-SKYLAKE-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX2-SKYLAKE-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX2-SKYLAKE-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]])
+; AVX2-SKYLAKE-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP2]])
+; AVX2-SKYLAKE-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; AVX2-SKYLAKE-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX2-SKYLAKE-NEXT:    ret void
 ;
   %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
   %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
@@ -711,12 +803,14 @@
 
 define void @trunc_2f64() #0 {
 ; SSE2-LABEL: @trunc_2f64(
-; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; SSE2-NEXT:    [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[LD0]])
-; SSE2-NEXT:    [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[LD1]])
-; SSE2-NEXT:    store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
-; SSE2-NEXT:    store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; SSE2-NEXT:    [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[TMP2]])
+; SSE2-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; SSE2-NEXT:    [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[TMP3]])
+; SSE2-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TRUNC0]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[TRUNC1]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @trunc_2f64(
@@ -742,18 +836,22 @@
 
 define void @trunc_4f64() #0 {
 ; SSE2-LABEL: @trunc_4f64(
-; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; SSE2-NEXT:    [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; SSE2-NEXT:    [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; SSE2-NEXT:    [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[LD0]])
-; SSE2-NEXT:    [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[LD1]])
-; SSE2-NEXT:    [[TRUNC2:%.*]] = call double @llvm.trunc.f64(double [[LD2]])
-; SSE2-NEXT:    [[TRUNC3:%.*]] = call double @llvm.trunc.f64(double [[LD3]])
-; SSE2-NEXT:    store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
-; SSE2-NEXT:    store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE2-NEXT:    store double [[TRUNC2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
-; SSE2-NEXT:    store double [[TRUNC3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; SSE2-NEXT:    [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[TMP3]])
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; SSE2-NEXT:    [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[TMP4]])
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; SSE2-NEXT:    [[TRUNC2:%.*]] = call double @llvm.trunc.f64(double [[TMP5]])
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; SSE2-NEXT:    [[TRUNC3:%.*]] = call double @llvm.trunc.f64(double [[TMP6]])
+; SSE2-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TRUNC0]], i32 0
+; SSE2-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TRUNC1]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> poison, double [[TRUNC2]], i32 0
+; SSE2-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[TRUNC3]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @trunc_4f64(
@@ -788,30 +886,38 @@
 
 define void @trunc_8f64() #0 {
 ; SSE2-LABEL: @trunc_8f64(
-; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; SSE2-NEXT:    [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; SSE2-NEXT:    [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; SSE2-NEXT:    [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; SSE2-NEXT:    [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; SSE2-NEXT:    [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; SSE2-NEXT:    [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; SSE2-NEXT:    [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[LD0]])
-; SSE2-NEXT:    [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[LD1]])
-; SSE2-NEXT:    [[TRUNC2:%.*]] = call double @llvm.trunc.f64(double [[LD2]])
-; SSE2-NEXT:    [[TRUNC3:%.*]] = call double @llvm.trunc.f64(double [[LD3]])
-; SSE2-NEXT:    [[TRUNC4:%.*]] = call double @llvm.trunc.f64(double [[LD4]])
-; SSE2-NEXT:    [[TRUNC5:%.*]] = call double @llvm.trunc.f64(double [[LD5]])
-; SSE2-NEXT:    [[TRUNC6:%.*]] = call double @llvm.trunc.f64(double [[LD6]])
-; SSE2-NEXT:    [[TRUNC7:%.*]] = call double @llvm.trunc.f64(double [[LD7]])
-; SSE2-NEXT:    store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
-; SSE2-NEXT:    store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE2-NEXT:    store double [[TRUNC2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
-; SSE2-NEXT:    store double [[TRUNC3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
-; SSE2-NEXT:    store double [[TRUNC4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
-; SSE2-NEXT:    store double [[TRUNC5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
-; SSE2-NEXT:    store double [[TRUNC6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
-; SSE2-NEXT:    store double [[TRUNC7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; SSE2-NEXT:    [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[TMP5]])
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; SSE2-NEXT:    [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[TMP6]])
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; SSE2-NEXT:    [[TRUNC2:%.*]] = call double @llvm.trunc.f64(double [[TMP7]])
+; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; SSE2-NEXT:    [[TRUNC3:%.*]] = call double @llvm.trunc.f64(double [[TMP8]])
+; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; SSE2-NEXT:    [[TRUNC4:%.*]] = call double @llvm.trunc.f64(double [[TMP9]])
+; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; SSE2-NEXT:    [[TRUNC5:%.*]] = call double @llvm.trunc.f64(double [[TMP10]])
+; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; SSE2-NEXT:    [[TRUNC6:%.*]] = call double @llvm.trunc.f64(double [[TMP11]])
+; SSE2-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; SSE2-NEXT:    [[TRUNC7:%.*]] = call double @llvm.trunc.f64(double [[TMP12]])
+; SSE2-NEXT:    [[TMP13:%.*]] = insertelement <2 x double> poison, double [[TRUNC0]], i32 0
+; SSE2-NEXT:    [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[TRUNC1]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP14]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP15:%.*]] = insertelement <2 x double> poison, double [[TRUNC2]], i32 0
+; SSE2-NEXT:    [[TMP16:%.*]] = insertelement <2 x double> [[TMP15]], double [[TRUNC3]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP16]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP17:%.*]] = insertelement <2 x double> poison, double [[TRUNC4]], i32 0
+; SSE2-NEXT:    [[TMP18:%.*]] = insertelement <2 x double> [[TMP17]], double [[TRUNC5]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP18]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE2-NEXT:    [[TMP19:%.*]] = insertelement <2 x double> poison, double [[TRUNC6]], i32 0
+; SSE2-NEXT:    [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[TRUNC7]], i32 1
+; SSE2-NEXT:    store <2 x double> [[TMP20]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @trunc_8f64(
@@ -852,6 +958,15 @@
 ; AVX512-NEXT:    [[TMP2:%.*]] = call <8 x double> @llvm.trunc.v8f64(<8 x double> [[TMP1]])
 ; AVX512-NEXT:    store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
 ; AVX512-NEXT:    ret void
+;
+; AVX2-SKYLAKE-LABEL: @trunc_8f64(
+; AVX2-SKYLAKE-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX2-SKYLAKE-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX2-SKYLAKE-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]])
+; AVX2-SKYLAKE-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP2]])
+; AVX2-SKYLAKE-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; AVX2-SKYLAKE-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX2-SKYLAKE-NEXT:    ret void
 ;
   %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
   %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
@@ -882,18 +997,20 @@
 
 define void @ceil_4f32() #0 {
 ; SSE2-LABEL: @ceil_4f32(
-; SSE2-NEXT:    [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; SSE2-NEXT:    [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; SSE2-NEXT:    [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; SSE2-NEXT:    [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; SSE2-NEXT:    [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[LD0]])
-; SSE2-NEXT:    [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[LD1]])
-; SSE2-NEXT:    [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[LD2]])
-; SSE2-NEXT:    [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[LD3]])
-; SSE2-NEXT:    store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
-; SSE2-NEXT:    store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE2-NEXT:    store float [[CEIL2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
-; SSE2-NEXT:    store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; SSE2-NEXT:    [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[TMP2]])
+; SSE2-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; SSE2-NEXT:    [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[TMP3]])
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; SSE2-NEXT:    [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[TMP4]])
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; SSE2-NEXT:    [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[TMP5]])
+; SSE2-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> poison, float [[CEIL0]], i32 0
+; SSE2-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[CEIL1]], i32 1
+; SSE2-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[CEIL2]], i32 2
+; SSE2-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[CEIL3]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @ceil_4f32(
@@ -925,30 +1042,34 @@
 
 define void @ceil_8f32() #0 {
 ; SSE2-LABEL: @ceil_8f32(
-; SSE2-NEXT:    [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; SSE2-NEXT:    [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; SSE2-NEXT:    [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; SSE2-NEXT:    [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; SSE2-NEXT:    [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; SSE2-NEXT:    [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; SSE2-NEXT:    [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; SSE2-NEXT:    [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; SSE2-NEXT:    [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[LD0]])
-; SSE2-NEXT:    [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[LD1]])
-; SSE2-NEXT:    [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[LD2]])
-; SSE2-NEXT:    [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[LD3]])
-; SSE2-NEXT:    [[CEIL4:%.*]] = call float @llvm.ceil.f32(float [[LD4]])
-; SSE2-NEXT:    [[CEIL5:%.*]] = call float @llvm.ceil.f32(float [[LD5]])
-; SSE2-NEXT:    [[CEIL6:%.*]] = call float @llvm.ceil.f32(float [[LD6]])
-; SSE2-NEXT:    [[CEIL7:%.*]] = call float @llvm.ceil.f32(float [[LD7]])
-; SSE2-NEXT:    store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
-; SSE2-NEXT:    store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE2-NEXT:    store float [[CEIL2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
-; SSE2-NEXT:    store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE2-NEXT:    store float [[CEIL4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
-; SSE2-NEXT:    store float [[CEIL5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; SSE2-NEXT:    store float [[CEIL6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
-; SSE2-NEXT:    store float [[CEIL7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; SSE2-NEXT:    [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[TMP3]])
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; SSE2-NEXT:    [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[TMP4]])
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; SSE2-NEXT:    [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[TMP5]])
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; SSE2-NEXT:    [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[TMP6]])
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; SSE2-NEXT:    [[CEIL4:%.*]] = call float @llvm.ceil.f32(float [[TMP7]])
+; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; SSE2-NEXT:    [[CEIL5:%.*]] = call float @llvm.ceil.f32(float [[TMP8]])
+; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; SSE2-NEXT:    [[CEIL6:%.*]] = call float @llvm.ceil.f32(float [[TMP9]])
+; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; SSE2-NEXT:    [[CEIL7:%.*]] = call float @llvm.ceil.f32(float [[TMP10]])
+; SSE2-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> poison, float [[CEIL0]], i32 0
+; SSE2-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[CEIL1]], i32 1
+; SSE2-NEXT:    [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[CEIL2]], i32 2
+; SSE2-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[CEIL3]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP14]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP15:%.*]] = insertelement <4 x float> poison, float [[CEIL4]], i32 0
+; SSE2-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[CEIL5]], i32 1
+; SSE2-NEXT:    [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[CEIL6]], i32 2
+; SSE2-NEXT:    [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[CEIL7]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP18]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @ceil_8f32(
@@ -995,54 +1116,62 @@
 
 define void @ceil_16f32() #0 {
 ; SSE2-LABEL: @ceil_16f32(
-; SSE2-NEXT:    [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; SSE2-NEXT:    [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; SSE2-NEXT:    [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; SSE2-NEXT:    [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; SSE2-NEXT:    [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; SSE2-NEXT:    [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; SSE2-NEXT:    [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; SSE2-NEXT:    [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; SSE2-NEXT:    [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4
-; SSE2-NEXT:    [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4
-; SSE2-NEXT:    [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4
-; SSE2-NEXT:    [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4
-; SSE2-NEXT:    [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4
-; SSE2-NEXT:    [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4
-; SSE2-NEXT:    [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4
-; SSE2-NEXT:    [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4
-; SSE2-NEXT:    [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[LD0]])
-; SSE2-NEXT:    [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[LD1]])
-; SSE2-NEXT:    [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[LD2]])
-; SSE2-NEXT:    [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[LD3]])
-; SSE2-NEXT:    [[CEIL4:%.*]] = call float @llvm.ceil.f32(float [[LD4]])
-; SSE2-NEXT:    [[CEIL5:%.*]] = call float @llvm.ceil.f32(float [[LD5]])
-; SSE2-NEXT:    [[CEIL6:%.*]] = call float @llvm.ceil.f32(float [[LD6]])
-; SSE2-NEXT:    [[CEIL7:%.*]] = call float @llvm.ceil.f32(float [[LD7]])
-; SSE2-NEXT:    [[CEIL8:%.*]] = call float @llvm.ceil.f32(float [[LD8]])
-; SSE2-NEXT:    [[CEIL9:%.*]] = call float @llvm.ceil.f32(float [[LD9]])
-; SSE2-NEXT:    [[CEIL10:%.*]] = call float @llvm.ceil.f32(float [[LD10]])
-; SSE2-NEXT:    [[CEIL11:%.*]] = call float @llvm.ceil.f32(float [[LD11]])
-; SSE2-NEXT:    [[CEIL12:%.*]] = call float @llvm.ceil.f32(float [[LD12]])
-; SSE2-NEXT:    [[CEIL13:%.*]] = call float @llvm.ceil.f32(float [[LD13]])
-; SSE2-NEXT:    [[CEIL14:%.*]] = call float @llvm.ceil.f32(float [[LD14]])
-; SSE2-NEXT:    [[CEIL15:%.*]] = call float @llvm.ceil.f32(float [[LD15]])
-; SSE2-NEXT:    store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
-; SSE2-NEXT:    store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE2-NEXT:    store float [[CEIL2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
-; SSE2-NEXT:    store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE2-NEXT:    store float [[CEIL4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
-; SSE2-NEXT:    store float [[CEIL5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; SSE2-NEXT:    store float [[CEIL6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
-; SSE2-NEXT:    store float [[CEIL7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
-; SSE2-NEXT:    store float [[CEIL8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4
-; SSE2-NEXT:    store float [[CEIL9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
-; SSE2-NEXT:    store float [[CEIL10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
-; SSE2-NEXT:    store float [[CEIL11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
-; SSE2-NEXT:    store float [[CEIL12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
-; SSE2-NEXT:    store float [[CEIL13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
-; SSE2-NEXT:    store float [[CEIL14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
-; SSE2-NEXT:    store float [[CEIL15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; SSE2-NEXT:    [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[TMP5]])
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; SSE2-NEXT:    [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[TMP6]])
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; SSE2-NEXT:    [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[TMP7]])
+; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; SSE2-NEXT:    [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[TMP8]])
+; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; SSE2-NEXT:    [[CEIL4:%.*]] = call float @llvm.ceil.f32(float [[TMP9]])
+; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; SSE2-NEXT:    [[CEIL5:%.*]] = call float @llvm.ceil.f32(float [[TMP10]])
+; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; SSE2-NEXT:    [[CEIL6:%.*]] = call float @llvm.ceil.f32(float [[TMP11]])
+; SSE2-NEXT:    [[TMP12:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; SSE2-NEXT:    [[CEIL7:%.*]] = call float @llvm.ceil.f32(float [[TMP12]])
+; SSE2-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; SSE2-NEXT:    [[CEIL8:%.*]] = call float @llvm.ceil.f32(float [[TMP13]])
+; SSE2-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; SSE2-NEXT:    [[CEIL9:%.*]] = call float @llvm.ceil.f32(float [[TMP14]])
+; SSE2-NEXT:    [[TMP15:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; SSE2-NEXT:    [[CEIL10:%.*]] = call float @llvm.ceil.f32(float [[TMP15]])
+; SSE2-NEXT:    [[TMP16:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; SSE2-NEXT:    [[CEIL11:%.*]] = call float @llvm.ceil.f32(float [[TMP16]])
+; SSE2-NEXT:    [[TMP17:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
+; SSE2-NEXT:    [[CEIL12:%.*]] = call float @llvm.ceil.f32(float [[TMP17]])
+; SSE2-NEXT:    [[TMP18:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+; SSE2-NEXT:    [[CEIL13:%.*]] = call float @llvm.ceil.f32(float [[TMP18]])
+; SSE2-NEXT:    [[TMP19:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
+; SSE2-NEXT:    [[CEIL14:%.*]] = call float @llvm.ceil.f32(float [[TMP19]])
+; SSE2-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
+; SSE2-NEXT:    [[CEIL15:%.*]] = call float @llvm.ceil.f32(float [[TMP20]])
+; SSE2-NEXT:    [[TMP21:%.*]] = insertelement <4 x float> poison, float [[CEIL0]], i32 0
+; SSE2-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[CEIL1]], i32 1
+; SSE2-NEXT:    [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[CEIL2]], i32 2
+; SSE2-NEXT:    [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[CEIL3]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP24]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP25:%.*]] = insertelement <4 x float> poison, float [[CEIL4]], i32 0
+; SSE2-NEXT:    [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[CEIL5]], i32 1
+; SSE2-NEXT:    [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[CEIL6]], i32 2
+; SSE2-NEXT:    [[TMP28:%.*]] = insertelement <4 x float> [[TMP27]], float [[CEIL7]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP28]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP29:%.*]] = insertelement <4 x float> poison, float [[CEIL8]], i32 0
+; SSE2-NEXT:    [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[CEIL9]], i32 1
+; SSE2-NEXT:    [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[CEIL10]], i32 2
+; SSE2-NEXT:    [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[CEIL11]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP32]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP33:%.*]] = insertelement <4 x float> poison, float [[CEIL12]], i32 0
+; SSE2-NEXT:    [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[CEIL13]], i32 1
+; SSE2-NEXT:    [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[CEIL14]], i32 2
+; SSE2-NEXT:    [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[CEIL15]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP36]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @ceil_16f32(
@@ -1083,6 +1212,15 @@
 ; AVX512-NEXT:    [[TMP2:%.*]] = call <16 x float> @llvm.ceil.v16f32(<16 x float> [[TMP1]])
 ; AVX512-NEXT:    store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
 ; AVX512-NEXT:    ret void
+;
+; AVX2-SKYLAKE-LABEL: @ceil_16f32(
+; AVX2-SKYLAKE-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX2-SKYLAKE-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX2-SKYLAKE-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]])
+; AVX2-SKYLAKE-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP2]])
+; AVX2-SKYLAKE-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX2-SKYLAKE-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX2-SKYLAKE-NEXT:    ret void
 ;
   %ld0  = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4
   %ld1  = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4
@@ -1137,18 +1275,20 @@
 
 define void @floor_4f32() #0 {
 ; SSE2-LABEL: @floor_4f32(
-; SSE2-NEXT:    [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; SSE2-NEXT:    [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; SSE2-NEXT:    [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; SSE2-NEXT:    [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; SSE2-NEXT:    [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[LD0]])
-; SSE2-NEXT:    [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[LD1]])
-; SSE2-NEXT:    [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[LD2]])
-; SSE2-NEXT:    [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[LD3]])
-; SSE2-NEXT:    store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
-; SSE2-NEXT:    store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE2-NEXT:    store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
-; SSE2-NEXT:    store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; SSE2-NEXT:    [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[TMP2]])
+; SSE2-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; SSE2-NEXT:    [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[TMP3]])
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; SSE2-NEXT:    [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[TMP4]])
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; SSE2-NEXT:    [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[TMP5]])
+; SSE2-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> poison, float [[FLOOR0]], i32 0
+; SSE2-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[FLOOR1]], i32 1
+; SSE2-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[FLOOR2]], i32 2
+; SSE2-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[FLOOR3]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @floor_4f32(
@@ -1180,30 +1320,34 @@
 
 define void @floor_8f32() #0 {
 ; SSE2-LABEL: @floor_8f32(
-; SSE2-NEXT:    [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; SSE2-NEXT:    [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; SSE2-NEXT:    [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; SSE2-NEXT:    [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; SSE2-NEXT:    [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; SSE2-NEXT:    [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; SSE2-NEXT:    [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; SSE2-NEXT:    [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; SSE2-NEXT:    [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[LD0]])
-; SSE2-NEXT:    [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[LD1]])
-; SSE2-NEXT:    [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[LD2]])
-; SSE2-NEXT:    [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[LD3]])
-; SSE2-NEXT:    [[FLOOR4:%.*]] = call float @llvm.floor.f32(float [[LD4]])
-; SSE2-NEXT:    [[FLOOR5:%.*]] = call float @llvm.floor.f32(float [[LD5]])
-; SSE2-NEXT:    [[FLOOR6:%.*]] = call float @llvm.floor.f32(float [[LD6]])
-; SSE2-NEXT:    [[FLOOR7:%.*]] = call float @llvm.floor.f32(float [[LD7]])
-; SSE2-NEXT:    store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
-; SSE2-NEXT:    store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE2-NEXT:    store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
-; SSE2-NEXT:    store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE2-NEXT:    store float [[FLOOR4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
-; SSE2-NEXT:    store float [[FLOOR5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; SSE2-NEXT:    store float [[FLOOR6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
-; SSE2-NEXT:    store float [[FLOOR7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; SSE2-NEXT:    [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[TMP3]])
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; SSE2-NEXT:    [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[TMP4]])
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; SSE2-NEXT:    [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[TMP5]])
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; SSE2-NEXT:    [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[TMP6]])
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; SSE2-NEXT:    [[FLOOR4:%.*]] = call float @llvm.floor.f32(float [[TMP7]])
+; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; SSE2-NEXT:    [[FLOOR5:%.*]] = call float @llvm.floor.f32(float [[TMP8]])
+; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; SSE2-NEXT:    [[FLOOR6:%.*]] = call float @llvm.floor.f32(float [[TMP9]])
+; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; SSE2-NEXT:    [[FLOOR7:%.*]] = call float @llvm.floor.f32(float [[TMP10]])
+; SSE2-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> poison, float [[FLOOR0]], i32 0
+; SSE2-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[FLOOR1]], i32 1
+; SSE2-NEXT:    [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[FLOOR2]], i32 2
+; SSE2-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[FLOOR3]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP14]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP15:%.*]] = insertelement <4 x float> poison, float [[FLOOR4]], i32 0
+; SSE2-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[FLOOR5]], i32 1
+; SSE2-NEXT:    [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[FLOOR6]], i32 2
+; SSE2-NEXT:    [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[FLOOR7]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP18]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @floor_8f32(
@@ -1250,54 +1394,62 @@
 
 define void @floor_16f32() #0 {
 ; SSE2-LABEL: @floor_16f32(
-; SSE2-NEXT:    [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; SSE2-NEXT:    [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; SSE2-NEXT:    [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; SSE2-NEXT:    [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; SSE2-NEXT:    [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; SSE2-NEXT:    [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; SSE2-NEXT:    [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; SSE2-NEXT:    [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; SSE2-NEXT:    [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4
-; SSE2-NEXT:    [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4
-; SSE2-NEXT:    [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4
-; SSE2-NEXT:    [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4
-; SSE2-NEXT:    [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4
-; SSE2-NEXT:    [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4
-; SSE2-NEXT:    [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4
-; SSE2-NEXT:    [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4
-; SSE2-NEXT:    [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[LD0]])
-; SSE2-NEXT:    [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[LD1]])
-; SSE2-NEXT:    [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[LD2]])
-; SSE2-NEXT:    [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[LD3]])
-; SSE2-NEXT:    [[FLOOR4:%.*]] = call float @llvm.floor.f32(float [[LD4]])
-; SSE2-NEXT:    [[FLOOR5:%.*]] = call float @llvm.floor.f32(float [[LD5]])
-; SSE2-NEXT:    [[FLOOR6:%.*]] = call float @llvm.floor.f32(float [[LD6]])
-; SSE2-NEXT:    [[FLOOR7:%.*]] = call float @llvm.floor.f32(float [[LD7]])
-; SSE2-NEXT:    [[FLOOR8:%.*]] = call float @llvm.floor.f32(float [[LD8]])
-; SSE2-NEXT:    [[FLOOR9:%.*]] = call float @llvm.floor.f32(float [[LD9]])
-; SSE2-NEXT:    [[FLOOR10:%.*]] = call float @llvm.floor.f32(float [[LD10]])
-; SSE2-NEXT:    [[FLOOR11:%.*]] = call float @llvm.floor.f32(float [[LD11]])
-; SSE2-NEXT:    [[FLOOR12:%.*]] = call float @llvm.floor.f32(float [[LD12]])
-; SSE2-NEXT:    [[FLOOR13:%.*]] = call float @llvm.floor.f32(float [[LD13]])
-; SSE2-NEXT:    [[FLOOR14:%.*]] = call float @llvm.floor.f32(float [[LD14]])
-; SSE2-NEXT:    [[FLOOR15:%.*]] = call float @llvm.floor.f32(float [[LD15]])
-; SSE2-NEXT:    store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
-; SSE2-NEXT:    store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE2-NEXT:    store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
-; SSE2-NEXT:    store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE2-NEXT:    store float [[FLOOR4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
-; SSE2-NEXT:    store float [[FLOOR5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; SSE2-NEXT:    store float [[FLOOR6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
-; SSE2-NEXT:    store float [[FLOOR7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
-; SSE2-NEXT:    store float [[FLOOR8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4
-; SSE2-NEXT:    store float [[FLOOR9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
-; SSE2-NEXT:    store float [[FLOOR10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
-; SSE2-NEXT:    store float [[FLOOR11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
-; SSE2-NEXT:    store float [[FLOOR12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
-; SSE2-NEXT:    store float [[FLOOR13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
-; SSE2-NEXT:    store float [[FLOOR14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
-; SSE2-NEXT:    store float [[FLOOR15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; SSE2-NEXT:    [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[TMP5]])
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; SSE2-NEXT:    [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[TMP6]])
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; SSE2-NEXT:    [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[TMP7]])
+; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; SSE2-NEXT:    [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[TMP8]])
+; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; SSE2-NEXT:    [[FLOOR4:%.*]] = call float @llvm.floor.f32(float [[TMP9]])
+; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; SSE2-NEXT:    [[FLOOR5:%.*]] = call float @llvm.floor.f32(float [[TMP10]])
+; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; SSE2-NEXT:    [[FLOOR6:%.*]] = call float @llvm.floor.f32(float [[TMP11]])
+; SSE2-NEXT:    [[TMP12:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; SSE2-NEXT:    [[FLOOR7:%.*]] = call float @llvm.floor.f32(float [[TMP12]])
+; SSE2-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; SSE2-NEXT:    [[FLOOR8:%.*]] = call float @llvm.floor.f32(float [[TMP13]])
+; SSE2-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; SSE2-NEXT:    [[FLOOR9:%.*]] = call float @llvm.floor.f32(float [[TMP14]])
+; SSE2-NEXT:    [[TMP15:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; SSE2-NEXT:    [[FLOOR10:%.*]] = call float @llvm.floor.f32(float [[TMP15]])
+; SSE2-NEXT:    [[TMP16:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; SSE2-NEXT:    [[FLOOR11:%.*]] = call float @llvm.floor.f32(float [[TMP16]])
+; SSE2-NEXT:    [[TMP17:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
+; SSE2-NEXT:    [[FLOOR12:%.*]] = call float @llvm.floor.f32(float [[TMP17]])
+; SSE2-NEXT:    [[TMP18:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+; SSE2-NEXT:    [[FLOOR13:%.*]] = call float @llvm.floor.f32(float [[TMP18]])
+; SSE2-NEXT:    [[TMP19:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
+; SSE2-NEXT:    [[FLOOR14:%.*]] = call float @llvm.floor.f32(float [[TMP19]])
+; SSE2-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
+; SSE2-NEXT:    [[FLOOR15:%.*]] = call float @llvm.floor.f32(float [[TMP20]])
+; SSE2-NEXT:    [[TMP21:%.*]] = insertelement <4 x float> poison, float [[FLOOR0]], i32 0
+; SSE2-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[FLOOR1]], i32 1
+; SSE2-NEXT:    [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[FLOOR2]], i32 2
+; SSE2-NEXT:    [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[FLOOR3]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP24]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP25:%.*]] = insertelement <4 x float> poison, float [[FLOOR4]], i32 0
+; SSE2-NEXT:    [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[FLOOR5]], i32 1
+; SSE2-NEXT:    [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[FLOOR6]], i32 2
+; SSE2-NEXT:    [[TMP28:%.*]] = insertelement <4 x float> [[TMP27]], float [[FLOOR7]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP28]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP29:%.*]] = insertelement <4 x float> poison, float [[FLOOR8]], i32 0
+; SSE2-NEXT:    [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[FLOOR9]], i32 1
+; SSE2-NEXT:    [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[FLOOR10]], i32 2
+; SSE2-NEXT:    [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[FLOOR11]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP32]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP33:%.*]] = insertelement <4 x float> poison, float [[FLOOR12]], i32 0
+; SSE2-NEXT:    [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[FLOOR13]], i32 1
+; SSE2-NEXT:    [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[FLOOR14]], i32 2
+; SSE2-NEXT:    [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[FLOOR15]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP36]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @floor_16f32(
@@ -1338,6 +1490,15 @@
 ; AVX512-NEXT:    [[TMP2:%.*]] = call <16 x float> @llvm.floor.v16f32(<16 x float> [[TMP1]])
 ; AVX512-NEXT:    store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
 ; AVX512-NEXT:    ret void
+;
+; AVX2-SKYLAKE-LABEL: @floor_16f32(
+; AVX2-SKYLAKE-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX2-SKYLAKE-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX2-SKYLAKE-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]])
+; AVX2-SKYLAKE-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP2]])
+; AVX2-SKYLAKE-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX2-SKYLAKE-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX2-SKYLAKE-NEXT:    ret void
 ;
   %ld0  = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4
   %ld1  = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4
@@ -1392,18 +1553,20 @@
 
 define void @nearbyint_4f32() #0 {
 ; SSE2-LABEL: @nearbyint_4f32(
-; SSE2-NEXT:    [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; SSE2-NEXT:    [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; SSE2-NEXT:    [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; SSE2-NEXT:    [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; SSE2-NEXT:    [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[LD0]])
-; SSE2-NEXT:    [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[LD1]])
-; SSE2-NEXT:    [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[LD2]])
-; SSE2-NEXT:    [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[LD3]])
-; SSE2-NEXT:    store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
-; SSE2-NEXT:    store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE2-NEXT:    store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
-; SSE2-NEXT:    store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; SSE2-NEXT:    [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[TMP2]])
+; SSE2-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; SSE2-NEXT:    [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[TMP3]])
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; SSE2-NEXT:    [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[TMP4]])
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; SSE2-NEXT:    [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[TMP5]])
+; SSE2-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> poison, float [[NEARBYINT0]], i32 0
+; SSE2-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[NEARBYINT1]], i32 1
+; SSE2-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[NEARBYINT2]], i32 2
+; SSE2-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[NEARBYINT3]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @nearbyint_4f32(
@@ -1435,30 +1598,34 @@
 
 define void @nearbyint_8f32() #0 {
 ; SSE2-LABEL: @nearbyint_8f32(
-; SSE2-NEXT:    [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; SSE2-NEXT:    [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; SSE2-NEXT:    [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; SSE2-NEXT:    [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; SSE2-NEXT:    [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; SSE2-NEXT:    [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; SSE2-NEXT:    [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; SSE2-NEXT:    [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; SSE2-NEXT:    [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[LD0]])
-; SSE2-NEXT:    [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[LD1]])
-; SSE2-NEXT:    [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[LD2]])
-; SSE2-NEXT:    [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[LD3]])
-; SSE2-NEXT:    [[NEARBYINT4:%.*]] = call float @llvm.nearbyint.f32(float [[LD4]])
-; SSE2-NEXT:    [[NEARBYINT5:%.*]] = call float @llvm.nearbyint.f32(float [[LD5]])
-; SSE2-NEXT:    [[NEARBYINT6:%.*]] = call float @llvm.nearbyint.f32(float [[LD6]])
-; SSE2-NEXT:    [[NEARBYINT7:%.*]] = call float @llvm.nearbyint.f32(float [[LD7]])
-; SSE2-NEXT:    store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
-; SSE2-NEXT:    store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE2-NEXT:    store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
-; SSE2-NEXT:    store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE2-NEXT:    store float [[NEARBYINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
-; SSE2-NEXT:    store float [[NEARBYINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; SSE2-NEXT:    store float [[NEARBYINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
-; SSE2-NEXT:    store float [[NEARBYINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; SSE2-NEXT:    [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[TMP3]])
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; SSE2-NEXT:    [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[TMP4]])
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; SSE2-NEXT:    [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[TMP5]])
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; SSE2-NEXT:    [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[TMP6]])
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; SSE2-NEXT:    [[NEARBYINT4:%.*]] = call float @llvm.nearbyint.f32(float [[TMP7]])
+; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; SSE2-NEXT:    [[NEARBYINT5:%.*]] = call float @llvm.nearbyint.f32(float [[TMP8]])
+; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; SSE2-NEXT:    [[NEARBYINT6:%.*]] = call float @llvm.nearbyint.f32(float [[TMP9]])
+; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; SSE2-NEXT:    [[NEARBYINT7:%.*]] = call float @llvm.nearbyint.f32(float [[TMP10]])
+; SSE2-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> poison, float [[NEARBYINT0]], i32 0
+; SSE2-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[NEARBYINT1]], i32 1
+; SSE2-NEXT:    [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[NEARBYINT2]], i32 2
+; SSE2-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[NEARBYINT3]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP14]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP15:%.*]] = insertelement <4 x float> poison, float [[NEARBYINT4]], i32 0
+; SSE2-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[NEARBYINT5]], i32 1
+; SSE2-NEXT:    [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[NEARBYINT6]], i32 2
+; SSE2-NEXT:    [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[NEARBYINT7]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP18]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @nearbyint_8f32(
@@ -1505,54 +1672,62 @@
 
 define void @nearbyint_16f32() #0 {
 ; SSE2-LABEL: @nearbyint_16f32(
-; SSE2-NEXT:    [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; SSE2-NEXT:    [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; SSE2-NEXT:    [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; SSE2-NEXT:    [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; SSE2-NEXT:    [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; SSE2-NEXT:    [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; SSE2-NEXT:    [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; SSE2-NEXT:    [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; SSE2-NEXT:    [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4
-; SSE2-NEXT:    [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4
-; SSE2-NEXT:    [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4
-; SSE2-NEXT:    [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4
-; SSE2-NEXT:    [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4
-; SSE2-NEXT:    [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4
-; SSE2-NEXT:    [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4
-; SSE2-NEXT:    [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4
-; SSE2-NEXT:    [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[LD0]])
-; SSE2-NEXT:    [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[LD1]])
-; SSE2-NEXT:    [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[LD2]])
-; SSE2-NEXT:    [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[LD3]])
-; SSE2-NEXT:    [[NEARBYINT4:%.*]] = call float @llvm.nearbyint.f32(float [[LD4]])
-; SSE2-NEXT:    [[NEARBYINT5:%.*]] = call float @llvm.nearbyint.f32(float [[LD5]])
-; SSE2-NEXT:    [[NEARBYINT6:%.*]] = call float @llvm.nearbyint.f32(float [[LD6]])
-; SSE2-NEXT:    [[NEARBYINT7:%.*]] = call float @llvm.nearbyint.f32(float [[LD7]])
-; SSE2-NEXT:    [[NEARBYINT8:%.*]] = call float @llvm.nearbyint.f32(float [[LD8]])
-; SSE2-NEXT:    [[NEARBYINT9:%.*]] = call float @llvm.nearbyint.f32(float [[LD9]])
-; SSE2-NEXT:    [[NEARBYINT10:%.*]] = call float @llvm.nearbyint.f32(float [[LD10]])
-; SSE2-NEXT:    [[NEARBYINT11:%.*]] = call float @llvm.nearbyint.f32(float [[LD11]])
-; SSE2-NEXT:    [[NEARBYINT12:%.*]] = call float @llvm.nearbyint.f32(float [[LD12]])
-; SSE2-NEXT:    [[NEARBYINT13:%.*]] = call float @llvm.nearbyint.f32(float [[LD13]])
-; SSE2-NEXT:    [[NEARBYINT14:%.*]] = call float @llvm.nearbyint.f32(float [[LD14]])
-; SSE2-NEXT:    [[NEARBYINT15:%.*]] = call float @llvm.nearbyint.f32(float [[LD15]])
-; SSE2-NEXT:    store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
-; SSE2-NEXT:    store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE2-NEXT:    store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
-; SSE2-NEXT:    store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE2-NEXT:    store float [[NEARBYINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
-; SSE2-NEXT:    store float [[NEARBYINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; SSE2-NEXT:    store float [[NEARBYINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
-; SSE2-NEXT:    store float [[NEARBYINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
-; SSE2-NEXT:    store float [[NEARBYINT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4
-; SSE2-NEXT:    store float [[NEARBYINT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
-; SSE2-NEXT:    store float [[NEARBYINT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
-; SSE2-NEXT:    store float [[NEARBYINT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
-; SSE2-NEXT:    store float [[NEARBYINT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
-; SSE2-NEXT:    store float [[NEARBYINT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
-; SSE2-NEXT:    store float [[NEARBYINT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
-; SSE2-NEXT:    store float [[NEARBYINT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; SSE2-NEXT:    [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[TMP5]])
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; SSE2-NEXT:    [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[TMP6]])
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; SSE2-NEXT:    [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[TMP7]])
+; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; SSE2-NEXT:    [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[TMP8]])
+; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; SSE2-NEXT:    [[NEARBYINT4:%.*]] = call float @llvm.nearbyint.f32(float [[TMP9]])
+; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; SSE2-NEXT:    [[NEARBYINT5:%.*]] = call float @llvm.nearbyint.f32(float [[TMP10]])
+; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; SSE2-NEXT:    [[NEARBYINT6:%.*]] = call float @llvm.nearbyint.f32(float [[TMP11]])
+; SSE2-NEXT:    [[TMP12:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; SSE2-NEXT:    [[NEARBYINT7:%.*]] = call float @llvm.nearbyint.f32(float [[TMP12]])
+; SSE2-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; SSE2-NEXT:    [[NEARBYINT8:%.*]] = call float @llvm.nearbyint.f32(float [[TMP13]])
+; SSE2-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; SSE2-NEXT:    [[NEARBYINT9:%.*]] = call float @llvm.nearbyint.f32(float [[TMP14]])
+; SSE2-NEXT:    [[TMP15:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; SSE2-NEXT:    [[NEARBYINT10:%.*]] = call float @llvm.nearbyint.f32(float [[TMP15]])
+; SSE2-NEXT:    [[TMP16:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; SSE2-NEXT:    [[NEARBYINT11:%.*]] = call float @llvm.nearbyint.f32(float [[TMP16]])
+; SSE2-NEXT:    [[TMP17:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
+; SSE2-NEXT:    [[NEARBYINT12:%.*]] = call float @llvm.nearbyint.f32(float [[TMP17]])
+; SSE2-NEXT:    [[TMP18:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+; SSE2-NEXT:    [[NEARBYINT13:%.*]] = call float @llvm.nearbyint.f32(float [[TMP18]])
+; SSE2-NEXT:    [[TMP19:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
+; SSE2-NEXT:    [[NEARBYINT14:%.*]] = call float @llvm.nearbyint.f32(float [[TMP19]])
+; SSE2-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
+; SSE2-NEXT:    [[NEARBYINT15:%.*]] = call float @llvm.nearbyint.f32(float [[TMP20]])
+; SSE2-NEXT:    [[TMP21:%.*]] = insertelement <4 x float> poison, float [[NEARBYINT0]], i32 0
+; SSE2-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[NEARBYINT1]], i32 1
+; SSE2-NEXT:    [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[NEARBYINT2]], i32 2
+; SSE2-NEXT:    [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[NEARBYINT3]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP24]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP25:%.*]] = insertelement <4 x float> poison, float [[NEARBYINT4]], i32 0
+; SSE2-NEXT:    [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[NEARBYINT5]], i32 1
+; SSE2-NEXT:    [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[NEARBYINT6]], i32 2
+; SSE2-NEXT:    [[TMP28:%.*]] = insertelement <4 x float> [[TMP27]], float [[NEARBYINT7]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP28]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP29:%.*]] = insertelement <4 x float> poison, float [[NEARBYINT8]], i32 0
+; SSE2-NEXT:    [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[NEARBYINT9]], i32 1
+; SSE2-NEXT:    [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[NEARBYINT10]], i32 2
+; SSE2-NEXT:    [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[NEARBYINT11]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP32]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP33:%.*]] = insertelement <4 x float> poison, float [[NEARBYINT12]], i32 0
+; SSE2-NEXT:    [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[NEARBYINT13]], i32 1
+; SSE2-NEXT:    [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[NEARBYINT14]], i32 2
+; SSE2-NEXT:    [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[NEARBYINT15]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP36]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @nearbyint_16f32(
@@ -1593,6 +1768,15 @@
 ; AVX512-NEXT:    [[TMP2:%.*]] = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> [[TMP1]])
 ; AVX512-NEXT:    store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
 ; AVX512-NEXT:    ret void
+;
+; AVX2-SKYLAKE-LABEL: @nearbyint_16f32(
+; AVX2-SKYLAKE-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX2-SKYLAKE-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX2-SKYLAKE-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]])
+; AVX2-SKYLAKE-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP2]])
+; AVX2-SKYLAKE-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX2-SKYLAKE-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX2-SKYLAKE-NEXT:    ret void
 ;
   %ld0  = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4
   %ld1  = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4
@@ -1647,18 +1831,20 @@
 
 define void @rint_4f32() #0 {
 ; SSE2-LABEL: @rint_4f32(
-; SSE2-NEXT:    [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; SSE2-NEXT:    [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; SSE2-NEXT:    [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; SSE2-NEXT:    [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; SSE2-NEXT:    [[RINT0:%.*]] = call float @llvm.rint.f32(float [[LD0]])
-; SSE2-NEXT:    [[RINT1:%.*]] = call float @llvm.rint.f32(float [[LD1]])
-; SSE2-NEXT:    [[RINT2:%.*]] = call float @llvm.rint.f32(float [[LD2]])
-; SSE2-NEXT:    [[RINT3:%.*]] = call float @llvm.rint.f32(float [[LD3]])
-; SSE2-NEXT:    store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
-; SSE2-NEXT:    store float [[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE2-NEXT:    store float [[RINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
-; SSE2-NEXT:    store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; SSE2-NEXT:    [[RINT0:%.*]] = call float @llvm.rint.f32(float [[TMP2]])
+; SSE2-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; SSE2-NEXT:    [[RINT1:%.*]] = call float @llvm.rint.f32(float [[TMP3]])
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; SSE2-NEXT:    [[RINT2:%.*]] = call float @llvm.rint.f32(float [[TMP4]])
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; SSE2-NEXT:    [[RINT3:%.*]] = call float @llvm.rint.f32(float [[TMP5]])
+; SSE2-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> poison, float [[RINT0]], i32 0
+; SSE2-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[RINT1]], i32 1
+; SSE2-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[RINT2]], i32 2
+; SSE2-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[RINT3]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @rint_4f32(
@@ -1690,30 +1876,34 @@
 
 define void @rint_8f32() #0 {
 ; SSE2-LABEL: @rint_8f32(
-; SSE2-NEXT:    [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; SSE2-NEXT:    [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; SSE2-NEXT:    [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; SSE2-NEXT:    [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; SSE2-NEXT:    [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; SSE2-NEXT:    [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; SSE2-NEXT:    [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; SSE2-NEXT:    [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; SSE2-NEXT:    [[RINT0:%.*]] = call float @llvm.rint.f32(float [[LD0]])
-; SSE2-NEXT:    [[RINT1:%.*]] = call float @llvm.rint.f32(float [[LD1]])
-; SSE2-NEXT:    [[RINT2:%.*]] = call float @llvm.rint.f32(float [[LD2]])
-; SSE2-NEXT:    [[RINT3:%.*]] = call float @llvm.rint.f32(float [[LD3]])
-; SSE2-NEXT:    [[RINT4:%.*]] = call float @llvm.rint.f32(float [[LD4]])
-; SSE2-NEXT:    [[RINT5:%.*]] = call float @llvm.rint.f32(float [[LD5]])
-; SSE2-NEXT:    [[RINT6:%.*]] = call float @llvm.rint.f32(float [[LD6]])
-; SSE2-NEXT:    [[RINT7:%.*]] = call float @llvm.rint.f32(float [[LD7]])
-; SSE2-NEXT:    store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
-; SSE2-NEXT:    store float [[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE2-NEXT:    store float [[RINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
-; SSE2-NEXT:    store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE2-NEXT:    store float [[RINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
-; SSE2-NEXT:    store float [[RINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; SSE2-NEXT:    store float [[RINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
-; SSE2-NEXT:    store float [[RINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; SSE2-NEXT:    [[RINT0:%.*]] = call float @llvm.rint.f32(float [[TMP3]])
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; SSE2-NEXT:    [[RINT1:%.*]] = call float @llvm.rint.f32(float [[TMP4]])
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; SSE2-NEXT:    [[RINT2:%.*]] = call float @llvm.rint.f32(float [[TMP5]])
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; SSE2-NEXT:    [[RINT3:%.*]] = call float @llvm.rint.f32(float [[TMP6]])
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; SSE2-NEXT:    [[RINT4:%.*]] = call float @llvm.rint.f32(float [[TMP7]])
+; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; SSE2-NEXT:    [[RINT5:%.*]] = call float @llvm.rint.f32(float [[TMP8]])
+; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; SSE2-NEXT:    [[RINT6:%.*]] = call float @llvm.rint.f32(float [[TMP9]])
+; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; SSE2-NEXT:    [[RINT7:%.*]] = call float @llvm.rint.f32(float [[TMP10]])
+; SSE2-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> poison, float [[RINT0]], i32 0
+; SSE2-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[RINT1]], i32 1
+; SSE2-NEXT:    [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[RINT2]], i32 2
+; SSE2-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[RINT3]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP14]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP15:%.*]] = insertelement <4 x float> poison, float [[RINT4]], i32 0
+; SSE2-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[RINT5]], i32 1
+; SSE2-NEXT:    [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[RINT6]], i32 2
+; SSE2-NEXT:    [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[RINT7]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP18]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @rint_8f32(
@@ -1760,54 +1950,62 @@
 
 define void @rint_16f32() #0 {
 ; SSE2-LABEL: @rint_16f32(
-; SSE2-NEXT:    [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; SSE2-NEXT:    [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; SSE2-NEXT:    [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; SSE2-NEXT:    [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; SSE2-NEXT:    [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; SSE2-NEXT:    [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; SSE2-NEXT:    [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; SSE2-NEXT:    [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; SSE2-NEXT:    [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4
-; SSE2-NEXT:    [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4
-; SSE2-NEXT:    [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4
-; SSE2-NEXT:    [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4
-; SSE2-NEXT:    [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4
-; SSE2-NEXT:    [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4
-; SSE2-NEXT:    [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4
-; SSE2-NEXT:    [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4
-; SSE2-NEXT:    [[RINT0:%.*]] = call float @llvm.rint.f32(float [[LD0]])
-; SSE2-NEXT:    [[RINT1:%.*]] = call float @llvm.rint.f32(float [[LD1]])
-; SSE2-NEXT:    [[RINT2:%.*]] = call float @llvm.rint.f32(float [[LD2]])
-; SSE2-NEXT:    [[RINT3:%.*]] = call float @llvm.rint.f32(float [[LD3]])
-; SSE2-NEXT:    [[RINT4:%.*]] = call float @llvm.rint.f32(float [[LD4]])
-; SSE2-NEXT:    [[RINT5:%.*]] = call float @llvm.rint.f32(float [[LD5]])
-; SSE2-NEXT:    [[RINT6:%.*]] = call float @llvm.rint.f32(float [[LD6]])
-; SSE2-NEXT:    [[RINT7:%.*]] = call float @llvm.rint.f32(float [[LD7]])
-; SSE2-NEXT:    [[RINT8:%.*]] = call float @llvm.rint.f32(float [[LD8]])
-; SSE2-NEXT:    [[RINT9:%.*]] = call float @llvm.rint.f32(float [[LD9]])
-; SSE2-NEXT:    [[RINT10:%.*]] = call float @llvm.rint.f32(float [[LD10]])
-; SSE2-NEXT:    [[RINT11:%.*]] = call float @llvm.rint.f32(float [[LD11]])
-; SSE2-NEXT:    [[RINT12:%.*]] = call float @llvm.rint.f32(float [[LD12]])
-; SSE2-NEXT:    [[RINT13:%.*]] = call float @llvm.rint.f32(float [[LD13]])
-; SSE2-NEXT:    [[RINT14:%.*]] = call float @llvm.rint.f32(float [[LD14]])
-; SSE2-NEXT:    [[RINT15:%.*]] = call float @llvm.rint.f32(float [[LD15]])
-; SSE2-NEXT:    store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
-; SSE2-NEXT:    store float [[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE2-NEXT:    store float [[RINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
-; SSE2-NEXT:    store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE2-NEXT:    store float [[RINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
-; SSE2-NEXT:    store float [[RINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; SSE2-NEXT:    store float [[RINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
-; SSE2-NEXT:    store float [[RINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
-; SSE2-NEXT:    store float [[RINT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4
-; SSE2-NEXT:    store float [[RINT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
-; SSE2-NEXT:    store float [[RINT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
-; SSE2-NEXT:    store float [[RINT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
-; SSE2-NEXT:    store float [[RINT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
-; SSE2-NEXT:    store float [[RINT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
-; SSE2-NEXT:    store float [[RINT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
-; SSE2-NEXT:    store float [[RINT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; SSE2-NEXT:    [[RINT0:%.*]] = call float @llvm.rint.f32(float [[TMP5]])
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; SSE2-NEXT:    [[RINT1:%.*]] = call float @llvm.rint.f32(float [[TMP6]])
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; SSE2-NEXT:    [[RINT2:%.*]] = call float @llvm.rint.f32(float [[TMP7]])
+; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; SSE2-NEXT:    [[RINT3:%.*]] = call float @llvm.rint.f32(float [[TMP8]])
+; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; SSE2-NEXT:    [[RINT4:%.*]] = call float @llvm.rint.f32(float [[TMP9]])
+; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; SSE2-NEXT:    [[RINT5:%.*]] = call float @llvm.rint.f32(float [[TMP10]])
+; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; SSE2-NEXT:    [[RINT6:%.*]] = call float @llvm.rint.f32(float [[TMP11]])
+; SSE2-NEXT:    [[TMP12:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; SSE2-NEXT:    [[RINT7:%.*]] = call float @llvm.rint.f32(float [[TMP12]])
+; SSE2-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; SSE2-NEXT:    [[RINT8:%.*]] = call float @llvm.rint.f32(float [[TMP13]])
+; SSE2-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; SSE2-NEXT:    [[RINT9:%.*]] = call float @llvm.rint.f32(float [[TMP14]])
+; SSE2-NEXT:    [[TMP15:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; SSE2-NEXT:    [[RINT10:%.*]] = call float @llvm.rint.f32(float [[TMP15]])
+; SSE2-NEXT:    [[TMP16:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; SSE2-NEXT:    [[RINT11:%.*]] = call float @llvm.rint.f32(float [[TMP16]])
+; SSE2-NEXT:    [[TMP17:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
+; SSE2-NEXT:    [[RINT12:%.*]] = call float @llvm.rint.f32(float [[TMP17]])
+; SSE2-NEXT:    [[TMP18:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+; SSE2-NEXT:    [[RINT13:%.*]] = call float @llvm.rint.f32(float [[TMP18]])
+; SSE2-NEXT:    [[TMP19:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
+; SSE2-NEXT:    [[RINT14:%.*]] = call float @llvm.rint.f32(float [[TMP19]])
+; SSE2-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
+; SSE2-NEXT:    [[RINT15:%.*]] = call float @llvm.rint.f32(float [[TMP20]])
+; SSE2-NEXT:    [[TMP21:%.*]] = insertelement <4 x float> poison, float [[RINT0]], i32 0
+; SSE2-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[RINT1]], i32 1
+; SSE2-NEXT:    [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[RINT2]], i32 2
+; SSE2-NEXT:    [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[RINT3]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP24]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP25:%.*]] = insertelement <4 x float> poison, float [[RINT4]], i32 0
+; SSE2-NEXT:    [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[RINT5]], i32 1
+; SSE2-NEXT:    [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[RINT6]], i32 2
+; SSE2-NEXT:    [[TMP28:%.*]] = insertelement <4 x float> [[TMP27]], float [[RINT7]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP28]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP29:%.*]] = insertelement <4 x float> poison, float [[RINT8]], i32 0
+; SSE2-NEXT:    [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[RINT9]], i32 1
+; SSE2-NEXT:    [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[RINT10]], i32 2
+; SSE2-NEXT:    [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[RINT11]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP32]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP33:%.*]] = insertelement <4 x float> poison, float [[RINT12]], i32 0
+; SSE2-NEXT:    [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[RINT13]], i32 1
+; SSE2-NEXT:    [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[RINT14]], i32 2
+; SSE2-NEXT:    [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[RINT15]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP36]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @rint_16f32(
@@ -1848,6 +2046,15 @@
 ; AVX512-NEXT:    [[TMP2:%.*]] = call <16 x float> @llvm.rint.v16f32(<16 x float> [[TMP1]])
 ; AVX512-NEXT:    store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
 ; AVX512-NEXT:    ret void
+;
+; AVX2-SKYLAKE-LABEL: @rint_16f32(
+; AVX2-SKYLAKE-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX2-SKYLAKE-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX2-SKYLAKE-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]])
+; AVX2-SKYLAKE-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP2]])
+; AVX2-SKYLAKE-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX2-SKYLAKE-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX2-SKYLAKE-NEXT:    ret void
 ;
   %ld0  = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4
   %ld1  = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4
@@ -1902,18 +2109,20 @@
 
 define void @trunc_4f32() #0 {
 ; SSE2-LABEL: @trunc_4f32(
-; SSE2-NEXT:    [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; SSE2-NEXT:    [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; SSE2-NEXT:    [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; SSE2-NEXT:    [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; SSE2-NEXT:    [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[LD0]])
-; SSE2-NEXT:    [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[LD1]])
-; SSE2-NEXT:    [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[LD2]])
-; SSE2-NEXT:    [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[LD3]])
-; SSE2-NEXT:    store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
-; SSE2-NEXT:    store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE2-NEXT:    store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
-; SSE2-NEXT:    store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; SSE2-NEXT:    [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[TMP2]])
+; SSE2-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; SSE2-NEXT:    [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[TMP3]])
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; SSE2-NEXT:    [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[TMP4]])
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; SSE2-NEXT:    [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[TMP5]])
+; SSE2-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> poison, float [[TRUNC0]], i32 0
+; SSE2-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TRUNC1]], i32 1
+; SSE2-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[TRUNC2]], i32 2
+; SSE2-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TRUNC3]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @trunc_4f32(
@@ -1945,30 +2154,34 @@
 
 define void @trunc_8f32() #0 {
 ; SSE2-LABEL: @trunc_8f32(
-; SSE2-NEXT:    [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; SSE2-NEXT:    [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; SSE2-NEXT:    [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; SSE2-NEXT:    [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; SSE2-NEXT:    [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; SSE2-NEXT:    [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; SSE2-NEXT:    [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; SSE2-NEXT:    [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; SSE2-NEXT:    [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[LD0]])
-; SSE2-NEXT:    [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[LD1]])
-; SSE2-NEXT:    [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[LD2]])
-; SSE2-NEXT:    [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[LD3]])
-; SSE2-NEXT:    [[TRUNC4:%.*]] = call float @llvm.trunc.f32(float [[LD4]])
-; SSE2-NEXT:    [[TRUNC5:%.*]] = call float @llvm.trunc.f32(float [[LD5]])
-; SSE2-NEXT:    [[TRUNC6:%.*]] = call float @llvm.trunc.f32(float [[LD6]])
-; SSE2-NEXT:    [[TRUNC7:%.*]] = call float @llvm.trunc.f32(float [[LD7]])
-; SSE2-NEXT:    store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
-; SSE2-NEXT:    store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE2-NEXT:    store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
-; SSE2-NEXT:    store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE2-NEXT:    store float [[TRUNC4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
-; SSE2-NEXT:    store float [[TRUNC5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; SSE2-NEXT:    store float [[TRUNC6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
-; SSE2-NEXT:    store float [[TRUNC7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; SSE2-NEXT:    [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[TMP3]])
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; SSE2-NEXT:    [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[TMP4]])
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; SSE2-NEXT:    [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[TMP5]])
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; SSE2-NEXT:    [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[TMP6]])
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; SSE2-NEXT:    [[TRUNC4:%.*]] = call float @llvm.trunc.f32(float [[TMP7]])
+; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; SSE2-NEXT:    [[TRUNC5:%.*]] = call float @llvm.trunc.f32(float [[TMP8]])
+; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; SSE2-NEXT:    [[TRUNC6:%.*]] = call float @llvm.trunc.f32(float [[TMP9]])
+; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; SSE2-NEXT:    [[TRUNC7:%.*]] = call float @llvm.trunc.f32(float [[TMP10]])
+; SSE2-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> poison, float [[TRUNC0]], i32 0
+; SSE2-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[TRUNC1]], i32 1
+; SSE2-NEXT:    [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[TRUNC2]], i32 2
+; SSE2-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[TRUNC3]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP14]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP15:%.*]] = insertelement <4 x float> poison, float [[TRUNC4]], i32 0
+; SSE2-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TRUNC5]], i32 1
+; SSE2-NEXT:    [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[TRUNC6]], i32 2
+; SSE2-NEXT:    [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[TRUNC7]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP18]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @trunc_8f32(
@@ -2015,54 +2228,62 @@
 
 define void @trunc_16f32() #0 {
 ; SSE2-LABEL: @trunc_16f32(
-; SSE2-NEXT:    [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; SSE2-NEXT:    [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; SSE2-NEXT:    [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; SSE2-NEXT:    [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; SSE2-NEXT:    [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; SSE2-NEXT:    [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; SSE2-NEXT:    [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; SSE2-NEXT:    [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; SSE2-NEXT:    [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4
-; SSE2-NEXT:    [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4
-; SSE2-NEXT:    [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4
-; SSE2-NEXT:    [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4
-; SSE2-NEXT:    [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4
-; SSE2-NEXT:    [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4
-; SSE2-NEXT:    [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4
-; SSE2-NEXT:    [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4
-; SSE2-NEXT:    [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[LD0]])
-; SSE2-NEXT:    [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[LD1]])
-; SSE2-NEXT:    [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[LD2]])
-; SSE2-NEXT:    [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[LD3]])
-; SSE2-NEXT:    [[TRUNC4:%.*]] = call float @llvm.trunc.f32(float [[LD4]])
-; SSE2-NEXT:    [[TRUNC5:%.*]] = call float @llvm.trunc.f32(float [[LD5]])
-; SSE2-NEXT:    [[TRUNC6:%.*]] = call float @llvm.trunc.f32(float [[LD6]])
-; SSE2-NEXT:    [[TRUNC7:%.*]] = call float @llvm.trunc.f32(float [[LD7]])
-; SSE2-NEXT:    [[TRUNC8:%.*]] = call float @llvm.trunc.f32(float [[LD8]])
-; SSE2-NEXT:    [[TRUNC9:%.*]] = call float @llvm.trunc.f32(float [[LD9]])
-; SSE2-NEXT:    [[TRUNC10:%.*]] = call float @llvm.trunc.f32(float [[LD10]])
-; SSE2-NEXT:    [[TRUNC11:%.*]] = call float @llvm.trunc.f32(float [[LD11]])
-; SSE2-NEXT:    [[TRUNC12:%.*]] = call float @llvm.trunc.f32(float [[LD12]])
-; SSE2-NEXT:    [[TRUNC13:%.*]] = call float @llvm.trunc.f32(float [[LD13]])
-; SSE2-NEXT:    [[TRUNC14:%.*]] = call float @llvm.trunc.f32(float [[LD14]])
-; SSE2-NEXT:    [[TRUNC15:%.*]] = call float @llvm.trunc.f32(float [[LD15]])
-; SSE2-NEXT:    store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
-; SSE2-NEXT:    store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE2-NEXT:    store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
-; SSE2-NEXT:    store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE2-NEXT:    store float [[TRUNC4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
-; SSE2-NEXT:    store float [[TRUNC5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; SSE2-NEXT:    store float [[TRUNC6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
-; SSE2-NEXT:    store float [[TRUNC7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
-; SSE2-NEXT:    store float [[TRUNC8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4
-; SSE2-NEXT:    store float [[TRUNC9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
-; SSE2-NEXT:    store float [[TRUNC10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
-; SSE2-NEXT:    store float [[TRUNC11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
-; SSE2-NEXT:    store float [[TRUNC12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
-; SSE2-NEXT:    store float [[TRUNC13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
-; SSE2-NEXT:    store float [[TRUNC14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
-; SSE2-NEXT:    store float [[TRUNC15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; SSE2-NEXT:    [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[TMP5]])
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; SSE2-NEXT:    [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[TMP6]])
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; SSE2-NEXT:    [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[TMP7]])
+; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; SSE2-NEXT:    [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[TMP8]])
+; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; SSE2-NEXT:    [[TRUNC4:%.*]] = call float @llvm.trunc.f32(float [[TMP9]])
+; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; SSE2-NEXT:    [[TRUNC5:%.*]] = call float @llvm.trunc.f32(float [[TMP10]])
+; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; SSE2-NEXT:    [[TRUNC6:%.*]] = call float @llvm.trunc.f32(float [[TMP11]])
+; SSE2-NEXT:    [[TMP12:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; SSE2-NEXT:    [[TRUNC7:%.*]] = call float @llvm.trunc.f32(float [[TMP12]])
+; SSE2-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; SSE2-NEXT:    [[TRUNC8:%.*]] = call float @llvm.trunc.f32(float [[TMP13]])
+; SSE2-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; SSE2-NEXT:    [[TRUNC9:%.*]] = call float @llvm.trunc.f32(float [[TMP14]])
+; SSE2-NEXT:    [[TMP15:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; SSE2-NEXT:    [[TRUNC10:%.*]] = call float @llvm.trunc.f32(float [[TMP15]])
+; SSE2-NEXT:    [[TMP16:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; SSE2-NEXT:    [[TRUNC11:%.*]] = call float @llvm.trunc.f32(float [[TMP16]])
+; SSE2-NEXT:    [[TMP17:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
+; SSE2-NEXT:    [[TRUNC12:%.*]] = call float @llvm.trunc.f32(float [[TMP17]])
+; SSE2-NEXT:    [[TMP18:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+; SSE2-NEXT:    [[TRUNC13:%.*]] = call float @llvm.trunc.f32(float [[TMP18]])
+; SSE2-NEXT:    [[TMP19:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
+; SSE2-NEXT:    [[TRUNC14:%.*]] = call float @llvm.trunc.f32(float [[TMP19]])
+; SSE2-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
+; SSE2-NEXT:    [[TRUNC15:%.*]] = call float @llvm.trunc.f32(float [[TMP20]])
+; SSE2-NEXT:    [[TMP21:%.*]] = insertelement <4 x float> poison, float [[TRUNC0]], i32 0
+; SSE2-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TRUNC1]], i32 1
+; SSE2-NEXT:    [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TRUNC2]], i32 2
+; SSE2-NEXT:    [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TRUNC3]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP24]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP25:%.*]] = insertelement <4 x float> poison, float [[TRUNC4]], i32 0
+; SSE2-NEXT:    [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TRUNC5]], i32 1
+; SSE2-NEXT:    [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[TRUNC6]], i32 2
+; SSE2-NEXT:    [[TMP28:%.*]] = insertelement <4 x float> [[TMP27]], float [[TRUNC7]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP28]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP29:%.*]] = insertelement <4 x float> poison, float [[TRUNC8]], i32 0
+; SSE2-NEXT:    [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[TRUNC9]], i32 1
+; SSE2-NEXT:    [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[TRUNC10]], i32 2
+; SSE2-NEXT:    [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[TRUNC11]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP32]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE2-NEXT:    [[TMP33:%.*]] = insertelement <4 x float> poison, float [[TRUNC12]], i32 0
+; SSE2-NEXT:    [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[TRUNC13]], i32 1
+; SSE2-NEXT:    [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[TRUNC14]], i32 2
+; SSE2-NEXT:    [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[TRUNC15]], i32 3
+; SSE2-NEXT:    store <4 x float> [[TMP36]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
 ; SSE2-NEXT:    ret void
 ;
 ; SSE41-LABEL: @trunc_16f32(
@@ -2103,6 +2324,15 @@
 ; AVX512-NEXT:    [[TMP2:%.*]] = call <16 x float> @llvm.trunc.v16f32(<16 x float> [[TMP1]])
 ; AVX512-NEXT:    store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
 ; AVX512-NEXT:    ret void
+;
+; AVX2-SKYLAKE-LABEL: @trunc_16f32(
+; AVX2-SKYLAKE-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX2-SKYLAKE-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX2-SKYLAKE-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]])
+; AVX2-SKYLAKE-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP2]])
+; AVX2-SKYLAKE-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX2-SKYLAKE-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX2-SKYLAKE-NEXT:    ret void
 ;
   %ld0  = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4
   %ld1  = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4
Index: llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll
@@ -60,35 +60,34 @@
 define void @powof2div_nonuniform(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c){
 ; AVX1-LABEL: @powof2div_nonuniform(
 ; AVX1-NEXT:  entry:
-; AVX1-NEXT:    [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
-; AVX1-NEXT:    [[TMP1:%.*]] = load i32, i32* [[C:%.*]], align 4
-; AVX1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
-; AVX1-NEXT:    [[DIV:%.*]] = sdiv i32 [[ADD]], 2
-; AVX1-NEXT:    store i32 [[DIV]], i32* [[A:%.*]], align 4
-; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1
-; AVX1-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
-; AVX1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 1
-; AVX1-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4
-; AVX1-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP3]], [[TMP2]]
-; AVX1-NEXT:    [[DIV6:%.*]] = sdiv i32 [[ADD5]], 4
-; AVX1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
-; AVX1-NEXT:    store i32 [[DIV6]], i32* [[ARRAYIDX7]], align 4
+; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
+; AVX1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 1
 ; AVX1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
-; AVX1-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX8]], align 4
 ; AVX1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2
-; AVX1-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4
-; AVX1-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP5]], [[TMP4]]
-; AVX1-NEXT:    [[DIV11:%.*]] = sdiv i32 [[ADD10]], 8
-; AVX1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
-; AVX1-NEXT:    store i32 [[DIV11]], i32* [[ARRAYIDX12]], align 4
 ; AVX1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
-; AVX1-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4
+; AVX1-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; AVX1-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
 ; AVX1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
-; AVX1-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX14]], align 4
-; AVX1-NEXT:    [[ADD15:%.*]] = add nsw i32 [[TMP7]], [[TMP6]]
-; AVX1-NEXT:    [[DIV16:%.*]] = sdiv i32 [[ADD15]], 16
+; AVX1-NEXT:    [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>*
+; AVX1-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; AVX1-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]]
+; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; AVX1-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP5]], 2
+; AVX1-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
+; AVX1-NEXT:    [[DIV6:%.*]] = sdiv i32 [[TMP6]], 4
+; AVX1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1
+; AVX1-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; AVX1-NEXT:    [[DIV11:%.*]] = sdiv i32 [[TMP7]], 8
+; AVX1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; AVX1-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; AVX1-NEXT:    [[DIV16:%.*]] = sdiv i32 [[TMP8]], 16
 ; AVX1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
-; AVX1-NEXT:    store i32 [[DIV16]], i32* [[ARRAYIDX17]], align 4
+; AVX1-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> poison, i32 [[DIV]], i32 0
+; AVX1-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[DIV6]], i32 1
+; AVX1-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[DIV11]], i32 2
+; AVX1-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[DIV16]], i32 3
+; AVX1-NEXT:    [[TMP13:%.*]] = bitcast i32* [[A]] to <4 x i32>*
+; AVX1-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4
 ; AVX1-NEXT:    ret void
 ;
 ; AVX2-LABEL: @powof2div_nonuniform(
Index: llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -slp-vectorizer -mattr=+sse2 -S | FileCheck %s --check-prefix=SSE
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -slp-vectorizer -mattr=+avx  -S | FileCheck %s --check-prefix=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -slp-vectorizer -mattr=+avx2 -S | FileCheck %s --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -slp-vectorizer -mattr=+avx2 -S | FileCheck %s --check-prefix=AVX2
 
 %class.1 = type { %class.2 }
 %class.2 = type { %"class.3" }
@@ -47,6 +47,24 @@
 ; AVX-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8
 ; AVX-NEXT:    ret void
 ;
+; AVX2-LABEL: @_ZN1C10SwitchModeEv(
+; AVX2-NEXT:  for.body.lr.ph.i:
+; AVX2-NEXT:    [[OR_1:%.*]] = or i64 undef, 1
+; AVX2-NEXT:    store i64 [[OR_1]], i64* undef, align 8
+; AVX2-NEXT:    [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0
+; AVX2-NEXT:    [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1
+; AVX2-NEXT:    [[TMP0:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>*
+; AVX2-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
+; AVX2-NEXT:    [[BAR5:%.*]] = load i64, i64* undef, align 8
+; AVX2-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[OR_1]], i32 0
+; AVX2-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[BAR5]], i32 1
+; AVX2-NEXT:    [[TMP4:%.*]] = and <2 x i64> [[TMP3]], [[TMP1]]
+; AVX2-NEXT:    [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0
+; AVX2-NEXT:    [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1
+; AVX2-NEXT:    [[TMP5:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>*
+; AVX2-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8
+; AVX2-NEXT:    ret void
+;
 for.body.lr.ph.i:
   %or.1 = or i64 undef, 1
   store i64 %or.1, i64* undef, align 8
@@ -70,31 +88,28 @@
 ; SSE-NEXT:  entry:
 ; SSE-NEXT:    [[TMP0:%.*]] = load i64, i64* undef, align 1
 ; SSE-NEXT:    [[AND:%.*]] = shl i64 [[TMP0]], 2
-; SSE-NEXT:    [[SHL:%.*]] = and i64 [[AND]], 20
 ; SSE-NEXT:    [[ADD:%.*]] = add i64 undef, undef
 ; SSE-NEXT:    store i64 [[ADD]], i64* undef, align 1
 ; SSE-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5
 ; SSE-NEXT:    [[AND_1:%.*]] = shl i64 undef, 2
-; SSE-NEXT:    [[SHL_1:%.*]] = and i64 [[AND_1]], 20
-; SSE-NEXT:    [[SHR_1:%.*]] = lshr i64 undef, 6
-; SSE-NEXT:    [[ADD_1:%.*]] = add nuw nsw i64 [[SHL]], [[SHR_1]]
+; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[AND_1]], i32 0
+; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[AND]], i32 1
+; SSE-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
 ; SSE-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4
-; SSE-NEXT:    [[SHR_2:%.*]] = lshr i64 undef, 6
-; SSE-NEXT:    [[ADD_2:%.*]] = add nuw nsw i64 [[SHL_1]], [[SHR_2]]
-; SSE-NEXT:    [[AND_4:%.*]] = shl i64 [[ADD]], 2
-; SSE-NEXT:    [[SHL_4:%.*]] = and i64 [[AND_4]], 20
+; SSE-NEXT:    [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
 ; SSE-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1
-; SSE-NEXT:    store i64 [[ADD_1]], i64* [[ARRAYIDX2_5]], align 1
-; SSE-NEXT:    [[AND_5:%.*]] = shl nuw nsw i64 [[ADD_1]], 2
-; SSE-NEXT:    [[SHL_5:%.*]] = and i64 [[AND_5]], 20
-; SSE-NEXT:    [[SHR_5:%.*]] = lshr i64 [[ADD_1]], 6
-; SSE-NEXT:    [[ADD_5:%.*]] = add nuw nsw i64 [[SHL_4]], [[SHR_5]]
-; SSE-NEXT:    store i64 [[ADD_5]], i64* [[ARRAYIDX2_1]], align 1
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
+; SSE-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0
+; SSE-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1
+; SSE-NEXT:    [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
+; SSE-NEXT:    [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
 ; SSE-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0
-; SSE-NEXT:    store i64 [[ADD_2]], i64* [[ARRAYIDX2_6]], align 1
-; SSE-NEXT:    [[SHR_6:%.*]] = lshr i64 [[ADD_2]], 6
-; SSE-NEXT:    [[ADD_6:%.*]] = add nuw nsw i64 [[SHL_5]], [[SHR_6]]
-; SSE-NEXT:    store i64 [[ADD_6]], i64* [[ARRAYIDX2_2]], align 1
+; SSE-NEXT:    [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
+; SSE-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1
+; SSE-NEXT:    [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], <i64 6, i64 6>
+; SSE-NEXT:    [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP11]]
+; SSE-NEXT:    [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>*
+; SSE-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* [[TMP13]], align 1
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @pr35497(
@@ -123,6 +138,32 @@
 ; AVX-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* [[TMP13]], align 1
 ; AVX-NEXT:    ret void
 ;
+; AVX2-LABEL: @pr35497(
+; AVX2-NEXT:  entry:
+; AVX2-NEXT:    [[TMP0:%.*]] = load i64, i64* undef, align 1
+; AVX2-NEXT:    [[ADD:%.*]] = add i64 undef, undef
+; AVX2-NEXT:    store i64 [[ADD]], i64* undef, align 1
+; AVX2-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5
+; AVX2-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> <i64 undef, i64 poison>, i64 [[TMP0]], i32 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], <i64 2, i64 2>
+; AVX2-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
+; AVX2-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4
+; AVX2-NEXT:    [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
+; AVX2-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1
+; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
+; AVX2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0
+; AVX2-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1
+; AVX2-NEXT:    [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
+; AVX2-NEXT:    [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
+; AVX2-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0
+; AVX2-NEXT:    [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
+; AVX2-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1
+; AVX2-NEXT:    [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], <i64 6, i64 6>
+; AVX2-NEXT:    [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP11]]
+; AVX2-NEXT:    [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>*
+; AVX2-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* [[TMP13]], align 1
+; AVX2-NEXT:    ret void
+;
 entry:
   %0 = load i64, i64* undef, align 1
   %and = shl i64 %0, 2
Index: llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
@@ -1,14 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2   | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE42
 ; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx    | FileCheck %s --check-prefixes=CHECK,AVX1
 ; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2   | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX2
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512
 
 define void @store_i32(i32* nocapture %0, i32 %1, i32 %2) {
 ; CHECK-LABEL: @store_i32(
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, [[TBAA0:!tbaa !.*]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !tbaa [[TBAA0:![0-9]+]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i32> [[TMP5]], [[TMP7]]
@@ -16,7 +16,7 @@
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ult <4 x i32> [[TMP9]], <i32 255, i32 255, i32 255, i32 255>
 ; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP9]], <4 x i32> <i32 255, i32 255, i32 255, i32 255>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4, [[TBAA0]]
+; CHECK-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    ret void
 ;
   %4 = load i32, i32* %0, align 4, !tbaa !2
@@ -52,7 +52,7 @@
 define void @store_i8(i8* nocapture %0, i32 %1, i32 %2) {
 ; CHECK-LABEL: @store_i8(
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP0:%.*]] to <4 x i8>*
-; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1, [[TBAA4:!tbaa !.*]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1, !tbaa [[TBAA4:![0-9]+]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <4 x i32> zeroinitializer
@@ -62,7 +62,7 @@
 ; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP10]], <4 x i32> <i32 255, i32 255, i32 255, i32 255>
 ; CHECK-NEXT:    [[TMP13:%.*]] = trunc <4 x i32> [[TMP12]] to <4 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>*
-; CHECK-NEXT:    store <4 x i8> [[TMP13]], <4 x i8>* [[TMP14]], align 1, [[TBAA4]]
+; CHECK-NEXT:    store <4 x i8> [[TMP13]], <4 x i8>* [[TMP14]], align 1, !tbaa [[TBAA4]]
 ; CHECK-NEXT:    ret void
 ;
   %4 = load i8, i8* %0, align 1, !tbaa !6
@@ -106,86 +106,111 @@
 define void @store_i64(i64* nocapture %0, i32 %1, i32 %2) {
 ; SSE-LABEL: @store_i64(
 ; SSE-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
-; SSE-NEXT:    [[TMP5:%.*]] = load i64, i64* [[TMP0:%.*]], align 8, [[TBAA5:!tbaa !.*]]
+; SSE-NEXT:    [[TMP5:%.*]] = load i64, i64* [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
 ; SSE-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]]
 ; SSE-NEXT:    [[TMP7:%.*]] = lshr i64 [[TMP6]], 15
 ; SSE-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
 ; SSE-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255
 ; SSE-NEXT:    [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295
 ; SSE-NEXT:    [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255
-; SSE-NEXT:    store i64 [[TMP11]], i64* [[TMP0]], align 8, [[TBAA5]]
+; SSE-NEXT:    store i64 [[TMP11]], i64* [[TMP0]], align 8, !tbaa [[TBAA5]]
 ; SSE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 1
-; SSE-NEXT:    [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8, [[TBAA5]]
+; SSE-NEXT:    [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8, !tbaa [[TBAA5]]
 ; SSE-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]]
 ; SSE-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 15
 ; SSE-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
 ; SSE-NEXT:    [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255
 ; SSE-NEXT:    [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295
 ; SSE-NEXT:    [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255
-; SSE-NEXT:    store i64 [[TMP19]], i64* [[TMP12]], align 8, [[TBAA5]]
+; SSE-NEXT:    store i64 [[TMP19]], i64* [[TMP12]], align 8, !tbaa [[TBAA5]]
 ; SSE-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 2
-; SSE-NEXT:    [[TMP21:%.*]] = load i64, i64* [[TMP20]], align 8, [[TBAA5]]
+; SSE-NEXT:    [[TMP21:%.*]] = load i64, i64* [[TMP20]], align 8, !tbaa [[TBAA5]]
 ; SSE-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]]
 ; SSE-NEXT:    [[TMP23:%.*]] = lshr i64 [[TMP22]], 15
 ; SSE-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
 ; SSE-NEXT:    [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255
 ; SSE-NEXT:    [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295
 ; SSE-NEXT:    [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255
-; SSE-NEXT:    store i64 [[TMP27]], i64* [[TMP20]], align 8, [[TBAA5]]
+; SSE-NEXT:    store i64 [[TMP27]], i64* [[TMP20]], align 8, !tbaa [[TBAA5]]
 ; SSE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 3
-; SSE-NEXT:    [[TMP29:%.*]] = load i64, i64* [[TMP28]], align 8, [[TBAA5]]
+; SSE-NEXT:    [[TMP29:%.*]] = load i64, i64* [[TMP28]], align 8, !tbaa [[TBAA5]]
 ; SSE-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]]
 ; SSE-NEXT:    [[TMP31:%.*]] = lshr i64 [[TMP30]], 15
 ; SSE-NEXT:    [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32
 ; SSE-NEXT:    [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255
 ; SSE-NEXT:    [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295
 ; SSE-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255
-; SSE-NEXT:    store i64 [[TMP35]], i64* [[TMP28]], align 8, [[TBAA5]]
+; SSE-NEXT:    store i64 [[TMP35]], i64* [[TMP28]], align 8, !tbaa [[TBAA5]]
 ; SSE-NEXT:    ret void
 ;
+; SSE42-LABEL: @store_i64(
+; SSE42-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
+; SSE42-NEXT:    [[TMP5:%.*]] = load i64, i64* [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
+; SSE42-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+; SSE42-NEXT:    [[TMP7:%.*]] = lshr i64 [[TMP6]], 15
+; SSE42-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
+; SSE42-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255
+; SSE42-NEXT:    [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295
+; SSE42-NEXT:    [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255
+; SSE42-NEXT:    store i64 [[TMP11]], i64* [[TMP0]], align 8, !tbaa [[TBAA5]]
+; SSE42-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 1
+; SSE42-NEXT:    [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8, !tbaa [[TBAA5]]
+; SSE42-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]]
+; SSE42-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 15
+; SSE42-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
+; SSE42-NEXT:    [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255
+; SSE42-NEXT:    [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295
+; SSE42-NEXT:    [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255
+; SSE42-NEXT:    store i64 [[TMP19]], i64* [[TMP12]], align 8, !tbaa [[TBAA5]]
+; SSE42-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 2
+; SSE42-NEXT:    [[TMP21:%.*]] = load i64, i64* [[TMP20]], align 8, !tbaa [[TBAA5]]
+; SSE42-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]]
+; SSE42-NEXT:    [[TMP23:%.*]] = lshr i64 [[TMP22]], 15
+; SSE42-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
+; SSE42-NEXT:    [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255
+; SSE42-NEXT:    [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295
+; SSE42-NEXT:    [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255
+; SSE42-NEXT:    store i64 [[TMP27]], i64* [[TMP20]], align 8, !tbaa [[TBAA5]]
+; SSE42-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 3
+; SSE42-NEXT:    [[TMP29:%.*]] = load i64, i64* [[TMP28]], align 8, !tbaa [[TBAA5]]
+; SSE42-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]]
+; SSE42-NEXT:    [[TMP31:%.*]] = lshr i64 [[TMP30]], 15
+; SSE42-NEXT:    [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32
+; SSE42-NEXT:    [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255
+; SSE42-NEXT:    [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295
+; SSE42-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255
+; SSE42-NEXT:    store i64 [[TMP35]], i64* [[TMP28]], align 8, !tbaa [[TBAA5]]
+; SSE42-NEXT:    ret void
+;
 ; AVX1-LABEL: @store_i64(
 ; AVX1-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
-; AVX1-NEXT:    [[TMP5:%.*]] = load i64, i64* [[TMP0:%.*]], align 8, [[TBAA5:!tbaa !.*]]
-; AVX1-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]]
-; AVX1-NEXT:    [[TMP7:%.*]] = lshr i64 [[TMP6]], 15
-; AVX1-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
-; AVX1-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255
-; AVX1-NEXT:    [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295
-; AVX1-NEXT:    [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255
-; AVX1-NEXT:    store i64 [[TMP11]], i64* [[TMP0]], align 8, [[TBAA5]]
-; AVX1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 1
-; AVX1-NEXT:    [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8, [[TBAA5]]
-; AVX1-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]]
-; AVX1-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 15
-; AVX1-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
-; AVX1-NEXT:    [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255
-; AVX1-NEXT:    [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295
-; AVX1-NEXT:    [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255
-; AVX1-NEXT:    store i64 [[TMP19]], i64* [[TMP12]], align 8, [[TBAA5]]
-; AVX1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 2
-; AVX1-NEXT:    [[TMP21:%.*]] = load i64, i64* [[TMP20]], align 8, [[TBAA5]]
-; AVX1-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]]
-; AVX1-NEXT:    [[TMP23:%.*]] = lshr i64 [[TMP22]], 15
-; AVX1-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
-; AVX1-NEXT:    [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255
-; AVX1-NEXT:    [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295
-; AVX1-NEXT:    [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255
-; AVX1-NEXT:    store i64 [[TMP27]], i64* [[TMP20]], align 8, [[TBAA5]]
-; AVX1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 3
-; AVX1-NEXT:    [[TMP29:%.*]] = load i64, i64* [[TMP28]], align 8, [[TBAA5]]
-; AVX1-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]]
-; AVX1-NEXT:    [[TMP31:%.*]] = lshr i64 [[TMP30]], 15
-; AVX1-NEXT:    [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32
-; AVX1-NEXT:    [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255
-; AVX1-NEXT:    [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295
-; AVX1-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255
-; AVX1-NEXT:    store i64 [[TMP35]], i64* [[TMP28]], align 8, [[TBAA5]]
+; AVX1-NEXT:    [[TMP5:%.*]] = bitcast i64* [[TMP0:%.*]] to <4 x i64>*
+; AVX1-NEXT:    [[TMP6:%.*]] = load <4 x i64>, <4 x i64>* [[TMP5]], align 8, !tbaa [[TBAA5:![0-9]+]]
+; AVX1-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0
+; AVX1-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], [[TMP4]]
+; AVX1-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1
+; AVX1-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], [[TMP4]]
+; AVX1-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT:    [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0
+; AVX1-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX1-NEXT:    [[TMP14:%.*]] = mul <2 x i64> [[TMP11]], [[TMP13]]
+; AVX1-NEXT:    [[TMP15:%.*]] = shufflevector <2 x i64> [[TMP14]], <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX1-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP8]], i32 0
+; AVX1-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP10]], i32 1
+; AVX1-NEXT:    [[TMP18:%.*]] = shufflevector <4 x i64> [[TMP17]], <4 x i64> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; AVX1-NEXT:    [[TMP19:%.*]] = lshr <4 x i64> [[TMP18]], <i64 15, i64 15, i64 15, i64 15>
+; AVX1-NEXT:    [[TMP20:%.*]] = trunc <4 x i64> [[TMP19]] to <4 x i32>
+; AVX1-NEXT:    [[TMP21:%.*]] = icmp ult <4 x i32> [[TMP20]], <i32 255, i32 255, i32 255, i32 255>
+; AVX1-NEXT:    [[TMP22:%.*]] = and <4 x i64> [[TMP19]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+; AVX1-NEXT:    [[TMP23:%.*]] = select <4 x i1> [[TMP21]], <4 x i64> [[TMP22]], <4 x i64> <i64 255, i64 255, i64 255, i64 255>
+; AVX1-NEXT:    [[TMP24:%.*]] = bitcast i64* [[TMP0]] to <4 x i64>*
+; AVX1-NEXT:    store <4 x i64> [[TMP23]], <4 x i64>* [[TMP24]], align 8, !tbaa [[TBAA5]]
 ; AVX1-NEXT:    ret void
 ;
 ; AVX2-LABEL: @store_i64(
 ; AVX2-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
 ; AVX2-NEXT:    [[TMP5:%.*]] = bitcast i64* [[TMP0:%.*]] to <4 x i64>*
-; AVX2-NEXT:    [[TMP6:%.*]] = load <4 x i64>, <4 x i64>* [[TMP5]], align 8, [[TBAA5:!tbaa !.*]]
+; AVX2-NEXT:    [[TMP6:%.*]] = load <4 x i64>, <4 x i64>* [[TMP5]], align 8, !tbaa [[TBAA5:![0-9]+]]
 ; AVX2-NEXT:    [[TMP7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0
 ; AVX2-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> undef, <4 x i32> zeroinitializer
 ; AVX2-NEXT:    [[TMP9:%.*]] = mul <4 x i64> [[TMP6]], [[TMP8]]
@@ -195,8 +220,24 @@
 ; AVX2-NEXT:    [[TMP13:%.*]] = and <4 x i64> [[TMP10]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
 ; AVX2-NEXT:    [[TMP14:%.*]] = select <4 x i1> [[TMP12]], <4 x i64> [[TMP13]], <4 x i64> <i64 255, i64 255, i64 255, i64 255>
 ; AVX2-NEXT:    [[TMP15:%.*]] = bitcast i64* [[TMP0]] to <4 x i64>*
-; AVX2-NEXT:    store <4 x i64> [[TMP14]], <4 x i64>* [[TMP15]], align 8, [[TBAA5]]
+; AVX2-NEXT:    store <4 x i64> [[TMP14]], <4 x i64>* [[TMP15]], align 8, !tbaa [[TBAA5]]
 ; AVX2-NEXT:    ret void
+;
+; AVX512-LABEL: @store_i64(
+; AVX512-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
+; AVX512-NEXT:    [[TMP5:%.*]] = bitcast i64* [[TMP0:%.*]] to <4 x i64>*
+; AVX512-NEXT:    [[TMP6:%.*]] = load <4 x i64>, <4 x i64>* [[TMP5]], align 8, !tbaa [[TBAA5:![0-9]+]]
+; AVX512-NEXT:    [[TMP7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0
+; AVX512-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:    [[TMP9:%.*]] = mul <4 x i64> [[TMP6]], [[TMP8]]
+; AVX512-NEXT:    [[TMP10:%.*]] = lshr <4 x i64> [[TMP9]], <i64 15, i64 15, i64 15, i64 15>
+; AVX512-NEXT:    [[TMP11:%.*]] = trunc <4 x i64> [[TMP10]] to <4 x i32>
+; AVX512-NEXT:    [[TMP12:%.*]] = icmp ult <4 x i32> [[TMP11]], <i32 255, i32 255, i32 255, i32 255>
+; AVX512-NEXT:    [[TMP13:%.*]] = and <4 x i64> [[TMP10]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+; AVX512-NEXT:    [[TMP14:%.*]] = select <4 x i1> [[TMP12]], <4 x i64> [[TMP13]], <4 x i64> <i64 255, i64 255, i64 255, i64 255>
+; AVX512-NEXT:    [[TMP15:%.*]] = bitcast i64* [[TMP0]] to <4 x i64>*
+; AVX512-NEXT:    store <4 x i64> [[TMP14]], <4 x i64>* [[TMP15]], align 8, !tbaa [[TBAA5]]
+; AVX512-NEXT:    ret void
 ;
   %4 = zext i32 %1 to i64
   %5 = load i64, i64* %0, align 8, !tbaa !7
Index: llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
@@ -3,24 +3,24 @@
 ; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx      | FileCheck %s --check-prefixes=CHECK,AVX
 ; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2     | FileCheck %s --check-prefixes=CHECK,AVX2
 ; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f  | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512VL
 
 define void @gather_load(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) {
 ; CHECK-LABEL: @gather_load(
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, [[TBAA0:!tbaa !.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, [[TBAA0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3
 ; CHECK-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
 ; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, [[TBAA0]]
+; CHECK-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    ret void
 ;
   %3 = getelementptr inbounds i32, i32* %1, i64 1
@@ -46,67 +46,73 @@
 define void @gather_load_2(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load_2(
 ; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
-; SSE-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0:!tbaa !.*]]
+; SSE-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1
 ; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; SSE-NEXT:    store i32 [[TMP5]], i32* [[TMP0]], align 4, [[TBAA0]]
+; SSE-NEXT:    store i32 [[TMP5]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
-; SSE-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2
 ; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
-; SSE-NEXT:    store i32 [[TMP9]], i32* [[TMP6]], align 4, [[TBAA0]]
+; SSE-NEXT:    store i32 [[TMP9]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
-; SSE-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3
 ; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
-; SSE-NEXT:    store i32 [[TMP13]], i32* [[TMP10]], align 4, [[TBAA0]]
+; SSE-NEXT:    store i32 [[TMP13]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
-; SSE-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4
-; SSE-NEXT:    store i32 [[TMP17]], i32* [[TMP14]], align 4, [[TBAA0]]
+; SSE-NEXT:    store i32 [[TMP17]], i32* [[TMP14]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_2(
 ; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
-; AVX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0:!tbaa !.*]]
-; AVX-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX-NEXT:    store i32 [[TMP5]], i32* [[TMP0]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
-; AVX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2
-; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
-; AVX-NEXT:    store i32 [[TMP9]], i32* [[TMP6]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
-; AVX-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3
-; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
-; AVX-NEXT:    store i32 [[TMP13]], i32* [[TMP10]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
-; AVX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4
-; AVX-NEXT:    store i32 [[TMP17]], i32* [[TMP14]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32*> [[TMP4]], <2 x i32*> undef, <2 x i32> zeroinitializer
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr i32, <2 x i32*> [[TMP5]], <2 x i64> <i64 10, i64 3>
+; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
+; AVX-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP3]], i32 0
+; AVX-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32*> [[TMP6]], i32 0
+; AVX-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32*> [[TMP8]], i32* [[TMP9]], i32 1
+; AVX-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32*> [[TMP6]], i32 1
+; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32*> [[TMP10]], i32* [[TMP11]], i32 2
+; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32*> [[TMP12]], i32* [[TMP7]], i32 3
+; AVX-NEXT:    [[TMP14:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP13]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
+; AVX-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; AVX-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_2(
 ; AVX2-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i32 0
 ; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> undef, <4 x i32> zeroinitializer
 ; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> <i64 1, i64 10, i64 3, i64 5>
-; AVX2-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0:!tbaa !.*]]
+; AVX2-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
 ; AVX2-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4>
 ; AVX2-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
-; AVX2-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, [[TBAA0]]
+; AVX2-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512-LABEL: @gather_load_2(
 ; AVX512-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i32 0
 ; AVX512-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> undef, <4 x i32> zeroinitializer
 ; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> <i64 1, i64 10, i64 3, i64 5>
-; AVX512-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0:!tbaa !.*]]
+; AVX512-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
 ; AVX512-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4>
 ; AVX512-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
-; AVX512-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, [[TBAA0]]
+; AVX512-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]]
 ; AVX512-NEXT:    ret void
+;
+; AVX512VL-LABEL: @gather_load_2(
+; AVX512VL-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i32 0
+; AVX512VL-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> undef, <4 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> <i64 1, i64 10, i64 3, i64 5>
+; AVX512VL-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512VL-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; AVX512VL-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    ret void
 ;
   %3 = getelementptr inbounds i32, i32* %1, i64 1
   %4 = load i32, i32* %3, align 4, !tbaa !2
@@ -133,144 +139,139 @@
 
 define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load_3(
-; SSE-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; SSE-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
-; SSE-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP8:%.*]] = add i32 [[TMP7]], 2
-; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
-; SSE-NEXT:    store i32 [[TMP8]], i32* [[TMP5]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
-; SSE-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], 3
-; SSE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
-; SSE-NEXT:    store i32 [[TMP12]], i32* [[TMP9]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
-; SSE-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP16:%.*]] = add i32 [[TMP15]], 4
-; SSE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
-; SSE-NEXT:    store i32 [[TMP16]], i32* [[TMP13]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
-; SSE-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP20:%.*]] = add i32 [[TMP19]], 1
-; SSE-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; SSE-NEXT:    store i32 [[TMP20]], i32* [[TMP17]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; SSE-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP24:%.*]] = add i32 [[TMP23]], 2
-; SSE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; SSE-NEXT:    store i32 [[TMP24]], i32* [[TMP21]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; SSE-NEXT:    [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP28:%.*]] = add i32 [[TMP27]], 3
-; SSE-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
-; SSE-NEXT:    store i32 [[TMP28]], i32* [[TMP25]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; SSE-NEXT:    [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP32:%.*]] = add i32 [[TMP31]], 4
-; SSE-NEXT:    store i32 [[TMP32]], i32* [[TMP29]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; SSE-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; SSE-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; SSE-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0
+; SSE-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP5]], i32 1
+; SSE-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP7]], i32 2
+; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3
+; SSE-NEXT:    [[TMP14:%.*]] = add <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
+; SSE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4
+; SSE-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; SSE-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
+; SSE-NEXT:    [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP19:%.*]] = add i32 [[TMP18]], 1
+; SSE-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
+; SSE-NEXT:    store i32 [[TMP19]], i32* [[TMP15]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
+; SSE-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP23:%.*]] = add i32 [[TMP22]], 2
+; SSE-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
+; SSE-NEXT:    store i32 [[TMP23]], i32* [[TMP20]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; SSE-NEXT:    [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP27:%.*]] = add i32 [[TMP26]], 3
+; SSE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
+; SSE-NEXT:    store i32 [[TMP27]], i32* [[TMP24]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; SSE-NEXT:    [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP31:%.*]] = add i32 [[TMP30]], 4
+; SSE-NEXT:    store i32 [[TMP31]], i32* [[TMP28]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_3(
-; AVX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
-; AVX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP8:%.*]] = add i32 [[TMP7]], 2
-; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
-; AVX-NEXT:    store i32 [[TMP8]], i32* [[TMP5]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
-; AVX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], 3
-; AVX-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
-; AVX-NEXT:    store i32 [[TMP12]], i32* [[TMP9]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
-; AVX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP16:%.*]] = add i32 [[TMP15]], 4
-; AVX-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
-; AVX-NEXT:    store i32 [[TMP16]], i32* [[TMP13]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
-; AVX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP20:%.*]] = add i32 [[TMP19]], 1
-; AVX-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; AVX-NEXT:    store i32 [[TMP20]], i32* [[TMP17]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP24:%.*]] = add i32 [[TMP23]], 2
-; AVX-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; AVX-NEXT:    store i32 [[TMP24]], i32* [[TMP21]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX-NEXT:    [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP28:%.*]] = add i32 [[TMP27]], 3
-; AVX-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
-; AVX-NEXT:    store i32 [[TMP28]], i32* [[TMP25]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX-NEXT:    [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP32:%.*]] = add i32 [[TMP31]], 4
-; AVX-NEXT:    store i32 [[TMP32]], i32* [[TMP29]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0
+; AVX-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32*> [[TMP10]], <4 x i32*> undef, <4 x i32> zeroinitializer
+; AVX-NEXT:    [[TMP12:%.*]] = getelementptr i32, <4 x i32*> [[TMP11]], <4 x i64> <i64 18, i64 9, i64 6, i64 21>
+; AVX-NEXT:    [[TMP13:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP12]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0
+; AVX-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP7]], i32 2
+; AVX-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> [[TMP17]], i32 [[TMP9]], i32 3
+; AVX-NEXT:    [[TMP19:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; AVX-NEXT:    [[TMP20:%.*]] = add <8 x i32> [[TMP19]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
+; AVX-NEXT:    store <8 x i32> [[TMP20]], <8 x i32>* [[TMP21]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_3(
-; AVX2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX2-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0
 ; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX2-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0]]
-; AVX2-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP9]], <i32 2, i32 3, i32 4, i32 1>
-; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; AVX2-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; AVX2-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX2-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], 2
-; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; AVX2-NEXT:    store i32 [[TMP15]], i32* [[TMP11]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX2-NEXT:    [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP19:%.*]] = add i32 [[TMP18]], 3
-; AVX2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
-; AVX2-NEXT:    store i32 [[TMP19]], i32* [[TMP16]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX2-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP23:%.*]] = add i32 [[TMP22]], 4
-; AVX2-NEXT:    store i32 [[TMP23]], i32* [[TMP20]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> <i64 4, i64 15, i64 18, i64 9>
+; AVX2-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i32 0
+; AVX2-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32*> [[TMP11]], <2 x i32*> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT:    [[TMP13:%.*]] = getelementptr i32, <2 x i32*> [[TMP12]], <2 x i64> <i64 6, i64 21>
+; AVX2-NEXT:    [[TMP14:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP13]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef), !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP14]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0
+; AVX2-NEXT:    [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP5]], i32 1
+; AVX2-NEXT:    [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP17]], <8 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
+; AVX2-NEXT:    [[TMP19:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> [[TMP15]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX2-NEXT:    [[TMP20:%.*]] = add <8 x i32> [[TMP19]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX2-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
+; AVX2-NEXT:    store <8 x i32> [[TMP20]], <8 x i32>* [[TMP21]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512-LABEL: @gather_load_3(
-; AVX512-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX512-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX512-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
 ; AVX512-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0
 ; AVX512-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> undef, <4 x i32> zeroinitializer
-; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX512-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0]]
-; AVX512-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP9]], <i32 2, i32 3, i32 4, i32 1>
-; AVX512-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; AVX512-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; AVX512-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX512-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], 2
-; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; AVX512-NEXT:    store i32 [[TMP15]], i32* [[TMP11]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX512-NEXT:    [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP19:%.*]] = add i32 [[TMP18]], 3
-; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
-; AVX512-NEXT:    store i32 [[TMP19]], i32* [[TMP16]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX512-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP23:%.*]] = add i32 [[TMP22]], 4
-; AVX512-NEXT:    store i32 [[TMP23]], i32* [[TMP20]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> <i64 4, i64 15, i64 18, i64 9>
+; AVX512-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX512-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i32 0
+; AVX512-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32*> [[TMP11]], <2 x i32*> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT:    [[TMP13:%.*]] = getelementptr i32, <2 x i32*> [[TMP12]], <2 x i64> <i64 6, i64 21>
+; AVX512-NEXT:    [[TMP14:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP13]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef), !tbaa [[TBAA0]]
+; AVX512-NEXT:    [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP14]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0
+; AVX512-NEXT:    [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP5]], i32 1
+; AVX512-NEXT:    [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP17]], <8 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
+; AVX512-NEXT:    [[TMP19:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> [[TMP15]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX512-NEXT:    [[TMP20:%.*]] = add <8 x i32> [[TMP19]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX512-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
+; AVX512-NEXT:    store <8 x i32> [[TMP20]], <8 x i32>* [[TMP21]], align 4, !tbaa [[TBAA0]]
 ; AVX512-NEXT:    ret void
+;
+; AVX512VL-LABEL: @gather_load_3(
+; AVX512VL-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0
+; AVX512VL-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32*> [[TMP4]], <4 x i32*> undef, <4 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP6:%.*]] = getelementptr i32, <4 x i32*> [[TMP5]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
+; AVX512VL-NEXT:    [[TMP7:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP6]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
+; AVX512VL-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; AVX512VL-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX512VL-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0
+; AVX512VL-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP7]], i32 0
+; AVX512VL-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[TMP15]], i32 1
+; AVX512VL-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[TMP7]], i32 1
+; AVX512VL-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP17]], i32 2
+; AVX512VL-NEXT:    [[TMP19:%.*]] = extractelement <4 x i32> [[TMP7]], i32 2
+; AVX512VL-NEXT:    [[TMP20:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP19]], i32 3
+; AVX512VL-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[TMP7]], i32 3
+; AVX512VL-NEXT:    [[TMP22:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP21]], i32 4
+; AVX512VL-NEXT:    [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP9]], i32 5
+; AVX512VL-NEXT:    [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP11]], i32 6
+; AVX512VL-NEXT:    [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP13]], i32 7
+; AVX512VL-NEXT:    [[TMP26:%.*]] = add <8 x i32> [[TMP25]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX512VL-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
+; AVX512VL-NEXT:    store <8 x i32> [[TMP26]], <8 x i32>* [[TMP27]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    ret void
 ;
   %3 = load i32, i32* %1, align 4, !tbaa !2
   %4 = add i32 %3, 1
@@ -315,13 +316,10 @@
 
 define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture readonly %t1) {
 ; SSE-LABEL: @gather_load_4(
-; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
 ; SSE-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
-; SSE-NEXT:    [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2
 ; SSE-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
-; SSE-NEXT:    [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3
 ; SSE-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
-; SSE-NEXT:    [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4
+; SSE-NEXT:    [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4
 ; SSE-NEXT:    [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
 ; SSE-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
 ; SSE-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
@@ -329,130 +327,128 @@
 ; SSE-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
 ; SSE-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
 ; SSE-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
-; SSE-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[T7:%.*]] = load i32, i32* [[T6]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[T11:%.*]] = load i32, i32* [[T10]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[T15:%.*]] = load i32, i32* [[T14]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[T19:%.*]] = load i32, i32* [[T18]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
-; SSE-NEXT:    [[T8:%.*]] = add i32 [[T7]], 2
-; SSE-NEXT:    [[T12:%.*]] = add i32 [[T11]], 3
-; SSE-NEXT:    [[T16:%.*]] = add i32 [[T15]], 4
+; SSE-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i32 0
+; SSE-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i32 1
+; SSE-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i32 2
+; SSE-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i32 3
+; SSE-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], <i32 1, i32 2, i32 3, i32 4>
 ; SSE-NEXT:    [[T20:%.*]] = add i32 [[T19]], 1
 ; SSE-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
 ; SSE-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
 ; SSE-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
-; SSE-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]]
-; SSE-NEXT:    store i32 [[T8]], i32* [[T5]], align 4, [[TBAA0]]
-; SSE-NEXT:    store i32 [[T12]], i32* [[T9]], align 4, [[TBAA0]]
-; SSE-NEXT:    store i32 [[T16]], i32* [[T13]], align 4, [[TBAA0]]
-; SSE-NEXT:    store i32 [[T20]], i32* [[T17]], align 4, [[TBAA0]]
-; SSE-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]]
-; SSE-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]]
-; SSE-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP6:%.*]] = bitcast i32* [[T0]] to <4 x i32>*
+; SSE-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    store i32 [[T20]], i32* [[T17]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_4(
-; AVX-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
 ; AVX-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
-; AVX-NEXT:    [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2
 ; AVX-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
-; AVX-NEXT:    [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3
 ; AVX-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
-; AVX-NEXT:    [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4
-; AVX-NEXT:    [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
-; AVX-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
-; AVX-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
-; AVX-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
-; AVX-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
-; AVX-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
-; AVX-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
-; AVX-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T7:%.*]] = load i32, i32* [[T6]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T11:%.*]] = load i32, i32* [[T10]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T15:%.*]] = load i32, i32* [[T14]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T19:%.*]] = load i32, i32* [[T18]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
-; AVX-NEXT:    [[T8:%.*]] = add i32 [[T7]], 2
-; AVX-NEXT:    [[T12:%.*]] = add i32 [[T11]], 3
-; AVX-NEXT:    [[T16:%.*]] = add i32 [[T15]], 4
-; AVX-NEXT:    [[T20:%.*]] = add i32 [[T19]], 1
-; AVX-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
-; AVX-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
-; AVX-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
-; AVX-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T8]], i32* [[T5]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T12]], i32* [[T9]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T16]], i32* [[T13]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T20]], i32* [[T17]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i32 0
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer
+; AVX-NEXT:    [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> <i64 18, i64 9, i64 6, i64 21>
+; AVX-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0
+; AVX-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T7]], i32 1
+; AVX-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T11]], i32 2
+; AVX-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[T15]], i32 3
+; AVX-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; AVX-NEXT:    [[TMP11:%.*]] = add <8 x i32> [[TMP10]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX-NEXT:    [[TMP12:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
+; AVX-NEXT:    store <8 x i32> [[TMP11]], <8 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_4(
-; AVX2-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
-; AVX2-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0
+; AVX2-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
+; AVX2-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i32 0
 ; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX2-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
-; AVX2-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
-; AVX2-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
-; AVX2-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
-; AVX2-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
-; AVX2-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
-; AVX2-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0]]
-; AVX2-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
-; AVX2-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], <i32 2, i32 3, i32 4, i32 1>
-; AVX2-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
-; AVX2-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
-; AVX2-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
-; AVX2-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>*
-; AVX2-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, [[TBAA0]]
-; AVX2-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]]
-; AVX2-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]]
-; AVX2-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> <i64 4, i64 15, i64 18, i64 9>
+; AVX2-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i32 0
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32*> [[TMP4]], <2 x i32*> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i32, <2 x i32*> [[TMP5]], <2 x i64> <i64 6, i64 21>
+; AVX2-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP7:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    [[TMP9:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP6]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef), !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0
+; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[T7]], i32 1
+; AVX2-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i32> [[TMP12]], <8 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
+; AVX2-NEXT:    [[TMP14:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX2-NEXT:    [[TMP15:%.*]] = add <8 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX2-NEXT:    [[TMP16:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
+; AVX2-NEXT:    store <8 x i32> [[TMP15]], <8 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512-LABEL: @gather_load_4(
-; AVX512-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
-; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0
+; AVX512-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
+; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i32 0
 ; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer
-; AVX512-NEXT:    [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX512-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
-; AVX512-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
-; AVX512-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
-; AVX512-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
-; AVX512-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
-; AVX512-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
-; AVX512-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0]]
-; AVX512-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
-; AVX512-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], <i32 2, i32 3, i32 4, i32 1>
-; AVX512-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
-; AVX512-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
-; AVX512-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
-; AVX512-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>*
-; AVX512-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, [[TBAA0]]
-; AVX512-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]]
-; AVX512-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]]
-; AVX512-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> <i64 4, i64 15, i64 18, i64 9>
+; AVX512-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i32 0
+; AVX512-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32*> [[TMP4]], <2 x i32*> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr i32, <2 x i32*> [[TMP5]], <2 x i64> <i64 6, i64 21>
+; AVX512-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
+; AVX512-NEXT:    [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]]
+; AVX512-NEXT:    [[TMP7:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX512-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:    [[TMP9:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP6]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef), !tbaa [[TBAA0]]
+; AVX512-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0
+; AVX512-NEXT:    [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[T7]], i32 1
+; AVX512-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i32> [[TMP12]], <8 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
+; AVX512-NEXT:    [[TMP14:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX512-NEXT:    [[TMP15:%.*]] = add <8 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX512-NEXT:    [[TMP16:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
+; AVX512-NEXT:    store <8 x i32> [[TMP15]], <8 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
 ; AVX512-NEXT:    ret void
+;
+; AVX512VL-LABEL: @gather_load_4(
+; AVX512VL-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0
+; AVX512VL-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
+; AVX512VL-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
+; AVX512VL-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
+; AVX512VL-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
+; AVX512VL-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0
+; AVX512VL-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; AVX512VL-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP6]], i32 1
+; AVX512VL-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
+; AVX512VL-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP8]], i32 2
+; AVX512VL-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; AVX512VL-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP10]], i32 3
+; AVX512VL-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; AVX512VL-NEXT:    [[TMP13:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP12]], i32 4
+; AVX512VL-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[T23]], i32 5
+; AVX512VL-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[T27]], i32 6
+; AVX512VL-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[T31]], i32 7
+; AVX512VL-NEXT:    [[TMP17:%.*]] = add <8 x i32> [[TMP16]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX512VL-NEXT:    [[TMP18:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
+; AVX512VL-NEXT:    store <8 x i32> [[TMP17]], <8 x i32>* [[TMP18]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    ret void
 ;
   %t5 = getelementptr inbounds i32, i32* %t0, i64 1
   %t6 = getelementptr inbounds i32, i32* %t1, i64 11
@@ -509,21 +505,21 @@
 ; SSE-NEXT:    [[TMP7:%.*]] = insertelement <4 x float*> [[TMP6]], float* [[TMP3]], i32 1
 ; SSE-NEXT:    [[TMP8:%.*]] = insertelement <4 x float*> [[TMP7]], float* [[TMP4]], i32 2
 ; SSE-NEXT:    [[TMP9:%.*]] = insertelement <4 x float*> [[TMP8]], float* [[TMP5]], i32 3
-; SSE-NEXT:    [[TMP10:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP9]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), [[TBAA0]]
+; SSE-NEXT:    [[TMP10:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP9]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> undef, <4 x i32> zeroinitializer
 ; SSE-NEXT:    [[TMP12:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> <i64 4, i64 13, i64 11, i64 44>
-; SSE-NEXT:    [[TMP13:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP12]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), [[TBAA0]]
+; SSE-NEXT:    [[TMP13:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP12]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP14:%.*]] = fdiv <4 x float> [[TMP10]], [[TMP13]]
 ; SSE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4
 ; SSE-NEXT:    [[TMP16:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
-; SSE-NEXT:    store <4 x float> [[TMP14]], <4 x float>* [[TMP16]], align 4, [[TBAA0]]
+; SSE-NEXT:    store <4 x float> [[TMP14]], <4 x float>* [[TMP16]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP17:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> <i64 17, i64 8, i64 5, i64 20>
-; SSE-NEXT:    [[TMP18:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP17]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), [[TBAA0]]
+; SSE-NEXT:    [[TMP18:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP17]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP19:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> <i64 33, i64 30, i64 27, i64 23>
-; SSE-NEXT:    [[TMP20:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP19]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), [[TBAA0]]
+; SSE-NEXT:    [[TMP20:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP19]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP21:%.*]] = fdiv <4 x float> [[TMP18]], [[TMP20]]
 ; SSE-NEXT:    [[TMP22:%.*]] = bitcast float* [[TMP15]] to <4 x float>*
-; SSE-NEXT:    store <4 x float> [[TMP21]], <4 x float>* [[TMP22]], align 4, [[TBAA0]]
+; SSE-NEXT:    store <4 x float> [[TMP21]], <4 x float>* [[TMP22]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_div(
@@ -542,13 +538,13 @@
 ; AVX-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5
 ; AVX-NEXT:    [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6
 ; AVX-NEXT:    [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7
-; AVX-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
+; AVX-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
 ; AVX-NEXT:    [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
+; AVX-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]]
 ; AVX-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
-; AVX-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]]
+; AVX-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_div(
@@ -567,13 +563,13 @@
 ; AVX2-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5
 ; AVX2-NEXT:    [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6
 ; AVX2-NEXT:    [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7
-; AVX2-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
+; AVX2-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
 ; AVX2-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
 ; AVX2-NEXT:    [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX2-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
+; AVX2-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
 ; AVX2-NEXT:    [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]]
 ; AVX2-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
-; AVX2-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]]
+; AVX2-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512-LABEL: @gather_load_div(
@@ -592,14 +588,39 @@
 ; AVX512-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5
 ; AVX512-NEXT:    [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6
 ; AVX512-NEXT:    [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7
-; AVX512-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
+; AVX512-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
 ; AVX512-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
 ; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX512-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
+; AVX512-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
 ; AVX512-NEXT:    [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]]
 ; AVX512-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
-; AVX512-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]]
+; AVX512-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]]
 ; AVX512-NEXT:    ret void
+;
+; AVX512VL-LABEL: @gather_load_div(
+; AVX512VL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
+; AVX512VL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
+; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
+; AVX512VL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
+; AVX512VL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
+; AVX512VL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
+; AVX512VL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; AVX512VL-NEXT:    [[TMP10:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0
+; AVX512VL-NEXT:    [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1
+; AVX512VL-NEXT:    [[TMP12:%.*]] = insertelement <8 x float*> [[TMP11]], float* [[TMP4]], i32 2
+; AVX512VL-NEXT:    [[TMP13:%.*]] = insertelement <8 x float*> [[TMP12]], float* [[TMP5]], i32 3
+; AVX512VL-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP6]], i32 4
+; AVX512VL-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5
+; AVX512VL-NEXT:    [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6
+; AVX512VL-NEXT:    [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7
+; AVX512VL-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
+; AVX512VL-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]]
+; AVX512VL-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
+; AVX512VL-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    ret void
 ;
   %3 = load float, float* %1, align 4, !tbaa !2
   %4 = getelementptr inbounds float, float* %1, i64 4
Index: llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
@@ -3,24 +3,24 @@
 ; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx      | FileCheck %s --check-prefixes=CHECK,AVX
 ; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2     | FileCheck %s --check-prefixes=CHECK,AVX2
 ; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f  | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512VL
 
 define void @gather_load(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) {
 ; CHECK-LABEL: @gather_load(
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, [[TBAA0:!tbaa !.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, [[TBAA0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3
 ; CHECK-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
 ; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, [[TBAA0]]
+; CHECK-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    ret void
 ;
   %3 = getelementptr inbounds i32, i32* %1, i64 1
@@ -46,67 +46,73 @@
 define void @gather_load_2(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load_2(
 ; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
-; SSE-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0:!tbaa !.*]]
+; SSE-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1
 ; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; SSE-NEXT:    store i32 [[TMP5]], i32* [[TMP0]], align 4, [[TBAA0]]
+; SSE-NEXT:    store i32 [[TMP5]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
-; SSE-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2
 ; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
-; SSE-NEXT:    store i32 [[TMP9]], i32* [[TMP6]], align 4, [[TBAA0]]
+; SSE-NEXT:    store i32 [[TMP9]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
-; SSE-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3
 ; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
-; SSE-NEXT:    store i32 [[TMP13]], i32* [[TMP10]], align 4, [[TBAA0]]
+; SSE-NEXT:    store i32 [[TMP13]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
-; SSE-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4
-; SSE-NEXT:    store i32 [[TMP17]], i32* [[TMP14]], align 4, [[TBAA0]]
+; SSE-NEXT:    store i32 [[TMP17]], i32* [[TMP14]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_2(
 ; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
-; AVX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0:!tbaa !.*]]
-; AVX-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX-NEXT:    store i32 [[TMP5]], i32* [[TMP0]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
-; AVX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2
-; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
-; AVX-NEXT:    store i32 [[TMP9]], i32* [[TMP6]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
-; AVX-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3
-; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
-; AVX-NEXT:    store i32 [[TMP13]], i32* [[TMP10]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
-; AVX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4
-; AVX-NEXT:    store i32 [[TMP17]], i32* [[TMP14]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32*> [[TMP4]], <2 x i32*> undef, <2 x i32> zeroinitializer
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr i32, <2 x i32*> [[TMP5]], <2 x i64> <i64 10, i64 3>
+; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
+; AVX-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP3]], i32 0
+; AVX-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32*> [[TMP6]], i32 0
+; AVX-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32*> [[TMP8]], i32* [[TMP9]], i32 1
+; AVX-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32*> [[TMP6]], i32 1
+; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32*> [[TMP10]], i32* [[TMP11]], i32 2
+; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32*> [[TMP12]], i32* [[TMP7]], i32 3
+; AVX-NEXT:    [[TMP14:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP13]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
+; AVX-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; AVX-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_2(
 ; AVX2-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i32 0
 ; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> undef, <4 x i32> zeroinitializer
 ; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> <i64 1, i64 10, i64 3, i64 5>
-; AVX2-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0:!tbaa !.*]]
+; AVX2-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
 ; AVX2-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4>
 ; AVX2-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
-; AVX2-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, [[TBAA0]]
+; AVX2-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512-LABEL: @gather_load_2(
 ; AVX512-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i32 0
 ; AVX512-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> undef, <4 x i32> zeroinitializer
 ; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> <i64 1, i64 10, i64 3, i64 5>
-; AVX512-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0:!tbaa !.*]]
+; AVX512-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
 ; AVX512-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4>
 ; AVX512-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
-; AVX512-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, [[TBAA0]]
+; AVX512-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]]
 ; AVX512-NEXT:    ret void
+;
+; AVX512VL-LABEL: @gather_load_2(
+; AVX512VL-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i32 0
+; AVX512VL-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> undef, <4 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> <i64 1, i64 10, i64 3, i64 5>
+; AVX512VL-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512VL-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; AVX512VL-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    ret void
 ;
   %3 = getelementptr inbounds i32, i32* %1, i64 1
   %4 = load i32, i32* %3, align 4, !tbaa !2
@@ -133,144 +139,139 @@
 
 define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load_3(
-; SSE-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; SSE-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
-; SSE-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP8:%.*]] = add i32 [[TMP7]], 2
-; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
-; SSE-NEXT:    store i32 [[TMP8]], i32* [[TMP5]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
-; SSE-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], 3
-; SSE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
-; SSE-NEXT:    store i32 [[TMP12]], i32* [[TMP9]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
-; SSE-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP16:%.*]] = add i32 [[TMP15]], 4
-; SSE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
-; SSE-NEXT:    store i32 [[TMP16]], i32* [[TMP13]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
-; SSE-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP20:%.*]] = add i32 [[TMP19]], 1
-; SSE-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; SSE-NEXT:    store i32 [[TMP20]], i32* [[TMP17]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; SSE-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP24:%.*]] = add i32 [[TMP23]], 2
-; SSE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; SSE-NEXT:    store i32 [[TMP24]], i32* [[TMP21]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; SSE-NEXT:    [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP28:%.*]] = add i32 [[TMP27]], 3
-; SSE-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
-; SSE-NEXT:    store i32 [[TMP28]], i32* [[TMP25]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; SSE-NEXT:    [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP32:%.*]] = add i32 [[TMP31]], 4
-; SSE-NEXT:    store i32 [[TMP32]], i32* [[TMP29]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; SSE-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; SSE-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; SSE-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0
+; SSE-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP5]], i32 1
+; SSE-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP7]], i32 2
+; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3
+; SSE-NEXT:    [[TMP14:%.*]] = add <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
+; SSE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4
+; SSE-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; SSE-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
+; SSE-NEXT:    [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP19:%.*]] = add i32 [[TMP18]], 1
+; SSE-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
+; SSE-NEXT:    store i32 [[TMP19]], i32* [[TMP15]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
+; SSE-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP23:%.*]] = add i32 [[TMP22]], 2
+; SSE-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
+; SSE-NEXT:    store i32 [[TMP23]], i32* [[TMP20]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; SSE-NEXT:    [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP27:%.*]] = add i32 [[TMP26]], 3
+; SSE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
+; SSE-NEXT:    store i32 [[TMP27]], i32* [[TMP24]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; SSE-NEXT:    [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP31:%.*]] = add i32 [[TMP30]], 4
+; SSE-NEXT:    store i32 [[TMP31]], i32* [[TMP28]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_3(
-; AVX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
-; AVX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP8:%.*]] = add i32 [[TMP7]], 2
-; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
-; AVX-NEXT:    store i32 [[TMP8]], i32* [[TMP5]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
-; AVX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], 3
-; AVX-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
-; AVX-NEXT:    store i32 [[TMP12]], i32* [[TMP9]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
-; AVX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP16:%.*]] = add i32 [[TMP15]], 4
-; AVX-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
-; AVX-NEXT:    store i32 [[TMP16]], i32* [[TMP13]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
-; AVX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP20:%.*]] = add i32 [[TMP19]], 1
-; AVX-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; AVX-NEXT:    store i32 [[TMP20]], i32* [[TMP17]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP24:%.*]] = add i32 [[TMP23]], 2
-; AVX-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; AVX-NEXT:    store i32 [[TMP24]], i32* [[TMP21]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX-NEXT:    [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP28:%.*]] = add i32 [[TMP27]], 3
-; AVX-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
-; AVX-NEXT:    store i32 [[TMP28]], i32* [[TMP25]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX-NEXT:    [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP32:%.*]] = add i32 [[TMP31]], 4
-; AVX-NEXT:    store i32 [[TMP32]], i32* [[TMP29]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0
+; AVX-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32*> [[TMP10]], <4 x i32*> undef, <4 x i32> zeroinitializer
+; AVX-NEXT:    [[TMP12:%.*]] = getelementptr i32, <4 x i32*> [[TMP11]], <4 x i64> <i64 18, i64 9, i64 6, i64 21>
+; AVX-NEXT:    [[TMP13:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP12]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0
+; AVX-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP7]], i32 2
+; AVX-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> [[TMP17]], i32 [[TMP9]], i32 3
+; AVX-NEXT:    [[TMP19:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; AVX-NEXT:    [[TMP20:%.*]] = add <8 x i32> [[TMP19]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
+; AVX-NEXT:    store <8 x i32> [[TMP20]], <8 x i32>* [[TMP21]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_3(
-; AVX2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX2-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0
 ; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX2-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0]]
-; AVX2-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP9]], <i32 2, i32 3, i32 4, i32 1>
-; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; AVX2-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; AVX2-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX2-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], 2
-; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; AVX2-NEXT:    store i32 [[TMP15]], i32* [[TMP11]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX2-NEXT:    [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP19:%.*]] = add i32 [[TMP18]], 3
-; AVX2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
-; AVX2-NEXT:    store i32 [[TMP19]], i32* [[TMP16]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX2-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP23:%.*]] = add i32 [[TMP22]], 4
-; AVX2-NEXT:    store i32 [[TMP23]], i32* [[TMP20]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> <i64 4, i64 15, i64 18, i64 9>
+; AVX2-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i32 0
+; AVX2-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32*> [[TMP11]], <2 x i32*> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT:    [[TMP13:%.*]] = getelementptr i32, <2 x i32*> [[TMP12]], <2 x i64> <i64 6, i64 21>
+; AVX2-NEXT:    [[TMP14:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP13]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef), !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP14]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0
+; AVX2-NEXT:    [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP5]], i32 1
+; AVX2-NEXT:    [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP17]], <8 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
+; AVX2-NEXT:    [[TMP19:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> [[TMP15]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX2-NEXT:    [[TMP20:%.*]] = add <8 x i32> [[TMP19]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX2-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
+; AVX2-NEXT:    store <8 x i32> [[TMP20]], <8 x i32>* [[TMP21]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512-LABEL: @gather_load_3(
-; AVX512-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX512-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX512-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
 ; AVX512-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0
 ; AVX512-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> undef, <4 x i32> zeroinitializer
-; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX512-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0]]
-; AVX512-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP9]], <i32 2, i32 3, i32 4, i32 1>
-; AVX512-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; AVX512-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; AVX512-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX512-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], 2
-; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; AVX512-NEXT:    store i32 [[TMP15]], i32* [[TMP11]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX512-NEXT:    [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP19:%.*]] = add i32 [[TMP18]], 3
-; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
-; AVX512-NEXT:    store i32 [[TMP19]], i32* [[TMP16]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX512-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP23:%.*]] = add i32 [[TMP22]], 4
-; AVX512-NEXT:    store i32 [[TMP23]], i32* [[TMP20]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> <i64 4, i64 15, i64 18, i64 9>
+; AVX512-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX512-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i32 0
+; AVX512-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32*> [[TMP11]], <2 x i32*> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT:    [[TMP13:%.*]] = getelementptr i32, <2 x i32*> [[TMP12]], <2 x i64> <i64 6, i64 21>
+; AVX512-NEXT:    [[TMP14:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP13]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef), !tbaa [[TBAA0]]
+; AVX512-NEXT:    [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP14]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0
+; AVX512-NEXT:    [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP5]], i32 1
+; AVX512-NEXT:    [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP17]], <8 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
+; AVX512-NEXT:    [[TMP19:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> [[TMP15]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX512-NEXT:    [[TMP20:%.*]] = add <8 x i32> [[TMP19]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX512-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
+; AVX512-NEXT:    store <8 x i32> [[TMP20]], <8 x i32>* [[TMP21]], align 4, !tbaa [[TBAA0]]
 ; AVX512-NEXT:    ret void
+;
+; AVX512VL-LABEL: @gather_load_3(
+; AVX512VL-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0
+; AVX512VL-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32*> [[TMP4]], <4 x i32*> undef, <4 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP6:%.*]] = getelementptr i32, <4 x i32*> [[TMP5]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
+; AVX512VL-NEXT:    [[TMP7:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP6]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
+; AVX512VL-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; AVX512VL-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX512VL-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0
+; AVX512VL-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP7]], i32 0
+; AVX512VL-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[TMP15]], i32 1
+; AVX512VL-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[TMP7]], i32 1
+; AVX512VL-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP17]], i32 2
+; AVX512VL-NEXT:    [[TMP19:%.*]] = extractelement <4 x i32> [[TMP7]], i32 2
+; AVX512VL-NEXT:    [[TMP20:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP19]], i32 3
+; AVX512VL-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[TMP7]], i32 3
+; AVX512VL-NEXT:    [[TMP22:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP21]], i32 4
+; AVX512VL-NEXT:    [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP9]], i32 5
+; AVX512VL-NEXT:    [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP11]], i32 6
+; AVX512VL-NEXT:    [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP13]], i32 7
+; AVX512VL-NEXT:    [[TMP26:%.*]] = add <8 x i32> [[TMP25]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX512VL-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
+; AVX512VL-NEXT:    store <8 x i32> [[TMP26]], <8 x i32>* [[TMP27]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    ret void
 ;
   %3 = load i32, i32* %1, align 4, !tbaa !2
   %4 = add i32 %3, 1
@@ -315,13 +316,10 @@
 
 define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture readonly %t1) {
 ; SSE-LABEL: @gather_load_4(
-; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
 ; SSE-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
-; SSE-NEXT:    [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2
 ; SSE-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
-; SSE-NEXT:    [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3
 ; SSE-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
-; SSE-NEXT:    [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4
+; SSE-NEXT:    [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4
 ; SSE-NEXT:    [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
 ; SSE-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
 ; SSE-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
@@ -329,130 +327,128 @@
 ; SSE-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
 ; SSE-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
 ; SSE-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
-; SSE-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[T7:%.*]] = load i32, i32* [[T6]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[T11:%.*]] = load i32, i32* [[T10]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[T15:%.*]] = load i32, i32* [[T14]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[T19:%.*]] = load i32, i32* [[T18]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
-; SSE-NEXT:    [[T8:%.*]] = add i32 [[T7]], 2
-; SSE-NEXT:    [[T12:%.*]] = add i32 [[T11]], 3
-; SSE-NEXT:    [[T16:%.*]] = add i32 [[T15]], 4
+; SSE-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i32 0
+; SSE-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i32 1
+; SSE-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i32 2
+; SSE-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i32 3
+; SSE-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], <i32 1, i32 2, i32 3, i32 4>
 ; SSE-NEXT:    [[T20:%.*]] = add i32 [[T19]], 1
 ; SSE-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
 ; SSE-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
 ; SSE-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
-; SSE-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]]
-; SSE-NEXT:    store i32 [[T8]], i32* [[T5]], align 4, [[TBAA0]]
-; SSE-NEXT:    store i32 [[T12]], i32* [[T9]], align 4, [[TBAA0]]
-; SSE-NEXT:    store i32 [[T16]], i32* [[T13]], align 4, [[TBAA0]]
-; SSE-NEXT:    store i32 [[T20]], i32* [[T17]], align 4, [[TBAA0]]
-; SSE-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]]
-; SSE-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]]
-; SSE-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP6:%.*]] = bitcast i32* [[T0]] to <4 x i32>*
+; SSE-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    store i32 [[T20]], i32* [[T17]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_4(
-; AVX-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
 ; AVX-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
-; AVX-NEXT:    [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2
 ; AVX-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
-; AVX-NEXT:    [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3
 ; AVX-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
-; AVX-NEXT:    [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4
-; AVX-NEXT:    [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
-; AVX-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
-; AVX-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
-; AVX-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
-; AVX-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
-; AVX-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
-; AVX-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
-; AVX-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T7:%.*]] = load i32, i32* [[T6]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T11:%.*]] = load i32, i32* [[T10]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T15:%.*]] = load i32, i32* [[T14]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T19:%.*]] = load i32, i32* [[T18]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
-; AVX-NEXT:    [[T8:%.*]] = add i32 [[T7]], 2
-; AVX-NEXT:    [[T12:%.*]] = add i32 [[T11]], 3
-; AVX-NEXT:    [[T16:%.*]] = add i32 [[T15]], 4
-; AVX-NEXT:    [[T20:%.*]] = add i32 [[T19]], 1
-; AVX-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
-; AVX-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
-; AVX-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
-; AVX-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T8]], i32* [[T5]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T12]], i32* [[T9]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T16]], i32* [[T13]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T20]], i32* [[T17]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i32 0
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer
+; AVX-NEXT:    [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> <i64 18, i64 9, i64 6, i64 21>
+; AVX-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0
+; AVX-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T7]], i32 1
+; AVX-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T11]], i32 2
+; AVX-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[T15]], i32 3
+; AVX-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; AVX-NEXT:    [[TMP11:%.*]] = add <8 x i32> [[TMP10]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX-NEXT:    [[TMP12:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
+; AVX-NEXT:    store <8 x i32> [[TMP11]], <8 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_4(
-; AVX2-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
-; AVX2-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0
+; AVX2-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
+; AVX2-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i32 0
 ; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX2-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
-; AVX2-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
-; AVX2-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
-; AVX2-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
-; AVX2-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
-; AVX2-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
-; AVX2-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0]]
-; AVX2-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
-; AVX2-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], <i32 2, i32 3, i32 4, i32 1>
-; AVX2-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
-; AVX2-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
-; AVX2-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
-; AVX2-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>*
-; AVX2-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, [[TBAA0]]
-; AVX2-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]]
-; AVX2-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]]
-; AVX2-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> <i64 4, i64 15, i64 18, i64 9>
+; AVX2-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i32 0
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32*> [[TMP4]], <2 x i32*> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i32, <2 x i32*> [[TMP5]], <2 x i64> <i64 6, i64 21>
+; AVX2-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP7:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    [[TMP9:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP6]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef), !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0
+; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[T7]], i32 1
+; AVX2-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i32> [[TMP12]], <8 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
+; AVX2-NEXT:    [[TMP14:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX2-NEXT:    [[TMP15:%.*]] = add <8 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX2-NEXT:    [[TMP16:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
+; AVX2-NEXT:    store <8 x i32> [[TMP15]], <8 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512-LABEL: @gather_load_4(
-; AVX512-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
-; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0
+; AVX512-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
+; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i32 0
 ; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer
-; AVX512-NEXT:    [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX512-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
-; AVX512-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
-; AVX512-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
-; AVX512-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
-; AVX512-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
-; AVX512-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
-; AVX512-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0]]
-; AVX512-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
-; AVX512-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], <i32 2, i32 3, i32 4, i32 1>
-; AVX512-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
-; AVX512-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
-; AVX512-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
-; AVX512-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>*
-; AVX512-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, [[TBAA0]]
-; AVX512-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]]
-; AVX512-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]]
-; AVX512-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> <i64 4, i64 15, i64 18, i64 9>
+; AVX512-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i32 0
+; AVX512-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32*> [[TMP4]], <2 x i32*> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr i32, <2 x i32*> [[TMP5]], <2 x i64> <i64 6, i64 21>
+; AVX512-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
+; AVX512-NEXT:    [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]]
+; AVX512-NEXT:    [[TMP7:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX512-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:    [[TMP9:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP6]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef), !tbaa [[TBAA0]]
+; AVX512-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0
+; AVX512-NEXT:    [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[T7]], i32 1
+; AVX512-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i32> [[TMP12]], <8 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
+; AVX512-NEXT:    [[TMP14:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX512-NEXT:    [[TMP15:%.*]] = add <8 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX512-NEXT:    [[TMP16:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
+; AVX512-NEXT:    store <8 x i32> [[TMP15]], <8 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
 ; AVX512-NEXT:    ret void
+;
+; AVX512VL-LABEL: @gather_load_4(
+; AVX512VL-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0
+; AVX512VL-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
+; AVX512VL-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
+; AVX512VL-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
+; AVX512VL-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
+; AVX512VL-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0
+; AVX512VL-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; AVX512VL-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP6]], i32 1
+; AVX512VL-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
+; AVX512VL-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP8]], i32 2
+; AVX512VL-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; AVX512VL-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP10]], i32 3
+; AVX512VL-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; AVX512VL-NEXT:    [[TMP13:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP12]], i32 4
+; AVX512VL-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[T23]], i32 5
+; AVX512VL-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[T27]], i32 6
+; AVX512VL-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[T31]], i32 7
+; AVX512VL-NEXT:    [[TMP17:%.*]] = add <8 x i32> [[TMP16]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX512VL-NEXT:    [[TMP18:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
+; AVX512VL-NEXT:    store <8 x i32> [[TMP17]], <8 x i32>* [[TMP18]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    ret void
 ;
   %t5 = getelementptr inbounds i32, i32* %t0, i64 1
   %t6 = getelementptr inbounds i32, i32* %t1, i64 11
@@ -509,21 +505,21 @@
 ; SSE-NEXT:    [[TMP7:%.*]] = insertelement <4 x float*> [[TMP6]], float* [[TMP3]], i32 1
 ; SSE-NEXT:    [[TMP8:%.*]] = insertelement <4 x float*> [[TMP7]], float* [[TMP4]], i32 2
 ; SSE-NEXT:    [[TMP9:%.*]] = insertelement <4 x float*> [[TMP8]], float* [[TMP5]], i32 3
-; SSE-NEXT:    [[TMP10:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP9]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), [[TBAA0]]
+; SSE-NEXT:    [[TMP10:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP9]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> undef, <4 x i32> zeroinitializer
 ; SSE-NEXT:    [[TMP12:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> <i64 4, i64 13, i64 11, i64 44>
-; SSE-NEXT:    [[TMP13:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP12]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), [[TBAA0]]
+; SSE-NEXT:    [[TMP13:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP12]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP14:%.*]] = fdiv <4 x float> [[TMP10]], [[TMP13]]
 ; SSE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4
 ; SSE-NEXT:    [[TMP16:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
-; SSE-NEXT:    store <4 x float> [[TMP14]], <4 x float>* [[TMP16]], align 4, [[TBAA0]]
+; SSE-NEXT:    store <4 x float> [[TMP14]], <4 x float>* [[TMP16]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP17:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> <i64 17, i64 8, i64 5, i64 20>
-; SSE-NEXT:    [[TMP18:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP17]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), [[TBAA0]]
+; SSE-NEXT:    [[TMP18:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP17]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP19:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> <i64 33, i64 30, i64 27, i64 23>
-; SSE-NEXT:    [[TMP20:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP19]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), [[TBAA0]]
+; SSE-NEXT:    [[TMP20:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP19]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP21:%.*]] = fdiv <4 x float> [[TMP18]], [[TMP20]]
 ; SSE-NEXT:    [[TMP22:%.*]] = bitcast float* [[TMP15]] to <4 x float>*
-; SSE-NEXT:    store <4 x float> [[TMP21]], <4 x float>* [[TMP22]], align 4, [[TBAA0]]
+; SSE-NEXT:    store <4 x float> [[TMP21]], <4 x float>* [[TMP22]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_div(
@@ -542,13 +538,13 @@
 ; AVX-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5
 ; AVX-NEXT:    [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6
 ; AVX-NEXT:    [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7
-; AVX-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
+; AVX-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
 ; AVX-NEXT:    [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
+; AVX-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]]
 ; AVX-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
-; AVX-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]]
+; AVX-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_div(
@@ -567,13 +563,13 @@
 ; AVX2-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5
 ; AVX2-NEXT:    [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6
 ; AVX2-NEXT:    [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7
-; AVX2-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
+; AVX2-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
 ; AVX2-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
 ; AVX2-NEXT:    [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX2-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
+; AVX2-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
 ; AVX2-NEXT:    [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]]
 ; AVX2-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
-; AVX2-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]]
+; AVX2-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512-LABEL: @gather_load_div(
@@ -592,14 +588,39 @@
 ; AVX512-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5
 ; AVX512-NEXT:    [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6
 ; AVX512-NEXT:    [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7
-; AVX512-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
+; AVX512-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
 ; AVX512-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
 ; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX512-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
+; AVX512-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
 ; AVX512-NEXT:    [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]]
 ; AVX512-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
-; AVX512-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]]
+; AVX512-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]]
 ; AVX512-NEXT:    ret void
+;
+; AVX512VL-LABEL: @gather_load_div(
+; AVX512VL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
+; AVX512VL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
+; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
+; AVX512VL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
+; AVX512VL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
+; AVX512VL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
+; AVX512VL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; AVX512VL-NEXT:    [[TMP10:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0
+; AVX512VL-NEXT:    [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1
+; AVX512VL-NEXT:    [[TMP12:%.*]] = insertelement <8 x float*> [[TMP11]], float* [[TMP4]], i32 2
+; AVX512VL-NEXT:    [[TMP13:%.*]] = insertelement <8 x float*> [[TMP12]], float* [[TMP5]], i32 3
+; AVX512VL-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP6]], i32 4
+; AVX512VL-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5
+; AVX512VL-NEXT:    [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6
+; AVX512VL-NEXT:    [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7
+; AVX512VL-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
+; AVX512VL-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]]
+; AVX512VL-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
+; AVX512VL-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    ret void
 ;
   %3 = load float, float* %1, align 4, !tbaa !2
   %4 = getelementptr inbounds float, float* %1, i64 4
Index: llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll
@@ -3,8 +3,8 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX512-SKX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX2-SKX
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver4 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=XOP
 
 @a64 = common global [8 x i64] zeroinitializer, align 64
@@ -109,6 +109,24 @@
 ; AVX512-NEXT:    store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
 ; AVX512-NEXT:    ret void
 ;
+; AVX512-SKX-LABEL: @ashr_v8i64(
+; AVX512-SKX-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-SKX-NEXT:    [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-SKX-NEXT:    [[TMP3:%.*]] = ashr <8 x i64> [[TMP1]], [[TMP2]]
+; AVX512-SKX-NEXT:    store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-SKX-NEXT:    ret void
+;
+; AVX2-SKX-LABEL: @ashr_v8i64(
+; AVX2-SKX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX2-SKX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-SKX-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX2-SKX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-SKX-NEXT:    [[TMP5:%.*]] = ashr <4 x i64> [[TMP1]], [[TMP3]]
+; AVX2-SKX-NEXT:    [[TMP6:%.*]] = ashr <4 x i64> [[TMP2]], [[TMP4]]
+; AVX2-SKX-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX2-SKX-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-SKX-NEXT:    ret void
+;
 ; XOP-LABEL: @ashr_v8i64(
 ; XOP-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
 ; XOP-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
@@ -260,6 +278,24 @@
 ; AVX512-NEXT:    store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
 ; AVX512-NEXT:    ret void
 ;
+; AVX512-SKX-LABEL: @ashr_v16i32(
+; AVX512-SKX-NEXT:    [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; AVX512-SKX-NEXT:    [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; AVX512-SKX-NEXT:    [[TMP3:%.*]] = ashr <16 x i32> [[TMP1]], [[TMP2]]
+; AVX512-SKX-NEXT:    store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; AVX512-SKX-NEXT:    ret void
+;
+; AVX2-SKX-LABEL: @ashr_v16i32(
+; AVX2-SKX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; AVX2-SKX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX2-SKX-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX2-SKX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX2-SKX-NEXT:    [[TMP5:%.*]] = ashr <8 x i32> [[TMP1]], [[TMP3]]
+; AVX2-SKX-NEXT:    [[TMP6:%.*]] = ashr <8 x i32> [[TMP2]], [[TMP4]]
+; AVX2-SKX-NEXT:    store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; AVX2-SKX-NEXT:    store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX2-SKX-NEXT:    ret void
+;
 ; XOP-LABEL: @ashr_v16i32(
 ; XOP-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
 ; XOP-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
@@ -340,146 +376,169 @@
 
 define void @ashr_v32i16() {
 ; SSE-LABEL: @ashr_v32i16(
-; SSE-NEXT:    [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2
-; SSE-NEXT:    [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2
-; SSE-NEXT:    [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2
-; SSE-NEXT:    [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2
-; SSE-NEXT:    [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2
-; SSE-NEXT:    [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2
-; SSE-NEXT:    [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2
-; SSE-NEXT:    [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2
-; SSE-NEXT:    [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2
-; SSE-NEXT:    [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2
-; SSE-NEXT:    [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
-; SSE-NEXT:    [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
-; SSE-NEXT:    [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
-; SSE-NEXT:    [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
-; SSE-NEXT:    [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
-; SSE-NEXT:    [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
-; SSE-NEXT:    [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
-; SSE-NEXT:    [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
-; SSE-NEXT:    [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
-; SSE-NEXT:    [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
-; SSE-NEXT:    [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
-; SSE-NEXT:    [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
-; SSE-NEXT:    [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
-; SSE-NEXT:    [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
-; SSE-NEXT:    [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
-; SSE-NEXT:    [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
-; SSE-NEXT:    [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
-; SSE-NEXT:    [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
-; SSE-NEXT:    [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
-; SSE-NEXT:    [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
-; SSE-NEXT:    [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
-; SSE-NEXT:    [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
-; SSE-NEXT:    [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2
-; SSE-NEXT:    [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2
-; SSE-NEXT:    [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2
-; SSE-NEXT:    [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2
-; SSE-NEXT:    [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2
-; SSE-NEXT:    [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2
-; SSE-NEXT:    [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2
-; SSE-NEXT:    [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2
-; SSE-NEXT:    [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2
-; SSE-NEXT:    [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2
-; SSE-NEXT:    [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
-; SSE-NEXT:    [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
-; SSE-NEXT:    [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
-; SSE-NEXT:    [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
-; SSE-NEXT:    [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
-; SSE-NEXT:    [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
-; SSE-NEXT:    [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
-; SSE-NEXT:    [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
-; SSE-NEXT:    [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
-; SSE-NEXT:    [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
-; SSE-NEXT:    [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
-; SSE-NEXT:    [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
-; SSE-NEXT:    [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
-; SSE-NEXT:    [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
-; SSE-NEXT:    [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
-; SSE-NEXT:    [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
-; SSE-NEXT:    [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
-; SSE-NEXT:    [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
-; SSE-NEXT:    [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
-; SSE-NEXT:    [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
-; SSE-NEXT:    [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
-; SSE-NEXT:    [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
-; SSE-NEXT:    [[R0:%.*]] = ashr i16 [[A0]], [[B0]]
-; SSE-NEXT:    [[R1:%.*]] = ashr i16 [[A1]], [[B1]]
-; SSE-NEXT:    [[R2:%.*]] = ashr i16 [[A2]], [[B2]]
-; SSE-NEXT:    [[R3:%.*]] = ashr i16 [[A3]], [[B3]]
-; SSE-NEXT:    [[R4:%.*]] = ashr i16 [[A4]], [[B4]]
-; SSE-NEXT:    [[R5:%.*]] = ashr i16 [[A5]], [[B5]]
-; SSE-NEXT:    [[R6:%.*]] = ashr i16 [[A6]], [[B6]]
-; SSE-NEXT:    [[R7:%.*]] = ashr i16 [[A7]], [[B7]]
-; SSE-NEXT:    [[R8:%.*]] = ashr i16 [[A8]], [[B8]]
-; SSE-NEXT:    [[R9:%.*]] = ashr i16 [[A9]], [[B9]]
-; SSE-NEXT:    [[R10:%.*]] = ashr i16 [[A10]], [[B10]]
-; SSE-NEXT:    [[R11:%.*]] = ashr i16 [[A11]], [[B11]]
-; SSE-NEXT:    [[R12:%.*]] = ashr i16 [[A12]], [[B12]]
-; SSE-NEXT:    [[R13:%.*]] = ashr i16 [[A13]], [[B13]]
-; SSE-NEXT:    [[R14:%.*]] = ashr i16 [[A14]], [[B14]]
-; SSE-NEXT:    [[R15:%.*]] = ashr i16 [[A15]], [[B15]]
-; SSE-NEXT:    [[R16:%.*]] = ashr i16 [[A16]], [[B16]]
-; SSE-NEXT:    [[R17:%.*]] = ashr i16 [[A17]], [[B17]]
-; SSE-NEXT:    [[R18:%.*]] = ashr i16 [[A18]], [[B18]]
-; SSE-NEXT:    [[R19:%.*]] = ashr i16 [[A19]], [[B19]]
-; SSE-NEXT:    [[R20:%.*]] = ashr i16 [[A20]], [[B20]]
-; SSE-NEXT:    [[R21:%.*]] = ashr i16 [[A21]], [[B21]]
-; SSE-NEXT:    [[R22:%.*]] = ashr i16 [[A22]], [[B22]]
-; SSE-NEXT:    [[R23:%.*]] = ashr i16 [[A23]], [[B23]]
-; SSE-NEXT:    [[R24:%.*]] = ashr i16 [[A24]], [[B24]]
-; SSE-NEXT:    [[R25:%.*]] = ashr i16 [[A25]], [[B25]]
-; SSE-NEXT:    [[R26:%.*]] = ashr i16 [[A26]], [[B26]]
-; SSE-NEXT:    [[R27:%.*]] = ashr i16 [[A27]], [[B27]]
-; SSE-NEXT:    [[R28:%.*]] = ashr i16 [[A28]], [[B28]]
-; SSE-NEXT:    [[R29:%.*]] = ashr i16 [[A29]], [[B29]]
-; SSE-NEXT:    [[R30:%.*]] = ashr i16 [[A30]], [[B30]]
-; SSE-NEXT:    [[R31:%.*]] = ashr i16 [[A31]], [[B31]]
-; SSE-NEXT:    store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2
-; SSE-NEXT:    store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2
-; SSE-NEXT:    store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2
-; SSE-NEXT:    store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2
-; SSE-NEXT:    store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2
-; SSE-NEXT:    store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2
-; SSE-NEXT:    store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2
-; SSE-NEXT:    store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2
-; SSE-NEXT:    store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2
-; SSE-NEXT:    store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2
-; SSE-NEXT:    store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
-; SSE-NEXT:    store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
-; SSE-NEXT:    store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
-; SSE-NEXT:    store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
-; SSE-NEXT:    store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
-; SSE-NEXT:    store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
-; SSE-NEXT:    store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
-; SSE-NEXT:    store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
-; SSE-NEXT:    store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
-; SSE-NEXT:    store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
-; SSE-NEXT:    store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
-; SSE-NEXT:    store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
-; SSE-NEXT:    store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
-; SSE-NEXT:    store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
-; SSE-NEXT:    store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
-; SSE-NEXT:    store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
-; SSE-NEXT:    store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
-; SSE-NEXT:    store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
-; SSE-NEXT:    store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
-; SSE-NEXT:    store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
-; SSE-NEXT:    store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
-; SSE-NEXT:    store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0
+; SSE-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP5]], i32 0
+; SSE-NEXT:    [[R0:%.*]] = ashr i16 [[TMP9]], [[TMP10]]
+; SSE-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1
+; SSE-NEXT:    [[TMP12:%.*]] = extractelement <8 x i16> [[TMP5]], i32 1
+; SSE-NEXT:    [[R1:%.*]] = ashr i16 [[TMP11]], [[TMP12]]
+; SSE-NEXT:    [[TMP13:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2
+; SSE-NEXT:    [[TMP14:%.*]] = extractelement <8 x i16> [[TMP5]], i32 2
+; SSE-NEXT:    [[R2:%.*]] = ashr i16 [[TMP13]], [[TMP14]]
+; SSE-NEXT:    [[TMP15:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3
+; SSE-NEXT:    [[TMP16:%.*]] = extractelement <8 x i16> [[TMP5]], i32 3
+; SSE-NEXT:    [[R3:%.*]] = ashr i16 [[TMP15]], [[TMP16]]
+; SSE-NEXT:    [[TMP17:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4
+; SSE-NEXT:    [[TMP18:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4
+; SSE-NEXT:    [[R4:%.*]] = ashr i16 [[TMP17]], [[TMP18]]
+; SSE-NEXT:    [[TMP19:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5
+; SSE-NEXT:    [[TMP20:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5
+; SSE-NEXT:    [[R5:%.*]] = ashr i16 [[TMP19]], [[TMP20]]
+; SSE-NEXT:    [[TMP21:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6
+; SSE-NEXT:    [[TMP22:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6
+; SSE-NEXT:    [[R6:%.*]] = ashr i16 [[TMP21]], [[TMP22]]
+; SSE-NEXT:    [[TMP23:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+; SSE-NEXT:    [[TMP24:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7
+; SSE-NEXT:    [[R7:%.*]] = ashr i16 [[TMP23]], [[TMP24]]
+; SSE-NEXT:    [[TMP25:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0
+; SSE-NEXT:    [[TMP26:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0
+; SSE-NEXT:    [[R8:%.*]] = ashr i16 [[TMP25]], [[TMP26]]
+; SSE-NEXT:    [[TMP27:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1
+; SSE-NEXT:    [[TMP28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 1
+; SSE-NEXT:    [[R9:%.*]] = ashr i16 [[TMP27]], [[TMP28]]
+; SSE-NEXT:    [[TMP29:%.*]] = extractelement <8 x i16> [[TMP2]], i32 2
+; SSE-NEXT:    [[TMP30:%.*]] = extractelement <8 x i16> [[TMP6]], i32 2
+; SSE-NEXT:    [[R10:%.*]] = ashr i16 [[TMP29]], [[TMP30]]
+; SSE-NEXT:    [[TMP31:%.*]] = extractelement <8 x i16> [[TMP2]], i32 3
+; SSE-NEXT:    [[TMP32:%.*]] = extractelement <8 x i16> [[TMP6]], i32 3
+; SSE-NEXT:    [[R11:%.*]] = ashr i16 [[TMP31]], [[TMP32]]
+; SSE-NEXT:    [[TMP33:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4
+; SSE-NEXT:    [[TMP34:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4
+; SSE-NEXT:    [[R12:%.*]] = ashr i16 [[TMP33]], [[TMP34]]
+; SSE-NEXT:    [[TMP35:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5
+; SSE-NEXT:    [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5
+; SSE-NEXT:    [[R13:%.*]] = ashr i16 [[TMP35]], [[TMP36]]
+; SSE-NEXT:    [[TMP37:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6
+; SSE-NEXT:    [[TMP38:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6
+; SSE-NEXT:    [[R14:%.*]] = ashr i16 [[TMP37]], [[TMP38]]
+; SSE-NEXT:    [[TMP39:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+; SSE-NEXT:    [[TMP40:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7
+; SSE-NEXT:    [[R15:%.*]] = ashr i16 [[TMP39]], [[TMP40]]
+; SSE-NEXT:    [[TMP41:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; SSE-NEXT:    [[TMP42:%.*]] = extractelement <8 x i16> [[TMP7]], i32 0
+; SSE-NEXT:    [[R16:%.*]] = ashr i16 [[TMP41]], [[TMP42]]
+; SSE-NEXT:    [[TMP43:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
+; SSE-NEXT:    [[TMP44:%.*]] = extractelement <8 x i16> [[TMP7]], i32 1
+; SSE-NEXT:    [[R17:%.*]] = ashr i16 [[TMP43]], [[TMP44]]
+; SSE-NEXT:    [[TMP45:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
+; SSE-NEXT:    [[TMP46:%.*]] = extractelement <8 x i16> [[TMP7]], i32 2
+; SSE-NEXT:    [[R18:%.*]] = ashr i16 [[TMP45]], [[TMP46]]
+; SSE-NEXT:    [[TMP47:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
+; SSE-NEXT:    [[TMP48:%.*]] = extractelement <8 x i16> [[TMP7]], i32 3
+; SSE-NEXT:    [[R19:%.*]] = ashr i16 [[TMP47]], [[TMP48]]
+; SSE-NEXT:    [[TMP49:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
+; SSE-NEXT:    [[TMP50:%.*]] = extractelement <8 x i16> [[TMP7]], i32 4
+; SSE-NEXT:    [[R20:%.*]] = ashr i16 [[TMP49]], [[TMP50]]
+; SSE-NEXT:    [[TMP51:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
+; SSE-NEXT:    [[TMP52:%.*]] = extractelement <8 x i16> [[TMP7]], i32 5
+; SSE-NEXT:    [[R21:%.*]] = ashr i16 [[TMP51]], [[TMP52]]
+; SSE-NEXT:    [[TMP53:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
+; SSE-NEXT:    [[TMP54:%.*]] = extractelement <8 x i16> [[TMP7]], i32 6
+; SSE-NEXT:    [[R22:%.*]] = ashr i16 [[TMP53]], [[TMP54]]
+; SSE-NEXT:    [[TMP55:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
+; SSE-NEXT:    [[TMP56:%.*]] = extractelement <8 x i16> [[TMP7]], i32 7
+; SSE-NEXT:    [[R23:%.*]] = ashr i16 [[TMP55]], [[TMP56]]
+; SSE-NEXT:    [[TMP57:%.*]] = extractelement <8 x i16> [[TMP4]], i32 0
+; SSE-NEXT:    [[TMP58:%.*]] = extractelement <8 x i16> [[TMP8]], i32 0
+; SSE-NEXT:    [[R24:%.*]] = ashr i16 [[TMP57]], [[TMP58]]
+; SSE-NEXT:    [[TMP59:%.*]] = extractelement <8 x i16> [[TMP4]], i32 1
+; SSE-NEXT:    [[TMP60:%.*]] = extractelement <8 x i16> [[TMP8]], i32 1
+; SSE-NEXT:    [[R25:%.*]] = ashr i16 [[TMP59]], [[TMP60]]
+; SSE-NEXT:    [[TMP61:%.*]] = extractelement <8 x i16> [[TMP4]], i32 2
+; SSE-NEXT:    [[TMP62:%.*]] = extractelement <8 x i16> [[TMP8]], i32 2
+; SSE-NEXT:    [[R26:%.*]] = ashr i16 [[TMP61]], [[TMP62]]
+; SSE-NEXT:    [[TMP63:%.*]] = extractelement <8 x i16> [[TMP4]], i32 3
+; SSE-NEXT:    [[TMP64:%.*]] = extractelement <8 x i16> [[TMP8]], i32 3
+; SSE-NEXT:    [[R27:%.*]] = ashr i16 [[TMP63]], [[TMP64]]
+; SSE-NEXT:    [[TMP65:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4
+; SSE-NEXT:    [[TMP66:%.*]] = extractelement <8 x i16> [[TMP8]], i32 4
+; SSE-NEXT:    [[R28:%.*]] = ashr i16 [[TMP65]], [[TMP66]]
+; SSE-NEXT:    [[TMP67:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5
+; SSE-NEXT:    [[TMP68:%.*]] = extractelement <8 x i16> [[TMP8]], i32 5
+; SSE-NEXT:    [[R29:%.*]] = ashr i16 [[TMP67]], [[TMP68]]
+; SSE-NEXT:    [[TMP69:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6
+; SSE-NEXT:    [[TMP70:%.*]] = extractelement <8 x i16> [[TMP8]], i32 6
+; SSE-NEXT:    [[R30:%.*]] = ashr i16 [[TMP69]], [[TMP70]]
+; SSE-NEXT:    [[TMP71:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7
+; SSE-NEXT:    [[TMP72:%.*]] = extractelement <8 x i16> [[TMP8]], i32 7
+; SSE-NEXT:    [[R31:%.*]] = ashr i16 [[TMP71]], [[TMP72]]
+; SSE-NEXT:    [[TMP73:%.*]] = insertelement <8 x i16> poison, i16 [[R0]], i32 0
+; SSE-NEXT:    [[TMP74:%.*]] = insertelement <8 x i16> [[TMP73]], i16 [[R1]], i32 1
+; SSE-NEXT:    [[TMP75:%.*]] = insertelement <8 x i16> [[TMP74]], i16 [[R2]], i32 2
+; SSE-NEXT:    [[TMP76:%.*]] = insertelement <8 x i16> [[TMP75]], i16 [[R3]], i32 3
+; SSE-NEXT:    [[TMP77:%.*]] = insertelement <8 x i16> [[TMP76]], i16 [[R4]], i32 4
+; SSE-NEXT:    [[TMP78:%.*]] = insertelement <8 x i16> [[TMP77]], i16 [[R5]], i32 5
+; SSE-NEXT:    [[TMP79:%.*]] = insertelement <8 x i16> [[TMP78]], i16 [[R6]], i32 6
+; SSE-NEXT:    [[TMP80:%.*]] = insertelement <8 x i16> [[TMP79]], i16 [[R7]], i32 7
+; SSE-NEXT:    store <8 x i16> [[TMP80]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP81:%.*]] = insertelement <8 x i16> poison, i16 [[R8]], i32 0
+; SSE-NEXT:    [[TMP82:%.*]] = insertelement <8 x i16> [[TMP81]], i16 [[R9]], i32 1
+; SSE-NEXT:    [[TMP83:%.*]] = insertelement <8 x i16> [[TMP82]], i16 [[R10]], i32 2
+; SSE-NEXT:    [[TMP84:%.*]] = insertelement <8 x i16> [[TMP83]], i16 [[R11]], i32 3
+; SSE-NEXT:    [[TMP85:%.*]] = insertelement <8 x i16> [[TMP84]], i16 [[R12]], i32 4
+; SSE-NEXT:    [[TMP86:%.*]] = insertelement <8 x i16> [[TMP85]], i16 [[R13]], i32 5
+; SSE-NEXT:    [[TMP87:%.*]] = insertelement <8 x i16> [[TMP86]], i16 [[R14]], i32 6
+; SSE-NEXT:    [[TMP88:%.*]] = insertelement <8 x i16> [[TMP87]], i16 [[R15]], i32 7
+; SSE-NEXT:    store <8 x i16> [[TMP88]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP89:%.*]] = insertelement <8 x i16> poison, i16 [[R16]], i32 0
+; SSE-NEXT:    [[TMP90:%.*]] = insertelement <8 x i16> [[TMP89]], i16 [[R17]], i32 1
+; SSE-NEXT:    [[TMP91:%.*]] = insertelement <8 x i16> [[TMP90]], i16 [[R18]], i32 2
+; SSE-NEXT:    [[TMP92:%.*]] = insertelement <8 x i16> [[TMP91]], i16 [[R19]], i32 3
+; SSE-NEXT:    [[TMP93:%.*]] = insertelement <8 x i16> [[TMP92]], i16 [[R20]], i32 4
+; SSE-NEXT:    [[TMP94:%.*]] = insertelement <8 x i16> [[TMP93]], i16 [[R21]], i32 5
+; SSE-NEXT:    [[TMP95:%.*]] = insertelement <8 x i16> [[TMP94]], i16 [[R22]], i32 6
+; SSE-NEXT:    [[TMP96:%.*]] = insertelement <8 x i16> [[TMP95]], i16 [[R23]], i32 7
+; SSE-NEXT:    store <8 x i16> [[TMP96]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP97:%.*]] = insertelement <8 x i16> poison, i16 [[R24]], i32 0
+; SSE-NEXT:    [[TMP98:%.*]] = insertelement <8 x i16> [[TMP97]], i16 [[R25]], i32 1
+; SSE-NEXT:    [[TMP99:%.*]] = insertelement <8 x i16> [[TMP98]], i16 [[R26]], i32 2
+; SSE-NEXT:    [[TMP100:%.*]] = insertelement <8 x i16> [[TMP99]], i16 [[R27]], i32 3
+; SSE-NEXT:    [[TMP101:%.*]] = insertelement <8 x i16> [[TMP100]], i16 [[R28]], i32 4
+; SSE-NEXT:    [[TMP102:%.*]] = insertelement <8 x i16> [[TMP101]], i16 [[R29]], i32 5
+; SSE-NEXT:    [[TMP103:%.*]] = insertelement <8 x i16> [[TMP102]], i16 [[R30]], i32 6
+; SSE-NEXT:    [[TMP104:%.*]] = insertelement <8 x i16> [[TMP103]], i16 [[R31]], i32 7
+; SSE-NEXT:    store <8 x i16> [[TMP104]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
-; AVX-LABEL: @ashr_v32i16(
-; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]]
-; AVX-NEXT:    [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]]
-; AVX-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
-; AVX-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    ret void
+; AVX1-LABEL: @ashr_v32i16(
+; AVX1-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX1-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX1-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX1-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX1-NEXT:    [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]]
+; AVX1-NEXT:    [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]]
+; AVX1-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX1-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX1-NEXT:    ret void
+;
+; AVX2-LABEL: @ashr_v32i16(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX2-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX2-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX2-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX2-NEXT:    [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]]
+; AVX2-NEXT:    [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]]
+; AVX2-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX2-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX2-NEXT:    ret void
 ;
 ; AVX512-LABEL: @ashr_v32i16(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
@@ -488,6 +547,24 @@
 ; AVX512-NEXT:    store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
 ; AVX512-NEXT:    ret void
 ;
+; AVX512-SKX-LABEL: @ashr_v32i16(
+; AVX512-SKX-NEXT:    [[TMP1:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
+; AVX512-SKX-NEXT:    [[TMP2:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
+; AVX512-SKX-NEXT:    [[TMP3:%.*]] = ashr <32 x i16> [[TMP1]], [[TMP2]]
+; AVX512-SKX-NEXT:    store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
+; AVX512-SKX-NEXT:    ret void
+;
+; AVX2-SKX-LABEL: @ashr_v32i16(
+; AVX2-SKX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX2-SKX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX2-SKX-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX2-SKX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX2-SKX-NEXT:    [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]]
+; AVX2-SKX-NEXT:    [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]]
+; AVX2-SKX-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX2-SKX-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX2-SKX-NEXT:    ret void
+;
 ; XOP-LABEL: @ashr_v32i16(
 ; XOP-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
 ; XOP-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
@@ -499,6 +576,16 @@
 ; XOP-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
 ; XOP-NEXT:    ret void
 ;
+; AVX-LABEL: @ashr_v32i16(
+; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    ret void
   %a0  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
   %a1  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
   %a2  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
@@ -650,16 +737,27 @@
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SSE-NEXT:    ret void
 ;
-; AVX-LABEL: @ashr_v64i8(
-; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP5:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP3]]
-; AVX-NEXT:    [[TMP6:%.*]] = ashr <32 x i8> [[TMP2]], [[TMP4]]
-; AVX-NEXT:    store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
-; AVX-NEXT:    store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    ret void
+; AVX1-LABEL: @ashr_v64i8(
+; AVX1-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
+; AVX1-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX1-NEXT:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; AVX1-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX1-NEXT:    [[TMP5:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP3]]
+; AVX1-NEXT:    [[TMP6:%.*]] = ashr <32 x i8> [[TMP2]], [[TMP4]]
+; AVX1-NEXT:    store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
+; AVX1-NEXT:    store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX1-NEXT:    ret void
+;
+; AVX2-LABEL: @ashr_v64i8(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX2-NEXT:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; AVX2-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX2-NEXT:    [[TMP5:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP3]]
+; AVX2-NEXT:    [[TMP6:%.*]] = ashr <32 x i8> [[TMP2]], [[TMP4]]
+; AVX2-NEXT:    store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
+; AVX2-NEXT:    store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX2-NEXT:    ret void
 ;
 ; AVX512-LABEL: @ashr_v64i8(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @a8 to <64 x i8>*), align 1
@@ -668,6 +766,24 @@
 ; AVX512-NEXT:    store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
 ; AVX512-NEXT:    ret void
 ;
+; AVX512-SKX-LABEL: @ashr_v64i8(
+; AVX512-SKX-NEXT:    [[TMP1:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @a8 to <64 x i8>*), align 1
+; AVX512-SKX-NEXT:    [[TMP2:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @b8 to <64 x i8>*), align 1
+; AVX512-SKX-NEXT:    [[TMP3:%.*]] = ashr <64 x i8> [[TMP1]], [[TMP2]]
+; AVX512-SKX-NEXT:    store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
+; AVX512-SKX-NEXT:    ret void
+;
+; AVX2-SKX-LABEL: @ashr_v64i8(
+; AVX2-SKX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
+; AVX2-SKX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX2-SKX-NEXT:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; AVX2-SKX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX2-SKX-NEXT:    [[TMP5:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP3]]
+; AVX2-SKX-NEXT:    [[TMP6:%.*]] = ashr <32 x i8> [[TMP2]], [[TMP4]]
+; AVX2-SKX-NEXT:    store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
+; AVX2-SKX-NEXT:    store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX2-SKX-NEXT:    ret void
+;
 ; XOP-LABEL: @ashr_v64i8(
 ; XOP-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
 ; XOP-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
@@ -679,6 +795,16 @@
 ; XOP-NEXT:    store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
 ; XOP-NEXT:    ret void
 ;
+; AVX-LABEL: @ashr_v64i8(
+; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP5:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = ashr <32 x i8> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
+; AVX-NEXT:    store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    ret void
   %a0  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
   %a1  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
   %a2  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
Index: llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll
@@ -3,8 +3,8 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX512-SKX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX --check-prefix=AVX2-SKX
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver4 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=XOP
 
 @a64 = common global [8 x i64] zeroinitializer, align 64
@@ -77,6 +77,24 @@
 ; AVX512-NEXT:    store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
 ; AVX512-NEXT:    ret void
 ;
+; AVX512-SKX-LABEL: @lshr_v8i64(
+; AVX512-SKX-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-SKX-NEXT:    [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-SKX-NEXT:    [[TMP3:%.*]] = lshr <8 x i64> [[TMP1]], [[TMP2]]
+; AVX512-SKX-NEXT:    store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-SKX-NEXT:    ret void
+;
+; AVX2-SKX-LABEL: @lshr_v8i64(
+; AVX2-SKX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX2-SKX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-SKX-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX2-SKX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-SKX-NEXT:    [[TMP5:%.*]] = lshr <4 x i64> [[TMP1]], [[TMP3]]
+; AVX2-SKX-NEXT:    [[TMP6:%.*]] = lshr <4 x i64> [[TMP2]], [[TMP4]]
+; AVX2-SKX-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX2-SKX-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-SKX-NEXT:    ret void
+;
 ; XOP-LABEL: @lshr_v8i64(
 ; XOP-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
 ; XOP-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
@@ -209,6 +227,13 @@
 ; AVX512-NEXT:    store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
 ; AVX512-NEXT:    ret void
 ;
+; AVX512-SKX-LABEL: @lshr_v16i32(
+; AVX512-SKX-NEXT:    [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; AVX512-SKX-NEXT:    [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; AVX512-SKX-NEXT:    [[TMP3:%.*]] = lshr <16 x i32> [[TMP1]], [[TMP2]]
+; AVX512-SKX-NEXT:    store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; AVX512-SKX-NEXT:    ret void
+;
 ; XOP-LABEL: @lshr_v16i32(
 ; XOP-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
 ; XOP-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
@@ -289,134 +314,146 @@
 
 define void @lshr_v32i16() {
 ; SSE-LABEL: @lshr_v32i16(
-; SSE-NEXT:    [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2
-; SSE-NEXT:    [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2
-; SSE-NEXT:    [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2
-; SSE-NEXT:    [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2
-; SSE-NEXT:    [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2
-; SSE-NEXT:    [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2
-; SSE-NEXT:    [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2
-; SSE-NEXT:    [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2
-; SSE-NEXT:    [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2
-; SSE-NEXT:    [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2
-; SSE-NEXT:    [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
-; SSE-NEXT:    [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
-; SSE-NEXT:    [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
-; SSE-NEXT:    [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
-; SSE-NEXT:    [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
-; SSE-NEXT:    [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
-; SSE-NEXT:    [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
-; SSE-NEXT:    [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
-; SSE-NEXT:    [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
-; SSE-NEXT:    [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
-; SSE-NEXT:    [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
-; SSE-NEXT:    [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
-; SSE-NEXT:    [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
-; SSE-NEXT:    [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
-; SSE-NEXT:    [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
-; SSE-NEXT:    [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
-; SSE-NEXT:    [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
-; SSE-NEXT:    [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
-; SSE-NEXT:    [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
-; SSE-NEXT:    [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
-; SSE-NEXT:    [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
-; SSE-NEXT:    [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
-; SSE-NEXT:    [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2
-; SSE-NEXT:    [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2
-; SSE-NEXT:    [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2
-; SSE-NEXT:    [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2
-; SSE-NEXT:    [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2
-; SSE-NEXT:    [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2
-; SSE-NEXT:    [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2
-; SSE-NEXT:    [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2
-; SSE-NEXT:    [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2
-; SSE-NEXT:    [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2
-; SSE-NEXT:    [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
-; SSE-NEXT:    [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
-; SSE-NEXT:    [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
-; SSE-NEXT:    [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
-; SSE-NEXT:    [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
-; SSE-NEXT:    [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
-; SSE-NEXT:    [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
-; SSE-NEXT:    [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
-; SSE-NEXT:    [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
-; SSE-NEXT:    [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
-; SSE-NEXT:    [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
-; SSE-NEXT:    [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
-; SSE-NEXT:    [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
-; SSE-NEXT:    [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
-; SSE-NEXT:    [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
-; SSE-NEXT:    [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
-; SSE-NEXT:    [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
-; SSE-NEXT:    [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
-; SSE-NEXT:    [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
-; SSE-NEXT:    [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
-; SSE-NEXT:    [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
-; SSE-NEXT:    [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
-; SSE-NEXT:    [[R0:%.*]] = lshr i16 [[A0]], [[B0]]
-; SSE-NEXT:    [[R1:%.*]] = lshr i16 [[A1]], [[B1]]
-; SSE-NEXT:    [[R2:%.*]] = lshr i16 [[A2]], [[B2]]
-; SSE-NEXT:    [[R3:%.*]] = lshr i16 [[A3]], [[B3]]
-; SSE-NEXT:    [[R4:%.*]] = lshr i16 [[A4]], [[B4]]
-; SSE-NEXT:    [[R5:%.*]] = lshr i16 [[A5]], [[B5]]
-; SSE-NEXT:    [[R6:%.*]] = lshr i16 [[A6]], [[B6]]
-; SSE-NEXT:    [[R7:%.*]] = lshr i16 [[A7]], [[B7]]
-; SSE-NEXT:    [[R8:%.*]] = lshr i16 [[A8]], [[B8]]
-; SSE-NEXT:    [[R9:%.*]] = lshr i16 [[A9]], [[B9]]
-; SSE-NEXT:    [[R10:%.*]] = lshr i16 [[A10]], [[B10]]
-; SSE-NEXT:    [[R11:%.*]] = lshr i16 [[A11]], [[B11]]
-; SSE-NEXT:    [[R12:%.*]] = lshr i16 [[A12]], [[B12]]
-; SSE-NEXT:    [[R13:%.*]] = lshr i16 [[A13]], [[B13]]
-; SSE-NEXT:    [[R14:%.*]] = lshr i16 [[A14]], [[B14]]
-; SSE-NEXT:    [[R15:%.*]] = lshr i16 [[A15]], [[B15]]
-; SSE-NEXT:    [[R16:%.*]] = lshr i16 [[A16]], [[B16]]
-; SSE-NEXT:    [[R17:%.*]] = lshr i16 [[A17]], [[B17]]
-; SSE-NEXT:    [[R18:%.*]] = lshr i16 [[A18]], [[B18]]
-; SSE-NEXT:    [[R19:%.*]] = lshr i16 [[A19]], [[B19]]
-; SSE-NEXT:    [[R20:%.*]] = lshr i16 [[A20]], [[B20]]
-; SSE-NEXT:    [[R21:%.*]] = lshr i16 [[A21]], [[B21]]
-; SSE-NEXT:    [[R22:%.*]] = lshr i16 [[A22]], [[B22]]
-; SSE-NEXT:    [[R23:%.*]] = lshr i16 [[A23]], [[B23]]
-; SSE-NEXT:    [[R24:%.*]] = lshr i16 [[A24]], [[B24]]
-; SSE-NEXT:    [[R25:%.*]] = lshr i16 [[A25]], [[B25]]
-; SSE-NEXT:    [[R26:%.*]] = lshr i16 [[A26]], [[B26]]
-; SSE-NEXT:    [[R27:%.*]] = lshr i16 [[A27]], [[B27]]
-; SSE-NEXT:    [[R28:%.*]] = lshr i16 [[A28]], [[B28]]
-; SSE-NEXT:    [[R29:%.*]] = lshr i16 [[A29]], [[B29]]
-; SSE-NEXT:    [[R30:%.*]] = lshr i16 [[A30]], [[B30]]
-; SSE-NEXT:    [[R31:%.*]] = lshr i16 [[A31]], [[B31]]
-; SSE-NEXT:    store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2
-; SSE-NEXT:    store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2
-; SSE-NEXT:    store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2
-; SSE-NEXT:    store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2
-; SSE-NEXT:    store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2
-; SSE-NEXT:    store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2
-; SSE-NEXT:    store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2
-; SSE-NEXT:    store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2
-; SSE-NEXT:    store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2
-; SSE-NEXT:    store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2
-; SSE-NEXT:    store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
-; SSE-NEXT:    store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
-; SSE-NEXT:    store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
-; SSE-NEXT:    store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
-; SSE-NEXT:    store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
-; SSE-NEXT:    store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
-; SSE-NEXT:    store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
-; SSE-NEXT:    store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
-; SSE-NEXT:    store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
-; SSE-NEXT:    store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
-; SSE-NEXT:    store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
-; SSE-NEXT:    store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
-; SSE-NEXT:    store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
-; SSE-NEXT:    store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
-; SSE-NEXT:    store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
-; SSE-NEXT:    store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
-; SSE-NEXT:    store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
-; SSE-NEXT:    store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
-; SSE-NEXT:    store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
-; SSE-NEXT:    store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
-; SSE-NEXT:    store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
-; SSE-NEXT:    store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0
+; SSE-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP5]], i32 0
+; SSE-NEXT:    [[R0:%.*]] = lshr i16 [[TMP9]], [[TMP10]]
+; SSE-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1
+; SSE-NEXT:    [[TMP12:%.*]] = extractelement <8 x i16> [[TMP5]], i32 1
+; SSE-NEXT:    [[R1:%.*]] = lshr i16 [[TMP11]], [[TMP12]]
+; SSE-NEXT:    [[TMP13:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2
+; SSE-NEXT:    [[TMP14:%.*]] = extractelement <8 x i16> [[TMP5]], i32 2
+; SSE-NEXT:    [[R2:%.*]] = lshr i16 [[TMP13]], [[TMP14]]
+; SSE-NEXT:    [[TMP15:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3
+; SSE-NEXT:    [[TMP16:%.*]] = extractelement <8 x i16> [[TMP5]], i32 3
+; SSE-NEXT:    [[R3:%.*]] = lshr i16 [[TMP15]], [[TMP16]]
+; SSE-NEXT:    [[TMP17:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4
+; SSE-NEXT:    [[TMP18:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4
+; SSE-NEXT:    [[R4:%.*]] = lshr i16 [[TMP17]], [[TMP18]]
+; SSE-NEXT:    [[TMP19:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5
+; SSE-NEXT:    [[TMP20:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5
+; SSE-NEXT:    [[R5:%.*]] = lshr i16 [[TMP19]], [[TMP20]]
+; SSE-NEXT:    [[TMP21:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6
+; SSE-NEXT:    [[TMP22:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6
+; SSE-NEXT:    [[R6:%.*]] = lshr i16 [[TMP21]], [[TMP22]]
+; SSE-NEXT:    [[TMP23:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+; SSE-NEXT:    [[TMP24:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7
+; SSE-NEXT:    [[R7:%.*]] = lshr i16 [[TMP23]], [[TMP24]]
+; SSE-NEXT:    [[TMP25:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0
+; SSE-NEXT:    [[TMP26:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0
+; SSE-NEXT:    [[R8:%.*]] = lshr i16 [[TMP25]], [[TMP26]]
+; SSE-NEXT:    [[TMP27:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1
+; SSE-NEXT:    [[TMP28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 1
+; SSE-NEXT:    [[R9:%.*]] = lshr i16 [[TMP27]], [[TMP28]]
+; SSE-NEXT:    [[TMP29:%.*]] = extractelement <8 x i16> [[TMP2]], i32 2
+; SSE-NEXT:    [[TMP30:%.*]] = extractelement <8 x i16> [[TMP6]], i32 2
+; SSE-NEXT:    [[R10:%.*]] = lshr i16 [[TMP29]], [[TMP30]]
+; SSE-NEXT:    [[TMP31:%.*]] = extractelement <8 x i16> [[TMP2]], i32 3
+; SSE-NEXT:    [[TMP32:%.*]] = extractelement <8 x i16> [[TMP6]], i32 3
+; SSE-NEXT:    [[R11:%.*]] = lshr i16 [[TMP31]], [[TMP32]]
+; SSE-NEXT:    [[TMP33:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4
+; SSE-NEXT:    [[TMP34:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4
+; SSE-NEXT:    [[R12:%.*]] = lshr i16 [[TMP33]], [[TMP34]]
+; SSE-NEXT:    [[TMP35:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5
+; SSE-NEXT:    [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5
+; SSE-NEXT:    [[R13:%.*]] = lshr i16 [[TMP35]], [[TMP36]]
+; SSE-NEXT:    [[TMP37:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6
+; SSE-NEXT:    [[TMP38:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6
+; SSE-NEXT:    [[R14:%.*]] = lshr i16 [[TMP37]], [[TMP38]]
+; SSE-NEXT:    [[TMP39:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+; SSE-NEXT:    [[TMP40:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7
+; SSE-NEXT:    [[R15:%.*]] = lshr i16 [[TMP39]], [[TMP40]]
+; SSE-NEXT:    [[TMP41:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; SSE-NEXT:    [[TMP42:%.*]] = extractelement <8 x i16> [[TMP7]], i32 0
+; SSE-NEXT:    [[R16:%.*]] = lshr i16 [[TMP41]], [[TMP42]]
+; SSE-NEXT:    [[TMP43:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
+; SSE-NEXT:    [[TMP44:%.*]] = extractelement <8 x i16> [[TMP7]], i32 1
+; SSE-NEXT:    [[R17:%.*]] = lshr i16 [[TMP43]], [[TMP44]]
+; SSE-NEXT:    [[TMP45:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
+; SSE-NEXT:    [[TMP46:%.*]] = extractelement <8 x i16> [[TMP7]], i32 2
+; SSE-NEXT:    [[R18:%.*]] = lshr i16 [[TMP45]], [[TMP46]]
+; SSE-NEXT:    [[TMP47:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
+; SSE-NEXT:    [[TMP48:%.*]] = extractelement <8 x i16> [[TMP7]], i32 3
+; SSE-NEXT:    [[R19:%.*]] = lshr i16 [[TMP47]], [[TMP48]]
+; SSE-NEXT:    [[TMP49:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
+; SSE-NEXT:    [[TMP50:%.*]] = extractelement <8 x i16> [[TMP7]], i32 4
+; SSE-NEXT:    [[R20:%.*]] = lshr i16 [[TMP49]], [[TMP50]]
+; SSE-NEXT:    [[TMP51:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
+; SSE-NEXT:    [[TMP52:%.*]] = extractelement <8 x i16> [[TMP7]], i32 5
+; SSE-NEXT:    [[R21:%.*]] = lshr i16 [[TMP51]], [[TMP52]]
+; SSE-NEXT:    [[TMP53:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
+; SSE-NEXT:    [[TMP54:%.*]] = extractelement <8 x i16> [[TMP7]], i32 6
+; SSE-NEXT:    [[R22:%.*]] = lshr i16 [[TMP53]], [[TMP54]]
+; SSE-NEXT:    [[TMP55:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
+; SSE-NEXT:    [[TMP56:%.*]] = extractelement <8 x i16> [[TMP7]], i32 7
+; SSE-NEXT:    [[R23:%.*]] = lshr i16 [[TMP55]], [[TMP56]]
+; SSE-NEXT:    [[TMP57:%.*]] = extractelement <8 x i16> [[TMP4]], i32 0
+; SSE-NEXT:    [[TMP58:%.*]] = extractelement <8 x i16> [[TMP8]], i32 0
+; SSE-NEXT:    [[R24:%.*]] = lshr i16 [[TMP57]], [[TMP58]]
+; SSE-NEXT:    [[TMP59:%.*]] = extractelement <8 x i16> [[TMP4]], i32 1
+; SSE-NEXT:    [[TMP60:%.*]] = extractelement <8 x i16> [[TMP8]], i32 1
+; SSE-NEXT:    [[R25:%.*]] = lshr i16 [[TMP59]], [[TMP60]]
+; SSE-NEXT:    [[TMP61:%.*]] = extractelement <8 x i16> [[TMP4]], i32 2
+; SSE-NEXT:    [[TMP62:%.*]] = extractelement <8 x i16> [[TMP8]], i32 2
+; SSE-NEXT:    [[R26:%.*]] = lshr i16 [[TMP61]], [[TMP62]]
+; SSE-NEXT:    [[TMP63:%.*]] = extractelement <8 x i16> [[TMP4]], i32 3
+; SSE-NEXT:    [[TMP64:%.*]] = extractelement <8 x i16> [[TMP8]], i32 3
+; SSE-NEXT:    [[R27:%.*]] = lshr i16 [[TMP63]], [[TMP64]]
+; SSE-NEXT:    [[TMP65:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4
+; SSE-NEXT:    [[TMP66:%.*]] = extractelement <8 x i16> [[TMP8]], i32 4
+; SSE-NEXT:    [[R28:%.*]] = lshr i16 [[TMP65]], [[TMP66]]
+; SSE-NEXT:    [[TMP67:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5
+; SSE-NEXT:    [[TMP68:%.*]] = extractelement <8 x i16> [[TMP8]], i32 5
+; SSE-NEXT:    [[R29:%.*]] = lshr i16 [[TMP67]], [[TMP68]]
+; SSE-NEXT:    [[TMP69:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6
+; SSE-NEXT:    [[TMP70:%.*]] = extractelement <8 x i16> [[TMP8]], i32 6
+; SSE-NEXT:    [[R30:%.*]] = lshr i16 [[TMP69]], [[TMP70]]
+; SSE-NEXT:    [[TMP71:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7
+; SSE-NEXT:    [[TMP72:%.*]] = extractelement <8 x i16> [[TMP8]], i32 7
+; SSE-NEXT:    [[R31:%.*]] = lshr i16 [[TMP71]], [[TMP72]]
+; SSE-NEXT:    [[TMP73:%.*]] = insertelement <8 x i16> poison, i16 [[R0]], i32 0
+; SSE-NEXT:    [[TMP74:%.*]] = insertelement <8 x i16> [[TMP73]], i16 [[R1]], i32 1
+; SSE-NEXT:    [[TMP75:%.*]] = insertelement <8 x i16> [[TMP74]], i16 [[R2]], i32 2
+; SSE-NEXT:    [[TMP76:%.*]] = insertelement <8 x i16> [[TMP75]], i16 [[R3]], i32 3
+; SSE-NEXT:    [[TMP77:%.*]] = insertelement <8 x i16> [[TMP76]], i16 [[R4]], i32 4
+; SSE-NEXT:    [[TMP78:%.*]] = insertelement <8 x i16> [[TMP77]], i16 [[R5]], i32 5
+; SSE-NEXT:    [[TMP79:%.*]] = insertelement <8 x i16> [[TMP78]], i16 [[R6]], i32 6
+; SSE-NEXT:    [[TMP80:%.*]] = insertelement <8 x i16> [[TMP79]], i16 [[R7]], i32 7
+; SSE-NEXT:    store <8 x i16> [[TMP80]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP81:%.*]] = insertelement <8 x i16> poison, i16 [[R8]], i32 0
+; SSE-NEXT:    [[TMP82:%.*]] = insertelement <8 x i16> [[TMP81]], i16 [[R9]], i32 1
+; SSE-NEXT:    [[TMP83:%.*]] = insertelement <8 x i16> [[TMP82]], i16 [[R10]], i32 2
+; SSE-NEXT:    [[TMP84:%.*]] = insertelement <8 x i16> [[TMP83]], i16 [[R11]], i32 3
+; SSE-NEXT:    [[TMP85:%.*]] = insertelement <8 x i16> [[TMP84]], i16 [[R12]], i32 4
+; SSE-NEXT:    [[TMP86:%.*]] = insertelement <8 x i16> [[TMP85]], i16 [[R13]], i32 5
+; SSE-NEXT:    [[TMP87:%.*]] = insertelement <8 x i16> [[TMP86]], i16 [[R14]], i32 6
+; SSE-NEXT:    [[TMP88:%.*]] = insertelement <8 x i16> [[TMP87]], i16 [[R15]], i32 7
+; SSE-NEXT:    store <8 x i16> [[TMP88]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP89:%.*]] = insertelement <8 x i16> poison, i16 [[R16]], i32 0
+; SSE-NEXT:    [[TMP90:%.*]] = insertelement <8 x i16> [[TMP89]], i16 [[R17]], i32 1
+; SSE-NEXT:    [[TMP91:%.*]] = insertelement <8 x i16> [[TMP90]], i16 [[R18]], i32 2
+; SSE-NEXT:    [[TMP92:%.*]] = insertelement <8 x i16> [[TMP91]], i16 [[R19]], i32 3
+; SSE-NEXT:    [[TMP93:%.*]] = insertelement <8 x i16> [[TMP92]], i16 [[R20]], i32 4
+; SSE-NEXT:    [[TMP94:%.*]] = insertelement <8 x i16> [[TMP93]], i16 [[R21]], i32 5
+; SSE-NEXT:    [[TMP95:%.*]] = insertelement <8 x i16> [[TMP94]], i16 [[R22]], i32 6
+; SSE-NEXT:    [[TMP96:%.*]] = insertelement <8 x i16> [[TMP95]], i16 [[R23]], i32 7
+; SSE-NEXT:    store <8 x i16> [[TMP96]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP97:%.*]] = insertelement <8 x i16> poison, i16 [[R24]], i32 0
+; SSE-NEXT:    [[TMP98:%.*]] = insertelement <8 x i16> [[TMP97]], i16 [[R25]], i32 1
+; SSE-NEXT:    [[TMP99:%.*]] = insertelement <8 x i16> [[TMP98]], i16 [[R26]], i32 2
+; SSE-NEXT:    [[TMP100:%.*]] = insertelement <8 x i16> [[TMP99]], i16 [[R27]], i32 3
+; SSE-NEXT:    [[TMP101:%.*]] = insertelement <8 x i16> [[TMP100]], i16 [[R28]], i32 4
+; SSE-NEXT:    [[TMP102:%.*]] = insertelement <8 x i16> [[TMP101]], i16 [[R29]], i32 5
+; SSE-NEXT:    [[TMP103:%.*]] = insertelement <8 x i16> [[TMP102]], i16 [[R30]], i32 6
+; SSE-NEXT:    [[TMP104:%.*]] = insertelement <8 x i16> [[TMP103]], i16 [[R31]], i32 7
+; SSE-NEXT:    store <8 x i16> [[TMP104]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @lshr_v32i16(
@@ -437,6 +474,13 @@
 ; AVX512-NEXT:    store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
 ; AVX512-NEXT:    ret void
 ;
+; AVX512-SKX-LABEL: @lshr_v32i16(
+; AVX512-SKX-NEXT:    [[TMP1:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
+; AVX512-SKX-NEXT:    [[TMP2:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
+; AVX512-SKX-NEXT:    [[TMP3:%.*]] = lshr <32 x i16> [[TMP1]], [[TMP2]]
+; AVX512-SKX-NEXT:    store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
+; AVX512-SKX-NEXT:    ret void
+;
 ; XOP-LABEL: @lshr_v32i16(
 ; XOP-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
 ; XOP-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
@@ -617,6 +661,13 @@
 ; AVX512-NEXT:    store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
 ; AVX512-NEXT:    ret void
 ;
+; AVX512-SKX-LABEL: @lshr_v64i8(
+; AVX512-SKX-NEXT:    [[TMP1:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @a8 to <64 x i8>*), align 1
+; AVX512-SKX-NEXT:    [[TMP2:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @b8 to <64 x i8>*), align 1
+; AVX512-SKX-NEXT:    [[TMP3:%.*]] = lshr <64 x i8> [[TMP1]], [[TMP2]]
+; AVX512-SKX-NEXT:    store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
+; AVX512-SKX-NEXT:    ret void
+;
 ; XOP-LABEL: @lshr_v64i8(
 ; XOP-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
 ; XOP-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
Index: llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll
@@ -3,8 +3,8 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX512-SKX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX --check-prefix=AVX2-SKX
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver4 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=XOP
 
 @a64 = common global [8 x i64] zeroinitializer, align 64
@@ -77,6 +77,24 @@
 ; AVX512-NEXT:    store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
 ; AVX512-NEXT:    ret void
 ;
+; AVX512-SKX-LABEL: @shl_v8i64(
+; AVX512-SKX-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-SKX-NEXT:    [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-SKX-NEXT:    [[TMP3:%.*]] = shl <8 x i64> [[TMP1]], [[TMP2]]
+; AVX512-SKX-NEXT:    store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-SKX-NEXT:    ret void
+;
+; AVX2-SKX-LABEL: @shl_v8i64(
+; AVX2-SKX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX2-SKX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-SKX-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX2-SKX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-SKX-NEXT:    [[TMP5:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]]
+; AVX2-SKX-NEXT:    [[TMP6:%.*]] = shl <4 x i64> [[TMP2]], [[TMP4]]
+; AVX2-SKX-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX2-SKX-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-SKX-NEXT:    ret void
+;
 ; XOP-LABEL: @shl_v8i64(
 ; XOP-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
 ; XOP-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
@@ -161,6 +179,13 @@
 ; AVX512-NEXT:    store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
 ; AVX512-NEXT:    ret void
 ;
+; AVX512-SKX-LABEL: @shl_v16i32(
+; AVX512-SKX-NEXT:    [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; AVX512-SKX-NEXT:    [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; AVX512-SKX-NEXT:    [[TMP3:%.*]] = shl <16 x i32> [[TMP1]], [[TMP2]]
+; AVX512-SKX-NEXT:    store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; AVX512-SKX-NEXT:    ret void
+;
 ; XOP-LABEL: @shl_v16i32(
 ; XOP-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
 ; XOP-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
@@ -241,134 +266,146 @@
 
 define void @shl_v32i16() {
 ; SSE-LABEL: @shl_v32i16(
-; SSE-NEXT:    [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2
-; SSE-NEXT:    [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2
-; SSE-NEXT:    [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2
-; SSE-NEXT:    [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2
-; SSE-NEXT:    [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2
-; SSE-NEXT:    [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2
-; SSE-NEXT:    [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2
-; SSE-NEXT:    [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2
-; SSE-NEXT:    [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2
-; SSE-NEXT:    [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2
-; SSE-NEXT:    [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
-; SSE-NEXT:    [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
-; SSE-NEXT:    [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
-; SSE-NEXT:    [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
-; SSE-NEXT:    [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
-; SSE-NEXT:    [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
-; SSE-NEXT:    [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
-; SSE-NEXT:    [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
-; SSE-NEXT:    [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
-; SSE-NEXT:    [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
-; SSE-NEXT:    [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
-; SSE-NEXT:    [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
-; SSE-NEXT:    [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
-; SSE-NEXT:    [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
-; SSE-NEXT:    [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
-; SSE-NEXT:    [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
-; SSE-NEXT:    [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
-; SSE-NEXT:    [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
-; SSE-NEXT:    [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
-; SSE-NEXT:    [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
-; SSE-NEXT:    [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
-; SSE-NEXT:    [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
-; SSE-NEXT:    [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2
-; SSE-NEXT:    [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2
-; SSE-NEXT:    [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2
-; SSE-NEXT:    [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2
-; SSE-NEXT:    [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2
-; SSE-NEXT:    [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2
-; SSE-NEXT:    [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2
-; SSE-NEXT:    [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2
-; SSE-NEXT:    [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2
-; SSE-NEXT:    [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2
-; SSE-NEXT:    [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
-; SSE-NEXT:    [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
-; SSE-NEXT:    [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
-; SSE-NEXT:    [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
-; SSE-NEXT:    [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
-; SSE-NEXT:    [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
-; SSE-NEXT:    [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
-; SSE-NEXT:    [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
-; SSE-NEXT:    [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
-; SSE-NEXT:    [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
-; SSE-NEXT:    [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
-; SSE-NEXT:    [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
-; SSE-NEXT:    [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
-; SSE-NEXT:    [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
-; SSE-NEXT:    [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
-; SSE-NEXT:    [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
-; SSE-NEXT:    [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
-; SSE-NEXT:    [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
-; SSE-NEXT:    [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
-; SSE-NEXT:    [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
-; SSE-NEXT:    [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
-; SSE-NEXT:    [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
-; SSE-NEXT:    [[R0:%.*]] = shl i16 [[A0]], [[B0]]
-; SSE-NEXT:    [[R1:%.*]] = shl i16 [[A1]], [[B1]]
-; SSE-NEXT:    [[R2:%.*]] = shl i16 [[A2]], [[B2]]
-; SSE-NEXT:    [[R3:%.*]] = shl i16 [[A3]], [[B3]]
-; SSE-NEXT:    [[R4:%.*]] = shl i16 [[A4]], [[B4]]
-; SSE-NEXT:    [[R5:%.*]] = shl i16 [[A5]], [[B5]]
-; SSE-NEXT:    [[R6:%.*]] = shl i16 [[A6]], [[B6]]
-; SSE-NEXT:    [[R7:%.*]] = shl i16 [[A7]], [[B7]]
-; SSE-NEXT:    [[R8:%.*]] = shl i16 [[A8]], [[B8]]
-; SSE-NEXT:    [[R9:%.*]] = shl i16 [[A9]], [[B9]]
-; SSE-NEXT:    [[R10:%.*]] = shl i16 [[A10]], [[B10]]
-; SSE-NEXT:    [[R11:%.*]] = shl i16 [[A11]], [[B11]]
-; SSE-NEXT:    [[R12:%.*]] = shl i16 [[A12]], [[B12]]
-; SSE-NEXT:    [[R13:%.*]] = shl i16 [[A13]], [[B13]]
-; SSE-NEXT:    [[R14:%.*]] = shl i16 [[A14]], [[B14]]
-; SSE-NEXT:    [[R15:%.*]] = shl i16 [[A15]], [[B15]]
-; SSE-NEXT:    [[R16:%.*]] = shl i16 [[A16]], [[B16]]
-; SSE-NEXT:    [[R17:%.*]] = shl i16 [[A17]], [[B17]]
-; SSE-NEXT:    [[R18:%.*]] = shl i16 [[A18]], [[B18]]
-; SSE-NEXT:    [[R19:%.*]] = shl i16 [[A19]], [[B19]]
-; SSE-NEXT:    [[R20:%.*]] = shl i16 [[A20]], [[B20]]
-; SSE-NEXT:    [[R21:%.*]] = shl i16 [[A21]], [[B21]]
-; SSE-NEXT:    [[R22:%.*]] = shl i16 [[A22]], [[B22]]
-; SSE-NEXT:    [[R23:%.*]] = shl i16 [[A23]], [[B23]]
-; SSE-NEXT:    [[R24:%.*]] = shl i16 [[A24]], [[B24]]
-; SSE-NEXT:    [[R25:%.*]] = shl i16 [[A25]], [[B25]]
-; SSE-NEXT:    [[R26:%.*]] = shl i16 [[A26]], [[B26]]
-; SSE-NEXT:    [[R27:%.*]] = shl i16 [[A27]], [[B27]]
-; SSE-NEXT:    [[R28:%.*]] = shl i16 [[A28]], [[B28]]
-; SSE-NEXT:    [[R29:%.*]] = shl i16 [[A29]], [[B29]]
-; SSE-NEXT:    [[R30:%.*]] = shl i16 [[A30]], [[B30]]
-; SSE-NEXT:    [[R31:%.*]] = shl i16 [[A31]], [[B31]]
-; SSE-NEXT:    store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2
-; SSE-NEXT:    store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2
-; SSE-NEXT:    store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2
-; SSE-NEXT:    store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2
-; SSE-NEXT:    store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2
-; SSE-NEXT:    store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2
-; SSE-NEXT:    store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2
-; SSE-NEXT:    store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2
-; SSE-NEXT:    store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2
-; SSE-NEXT:    store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2
-; SSE-NEXT:    store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
-; SSE-NEXT:    store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
-; SSE-NEXT:    store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
-; SSE-NEXT:    store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
-; SSE-NEXT:    store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
-; SSE-NEXT:    store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
-; SSE-NEXT:    store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
-; SSE-NEXT:    store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
-; SSE-NEXT:    store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
-; SSE-NEXT:    store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
-; SSE-NEXT:    store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
-; SSE-NEXT:    store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
-; SSE-NEXT:    store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
-; SSE-NEXT:    store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
-; SSE-NEXT:    store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
-; SSE-NEXT:    store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
-; SSE-NEXT:    store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
-; SSE-NEXT:    store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
-; SSE-NEXT:    store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
-; SSE-NEXT:    store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
-; SSE-NEXT:    store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
-; SSE-NEXT:    store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0
+; SSE-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP5]], i32 0
+; SSE-NEXT:    [[R0:%.*]] = shl i16 [[TMP9]], [[TMP10]]
+; SSE-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1
+; SSE-NEXT:    [[TMP12:%.*]] = extractelement <8 x i16> [[TMP5]], i32 1
+; SSE-NEXT:    [[R1:%.*]] = shl i16 [[TMP11]], [[TMP12]]
+; SSE-NEXT:    [[TMP13:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2
+; SSE-NEXT:    [[TMP14:%.*]] = extractelement <8 x i16> [[TMP5]], i32 2
+; SSE-NEXT:    [[R2:%.*]] = shl i16 [[TMP13]], [[TMP14]]
+; SSE-NEXT:    [[TMP15:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3
+; SSE-NEXT:    [[TMP16:%.*]] = extractelement <8 x i16> [[TMP5]], i32 3
+; SSE-NEXT:    [[R3:%.*]] = shl i16 [[TMP15]], [[TMP16]]
+; SSE-NEXT:    [[TMP17:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4
+; SSE-NEXT:    [[TMP18:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4
+; SSE-NEXT:    [[R4:%.*]] = shl i16 [[TMP17]], [[TMP18]]
+; SSE-NEXT:    [[TMP19:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5
+; SSE-NEXT:    [[TMP20:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5
+; SSE-NEXT:    [[R5:%.*]] = shl i16 [[TMP19]], [[TMP20]]
+; SSE-NEXT:    [[TMP21:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6
+; SSE-NEXT:    [[TMP22:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6
+; SSE-NEXT:    [[R6:%.*]] = shl i16 [[TMP21]], [[TMP22]]
+; SSE-NEXT:    [[TMP23:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+; SSE-NEXT:    [[TMP24:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7
+; SSE-NEXT:    [[R7:%.*]] = shl i16 [[TMP23]], [[TMP24]]
+; SSE-NEXT:    [[TMP25:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0
+; SSE-NEXT:    [[TMP26:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0
+; SSE-NEXT:    [[R8:%.*]] = shl i16 [[TMP25]], [[TMP26]]
+; SSE-NEXT:    [[TMP27:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1
+; SSE-NEXT:    [[TMP28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 1
+; SSE-NEXT:    [[R9:%.*]] = shl i16 [[TMP27]], [[TMP28]]
+; SSE-NEXT:    [[TMP29:%.*]] = extractelement <8 x i16> [[TMP2]], i32 2
+; SSE-NEXT:    [[TMP30:%.*]] = extractelement <8 x i16> [[TMP6]], i32 2
+; SSE-NEXT:    [[R10:%.*]] = shl i16 [[TMP29]], [[TMP30]]
+; SSE-NEXT:    [[TMP31:%.*]] = extractelement <8 x i16> [[TMP2]], i32 3
+; SSE-NEXT:    [[TMP32:%.*]] = extractelement <8 x i16> [[TMP6]], i32 3
+; SSE-NEXT:    [[R11:%.*]] = shl i16 [[TMP31]], [[TMP32]]
+; SSE-NEXT:    [[TMP33:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4
+; SSE-NEXT:    [[TMP34:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4
+; SSE-NEXT:    [[R12:%.*]] = shl i16 [[TMP33]], [[TMP34]]
+; SSE-NEXT:    [[TMP35:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5
+; SSE-NEXT:    [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5
+; SSE-NEXT:    [[R13:%.*]] = shl i16 [[TMP35]], [[TMP36]]
+; SSE-NEXT:    [[TMP37:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6
+; SSE-NEXT:    [[TMP38:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6
+; SSE-NEXT:    [[R14:%.*]] = shl i16 [[TMP37]], [[TMP38]]
+; SSE-NEXT:    [[TMP39:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+; SSE-NEXT:    [[TMP40:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7
+; SSE-NEXT:    [[R15:%.*]] = shl i16 [[TMP39]], [[TMP40]]
+; SSE-NEXT:    [[TMP41:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; SSE-NEXT:    [[TMP42:%.*]] = extractelement <8 x i16> [[TMP7]], i32 0
+; SSE-NEXT:    [[R16:%.*]] = shl i16 [[TMP41]], [[TMP42]]
+; SSE-NEXT:    [[TMP43:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
+; SSE-NEXT:    [[TMP44:%.*]] = extractelement <8 x i16> [[TMP7]], i32 1
+; SSE-NEXT:    [[R17:%.*]] = shl i16 [[TMP43]], [[TMP44]]
+; SSE-NEXT:    [[TMP45:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
+; SSE-NEXT:    [[TMP46:%.*]] = extractelement <8 x i16> [[TMP7]], i32 2
+; SSE-NEXT:    [[R18:%.*]] = shl i16 [[TMP45]], [[TMP46]]
+; SSE-NEXT:    [[TMP47:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
+; SSE-NEXT:    [[TMP48:%.*]] = extractelement <8 x i16> [[TMP7]], i32 3
+; SSE-NEXT:    [[R19:%.*]] = shl i16 [[TMP47]], [[TMP48]]
+; SSE-NEXT:    [[TMP49:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
+; SSE-NEXT:    [[TMP50:%.*]] = extractelement <8 x i16> [[TMP7]], i32 4
+; SSE-NEXT:    [[R20:%.*]] = shl i16 [[TMP49]], [[TMP50]]
+; SSE-NEXT:    [[TMP51:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
+; SSE-NEXT:    [[TMP52:%.*]] = extractelement <8 x i16> [[TMP7]], i32 5
+; SSE-NEXT:    [[R21:%.*]] = shl i16 [[TMP51]], [[TMP52]]
+; SSE-NEXT:    [[TMP53:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
+; SSE-NEXT:    [[TMP54:%.*]] = extractelement <8 x i16> [[TMP7]], i32 6
+; SSE-NEXT:    [[R22:%.*]] = shl i16 [[TMP53]], [[TMP54]]
+; SSE-NEXT:    [[TMP55:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
+; SSE-NEXT:    [[TMP56:%.*]] = extractelement <8 x i16> [[TMP7]], i32 7
+; SSE-NEXT:    [[R23:%.*]] = shl i16 [[TMP55]], [[TMP56]]
+; SSE-NEXT:    [[TMP57:%.*]] = extractelement <8 x i16> [[TMP4]], i32 0
+; SSE-NEXT:    [[TMP58:%.*]] = extractelement <8 x i16> [[TMP8]], i32 0
+; SSE-NEXT:    [[R24:%.*]] = shl i16 [[TMP57]], [[TMP58]]
+; SSE-NEXT:    [[TMP59:%.*]] = extractelement <8 x i16> [[TMP4]], i32 1
+; SSE-NEXT:    [[TMP60:%.*]] = extractelement <8 x i16> [[TMP8]], i32 1
+; SSE-NEXT:    [[R25:%.*]] = shl i16 [[TMP59]], [[TMP60]]
+; SSE-NEXT:    [[TMP61:%.*]] = extractelement <8 x i16> [[TMP4]], i32 2
+; SSE-NEXT:    [[TMP62:%.*]] = extractelement <8 x i16> [[TMP8]], i32 2
+; SSE-NEXT:    [[R26:%.*]] = shl i16 [[TMP61]], [[TMP62]]
+; SSE-NEXT:    [[TMP63:%.*]] = extractelement <8 x i16> [[TMP4]], i32 3
+; SSE-NEXT:    [[TMP64:%.*]] = extractelement <8 x i16> [[TMP8]], i32 3
+; SSE-NEXT:    [[R27:%.*]] = shl i16 [[TMP63]], [[TMP64]]
+; SSE-NEXT:    [[TMP65:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4
+; SSE-NEXT:    [[TMP66:%.*]] = extractelement <8 x i16> [[TMP8]], i32 4
+; SSE-NEXT:    [[R28:%.*]] = shl i16 [[TMP65]], [[TMP66]]
+; SSE-NEXT:    [[TMP67:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5
+; SSE-NEXT:    [[TMP68:%.*]] = extractelement <8 x i16> [[TMP8]], i32 5
+; SSE-NEXT:    [[R29:%.*]] = shl i16 [[TMP67]], [[TMP68]]
+; SSE-NEXT:    [[TMP69:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6
+; SSE-NEXT:    [[TMP70:%.*]] = extractelement <8 x i16> [[TMP8]], i32 6
+; SSE-NEXT:    [[R30:%.*]] = shl i16 [[TMP69]], [[TMP70]]
+; SSE-NEXT:    [[TMP71:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7
+; SSE-NEXT:    [[TMP72:%.*]] = extractelement <8 x i16> [[TMP8]], i32 7
+; SSE-NEXT:    [[R31:%.*]] = shl i16 [[TMP71]], [[TMP72]]
+; SSE-NEXT:    [[TMP73:%.*]] = insertelement <8 x i16> poison, i16 [[R0]], i32 0
+; SSE-NEXT:    [[TMP74:%.*]] = insertelement <8 x i16> [[TMP73]], i16 [[R1]], i32 1
+; SSE-NEXT:    [[TMP75:%.*]] = insertelement <8 x i16> [[TMP74]], i16 [[R2]], i32 2
+; SSE-NEXT:    [[TMP76:%.*]] = insertelement <8 x i16> [[TMP75]], i16 [[R3]], i32 3
+; SSE-NEXT:    [[TMP77:%.*]] = insertelement <8 x i16> [[TMP76]], i16 [[R4]], i32 4
+; SSE-NEXT:    [[TMP78:%.*]] = insertelement <8 x i16> [[TMP77]], i16 [[R5]], i32 5
+; SSE-NEXT:    [[TMP79:%.*]] = insertelement <8 x i16> [[TMP78]], i16 [[R6]], i32 6
+; SSE-NEXT:    [[TMP80:%.*]] = insertelement <8 x i16> [[TMP79]], i16 [[R7]], i32 7
+; SSE-NEXT:    store <8 x i16> [[TMP80]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP81:%.*]] = insertelement <8 x i16> poison, i16 [[R8]], i32 0
+; SSE-NEXT:    [[TMP82:%.*]] = insertelement <8 x i16> [[TMP81]], i16 [[R9]], i32 1
+; SSE-NEXT:    [[TMP83:%.*]] = insertelement <8 x i16> [[TMP82]], i16 [[R10]], i32 2
+; SSE-NEXT:    [[TMP84:%.*]] = insertelement <8 x i16> [[TMP83]], i16 [[R11]], i32 3
+; SSE-NEXT:    [[TMP85:%.*]] = insertelement <8 x i16> [[TMP84]], i16 [[R12]], i32 4
+; SSE-NEXT:    [[TMP86:%.*]] = insertelement <8 x i16> [[TMP85]], i16 [[R13]], i32 5
+; SSE-NEXT:    [[TMP87:%.*]] = insertelement <8 x i16> [[TMP86]], i16 [[R14]], i32 6
+; SSE-NEXT:    [[TMP88:%.*]] = insertelement <8 x i16> [[TMP87]], i16 [[R15]], i32 7
+; SSE-NEXT:    store <8 x i16> [[TMP88]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP89:%.*]] = insertelement <8 x i16> poison, i16 [[R16]], i32 0
+; SSE-NEXT:    [[TMP90:%.*]] = insertelement <8 x i16> [[TMP89]], i16 [[R17]], i32 1
+; SSE-NEXT:    [[TMP91:%.*]] = insertelement <8 x i16> [[TMP90]], i16 [[R18]], i32 2
+; SSE-NEXT:    [[TMP92:%.*]] = insertelement <8 x i16> [[TMP91]], i16 [[R19]], i32 3
+; SSE-NEXT:    [[TMP93:%.*]] = insertelement <8 x i16> [[TMP92]], i16 [[R20]], i32 4
+; SSE-NEXT:    [[TMP94:%.*]] = insertelement <8 x i16> [[TMP93]], i16 [[R21]], i32 5
+; SSE-NEXT:    [[TMP95:%.*]] = insertelement <8 x i16> [[TMP94]], i16 [[R22]], i32 6
+; SSE-NEXT:    [[TMP96:%.*]] = insertelement <8 x i16> [[TMP95]], i16 [[R23]], i32 7
+; SSE-NEXT:    store <8 x i16> [[TMP96]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP97:%.*]] = insertelement <8 x i16> poison, i16 [[R24]], i32 0
+; SSE-NEXT:    [[TMP98:%.*]] = insertelement <8 x i16> [[TMP97]], i16 [[R25]], i32 1
+; SSE-NEXT:    [[TMP99:%.*]] = insertelement <8 x i16> [[TMP98]], i16 [[R26]], i32 2
+; SSE-NEXT:    [[TMP100:%.*]] = insertelement <8 x i16> [[TMP99]], i16 [[R27]], i32 3
+; SSE-NEXT:    [[TMP101:%.*]] = insertelement <8 x i16> [[TMP100]], i16 [[R28]], i32 4
+; SSE-NEXT:    [[TMP102:%.*]] = insertelement <8 x i16> [[TMP101]], i16 [[R29]], i32 5
+; SSE-NEXT:    [[TMP103:%.*]] = insertelement <8 x i16> [[TMP102]], i16 [[R30]], i32 6
+; SSE-NEXT:    [[TMP104:%.*]] = insertelement <8 x i16> [[TMP103]], i16 [[R31]], i32 7
+; SSE-NEXT:    store <8 x i16> [[TMP104]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @shl_v32i16(
@@ -389,6 +426,13 @@
 ; AVX512-NEXT:    store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
 ; AVX512-NEXT:    ret void
 ;
+; AVX512-SKX-LABEL: @shl_v32i16(
+; AVX512-SKX-NEXT:    [[TMP1:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
+; AVX512-SKX-NEXT:    [[TMP2:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
+; AVX512-SKX-NEXT:    [[TMP3:%.*]] = shl <32 x i16> [[TMP1]], [[TMP2]]
+; AVX512-SKX-NEXT:    store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
+; AVX512-SKX-NEXT:    ret void
+;
 ; XOP-LABEL: @shl_v32i16(
 ; XOP-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
 ; XOP-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
@@ -569,6 +613,13 @@
 ; AVX512-NEXT:    store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
 ; AVX512-NEXT:    ret void
 ;
+; AVX512-SKX-LABEL: @shl_v64i8(
+; AVX512-SKX-NEXT:    [[TMP1:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @a8 to <64 x i8>*), align 1
+; AVX512-SKX-NEXT:    [[TMP2:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @b8 to <64 x i8>*), align 1
+; AVX512-SKX-NEXT:    [[TMP3:%.*]] = shl <64 x i8> [[TMP1]], [[TMP2]]
+; AVX512-SKX-NEXT:    store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
+; AVX512-SKX-NEXT:    ret void
+;
 ; XOP-LABEL: @shl_v64i8(
 ; XOP-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
 ; XOP-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
Index: llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ-BDVER1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ-AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256DQ
 
@@ -39,6 +39,24 @@
 ; AVX256NODQ-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 ; AVX256NODQ-NEXT:    ret void
 ;
+; AVX256NODQ-BDVER1-LABEL: @sitofp_2i64_2f64(
+; AVX256NODQ-BDVER1-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-BDVER1-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; AVX256NODQ-BDVER1-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-BDVER1-NEXT:    ret void
+;
+; AVX256NODQ-AVX2-LABEL: @sitofp_2i64_2f64(
+; AVX256NODQ-AVX2-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-AVX2-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-AVX2-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; AVX256NODQ-AVX2-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-AVX2-NEXT:    ret void
+;
 ; AVX512-LABEL: @sitofp_2i64_2f64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
 ; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x double>
@@ -91,6 +109,36 @@
 ; AVX256NODQ-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
 ; AVX256NODQ-NEXT:    ret void
 ;
+; AVX256NODQ-BDVER1-LABEL: @sitofp_4i64_4f64(
+; AVX256NODQ-BDVER1-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-BDVER1-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-BDVER1-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; AVX256NODQ-BDVER1-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; AVX256NODQ-BDVER1-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to double
+; AVX256NODQ-BDVER1-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to double
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; AVX256NODQ-BDVER1-NEXT:    ret void
+;
+; AVX256NODQ-AVX2-LABEL: @sitofp_4i64_4f64(
+; AVX256NODQ-AVX2-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-AVX2-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-AVX2-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-AVX2-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-AVX2-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; AVX256NODQ-AVX2-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; AVX256NODQ-AVX2-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to double
+; AVX256NODQ-AVX2-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to double
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; AVX256NODQ-AVX2-NEXT:    ret void
+;
 ; AVX512-LABEL: @sitofp_4i64_4f64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
 ; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double>
@@ -173,6 +221,60 @@
 ; AVX256NODQ-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
 ; AVX256NODQ-NEXT:    ret void
 ;
+; AVX256NODQ-BDVER1-LABEL: @sitofp_8i64_8f64(
+; AVX256NODQ-BDVER1-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-BDVER1-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-BDVER1-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+; AVX256NODQ-BDVER1-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+; AVX256NODQ-BDVER1-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; AVX256NODQ-BDVER1-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; AVX256NODQ-BDVER1-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to double
+; AVX256NODQ-BDVER1-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to double
+; AVX256NODQ-BDVER1-NEXT:    [[CVT4:%.*]] = sitofp i64 [[LD4]] to double
+; AVX256NODQ-BDVER1-NEXT:    [[CVT5:%.*]] = sitofp i64 [[LD5]] to double
+; AVX256NODQ-BDVER1-NEXT:    [[CVT6:%.*]] = sitofp i64 [[LD6]] to double
+; AVX256NODQ-BDVER1-NEXT:    [[CVT7:%.*]] = sitofp i64 [[LD7]] to double
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; AVX256NODQ-BDVER1-NEXT:    ret void
+;
+; AVX256NODQ-AVX2-LABEL: @sitofp_8i64_8f64(
+; AVX256NODQ-AVX2-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-AVX2-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-AVX2-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-AVX2-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-AVX2-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+; AVX256NODQ-AVX2-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+; AVX256NODQ-AVX2-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+; AVX256NODQ-AVX2-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+; AVX256NODQ-AVX2-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; AVX256NODQ-AVX2-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; AVX256NODQ-AVX2-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to double
+; AVX256NODQ-AVX2-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to double
+; AVX256NODQ-AVX2-NEXT:    [[CVT4:%.*]] = sitofp i64 [[LD4]] to double
+; AVX256NODQ-AVX2-NEXT:    [[CVT5:%.*]] = sitofp i64 [[LD5]] to double
+; AVX256NODQ-AVX2-NEXT:    [[CVT6:%.*]] = sitofp i64 [[LD6]] to double
+; AVX256NODQ-AVX2-NEXT:    [[CVT7:%.*]] = sitofp i64 [[LD7]] to double
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; AVX256NODQ-AVX2-NEXT:    ret void
+;
 ; AVX512-LABEL: @sitofp_8i64_8f64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
 ; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x double>
@@ -647,6 +749,36 @@
 ; AVX256NODQ-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
 ; AVX256NODQ-NEXT:    ret void
 ;
+; AVX256NODQ-BDVER1-LABEL: @sitofp_4i64_4f32(
+; AVX256NODQ-BDVER1-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-BDVER1-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-BDVER1-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
+; AVX256NODQ-BDVER1-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
+; AVX256NODQ-BDVER1-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to float
+; AVX256NODQ-BDVER1-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to float
+; AVX256NODQ-BDVER1-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; AVX256NODQ-BDVER1-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; AVX256NODQ-BDVER1-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; AVX256NODQ-BDVER1-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; AVX256NODQ-BDVER1-NEXT:    ret void
+;
+; AVX256NODQ-AVX2-LABEL: @sitofp_4i64_4f32(
+; AVX256NODQ-AVX2-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-AVX2-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-AVX2-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-AVX2-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-AVX2-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
+; AVX256NODQ-AVX2-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
+; AVX256NODQ-AVX2-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to float
+; AVX256NODQ-AVX2-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to float
+; AVX256NODQ-AVX2-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; AVX256NODQ-AVX2-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; AVX256NODQ-AVX2-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; AVX256NODQ-AVX2-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; AVX256NODQ-AVX2-NEXT:    ret void
+;
 ; AVX512-LABEL: @sitofp_4i64_4f32(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
 ; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float>
@@ -729,6 +861,50 @@
 ; AVX256NODQ-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
 ; AVX256NODQ-NEXT:    ret void
 ;
+; AVX256NODQ-BDVER1-LABEL: @sitofp_8i64_8f32(
+; AVX256NODQ-BDVER1-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float>
+; AVX256NODQ-BDVER1-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP2]], i32 7
+; AVX256NODQ-BDVER1-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP2]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[TMP5:%.*]] = insertelement <8 x float> poison, float [[TMP4]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[TMP2]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[TMP7:%.*]] = insertelement <8 x float> [[TMP5]], float [[TMP6]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[TMP2]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[TMP9:%.*]] = insertelement <8 x float> [[TMP7]], float [[TMP8]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[TMP10:%.*]] = extractelement <8 x float> [[TMP2]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    [[TMP11:%.*]] = insertelement <8 x float> [[TMP9]], float [[TMP10]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    [[TMP12:%.*]] = extractelement <8 x float> [[TMP2]], i32 4
+; AVX256NODQ-BDVER1-NEXT:    [[TMP13:%.*]] = insertelement <8 x float> [[TMP11]], float [[TMP12]], i32 4
+; AVX256NODQ-BDVER1-NEXT:    [[TMP14:%.*]] = extractelement <8 x float> [[TMP2]], i32 5
+; AVX256NODQ-BDVER1-NEXT:    [[TMP15:%.*]] = insertelement <8 x float> [[TMP13]], float [[TMP14]], i32 5
+; AVX256NODQ-BDVER1-NEXT:    [[TMP16:%.*]] = extractelement <8 x float> [[TMP2]], i32 6
+; AVX256NODQ-BDVER1-NEXT:    [[TMP17:%.*]] = insertelement <8 x float> [[TMP15]], float [[TMP16]], i32 6
+; AVX256NODQ-BDVER1-NEXT:    [[TMP18:%.*]] = insertelement <8 x float> [[TMP17]], float [[TMP3]], i32 7
+; AVX256NODQ-BDVER1-NEXT:    store <8 x float> [[TMP18]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX256NODQ-BDVER1-NEXT:    ret void
+;
+; AVX256NODQ-AVX2-LABEL: @sitofp_8i64_8f32(
+; AVX256NODQ-AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX256NODQ-AVX2-NEXT:    [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float>
+; AVX256NODQ-AVX2-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP2]], i32 7
+; AVX256NODQ-AVX2-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP2]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[TMP5:%.*]] = insertelement <8 x float> poison, float [[TMP4]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[TMP2]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[TMP7:%.*]] = insertelement <8 x float> [[TMP5]], float [[TMP6]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[TMP2]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[TMP9:%.*]] = insertelement <8 x float> [[TMP7]], float [[TMP8]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[TMP10:%.*]] = extractelement <8 x float> [[TMP2]], i32 3
+; AVX256NODQ-AVX2-NEXT:    [[TMP11:%.*]] = insertelement <8 x float> [[TMP9]], float [[TMP10]], i32 3
+; AVX256NODQ-AVX2-NEXT:    [[TMP12:%.*]] = extractelement <8 x float> [[TMP2]], i32 4
+; AVX256NODQ-AVX2-NEXT:    [[TMP13:%.*]] = insertelement <8 x float> [[TMP11]], float [[TMP12]], i32 4
+; AVX256NODQ-AVX2-NEXT:    [[TMP14:%.*]] = extractelement <8 x float> [[TMP2]], i32 5
+; AVX256NODQ-AVX2-NEXT:    [[TMP15:%.*]] = insertelement <8 x float> [[TMP13]], float [[TMP14]], i32 5
+; AVX256NODQ-AVX2-NEXT:    [[TMP16:%.*]] = extractelement <8 x float> [[TMP2]], i32 6
+; AVX256NODQ-AVX2-NEXT:    [[TMP17:%.*]] = insertelement <8 x float> [[TMP15]], float [[TMP16]], i32 6
+; AVX256NODQ-AVX2-NEXT:    [[TMP18:%.*]] = insertelement <8 x float> [[TMP17]], float [[TMP3]], i32 7
+; AVX256NODQ-AVX2-NEXT:    store <8 x float> [[TMP18]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX256NODQ-AVX2-NEXT:    ret void
+;
 ; AVX512-LABEL: @sitofp_8i64_8f32(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
 ; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float>
@@ -917,18 +1093,20 @@
 
 define void @sitofp_4i16_4f32() #0 {
 ; SSE-LABEL: @sitofp_4i16_4f32(
-; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
-; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
-; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
-; SSE-NEXT:    [[CVT0:%.*]] = sitofp i16 [[LD0]] to float
-; SSE-NEXT:    [[CVT1:%.*]] = sitofp i16 [[LD1]] to float
-; SSE-NEXT:    [[CVT2:%.*]] = sitofp i16 [[LD2]] to float
-; SSE-NEXT:    [[CVT3:%.*]] = sitofp i16 [[LD3]] to float
-; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i16 [[TMP2]] to float
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i16 [[TMP3]] to float
+; SSE-NEXT:    [[TMP4:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2
+; SSE-NEXT:    [[CVT2:%.*]] = sitofp i16 [[TMP4]] to float
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+; SSE-NEXT:    [[CVT3:%.*]] = sitofp i16 [[TMP5]] to float
+; SSE-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0
+; SSE-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[CVT1]], i32 1
+; SSE-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[CVT2]], i32 2
+; SSE-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[CVT3]], i32 3
+; SSE-NEXT:    store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @sitofp_4i16_4f32(
@@ -954,30 +1132,34 @@
 
 define void @sitofp_8i16_8f32() #0 {
 ; SSE-LABEL: @sitofp_8i16_8f32(
-; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
-; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
-; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
-; SSE-NEXT:    [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
-; SSE-NEXT:    [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
-; SSE-NEXT:    [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
-; SSE-NEXT:    [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
-; SSE-NEXT:    [[CVT0:%.*]] = sitofp i16 [[LD0]] to float
-; SSE-NEXT:    [[CVT1:%.*]] = sitofp i16 [[LD1]] to float
-; SSE-NEXT:    [[CVT2:%.*]] = sitofp i16 [[LD2]] to float
-; SSE-NEXT:    [[CVT3:%.*]] = sitofp i16 [[LD3]] to float
-; SSE-NEXT:    [[CVT4:%.*]] = sitofp i16 [[LD4]] to float
-; SSE-NEXT:    [[CVT5:%.*]] = sitofp i16 [[LD5]] to float
-; SSE-NEXT:    [[CVT6:%.*]] = sitofp i16 [[LD6]] to float
-; SSE-NEXT:    [[CVT7:%.*]] = sitofp i16 [[LD7]] to float
-; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
-; SSE-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; SSE-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
-; SSE-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i16 [[TMP3]] to float
+; SSE-NEXT:    [[TMP4:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i16 [[TMP4]] to float
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2
+; SSE-NEXT:    [[CVT2:%.*]] = sitofp i16 [[TMP5]] to float
+; SSE-NEXT:    [[TMP6:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+; SSE-NEXT:    [[CVT3:%.*]] = sitofp i16 [[TMP6]] to float
+; SSE-NEXT:    [[TMP7:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0
+; SSE-NEXT:    [[CVT4:%.*]] = sitofp i16 [[TMP7]] to float
+; SSE-NEXT:    [[TMP8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1
+; SSE-NEXT:    [[CVT5:%.*]] = sitofp i16 [[TMP8]] to float
+; SSE-NEXT:    [[TMP9:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
+; SSE-NEXT:    [[CVT6:%.*]] = sitofp i16 [[TMP9]] to float
+; SSE-NEXT:    [[TMP10:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+; SSE-NEXT:    [[CVT7:%.*]] = sitofp i16 [[TMP10]] to float
+; SSE-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0
+; SSE-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[CVT1]], i32 1
+; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[CVT2]], i32 2
+; SSE-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[CVT3]], i32 3
+; SSE-NEXT:    store <4 x float> [[TMP14]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    [[TMP15:%.*]] = insertelement <4 x float> poison, float [[CVT4]], i32 0
+; SSE-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[CVT5]], i32 1
+; SSE-NEXT:    [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[CVT6]], i32 2
+; SSE-NEXT:    [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[CVT7]], i32 3
+; SSE-NEXT:    store <4 x float> [[TMP18]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @sitofp_8i16_8f32(
@@ -1015,54 +1197,62 @@
 
 define void @sitofp_16i16_16f32() #0 {
 ; SSE-LABEL: @sitofp_16i16_16f32(
-; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
-; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
-; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
-; SSE-NEXT:    [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
-; SSE-NEXT:    [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
-; SSE-NEXT:    [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
-; SSE-NEXT:    [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
-; SSE-NEXT:    [[LD8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8), align 16
-; SSE-NEXT:    [[LD9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 9), align 2
-; SSE-NEXT:    [[LD10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 10), align 4
-; SSE-NEXT:    [[LD11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 11), align 2
-; SSE-NEXT:    [[LD12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12), align 8
-; SSE-NEXT:    [[LD13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 13), align 2
-; SSE-NEXT:    [[LD14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 14), align 4
-; SSE-NEXT:    [[LD15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 15), align 2
-; SSE-NEXT:    [[CVT0:%.*]] = sitofp i16 [[LD0]] to float
-; SSE-NEXT:    [[CVT1:%.*]] = sitofp i16 [[LD1]] to float
-; SSE-NEXT:    [[CVT2:%.*]] = sitofp i16 [[LD2]] to float
-; SSE-NEXT:    [[CVT3:%.*]] = sitofp i16 [[LD3]] to float
-; SSE-NEXT:    [[CVT4:%.*]] = sitofp i16 [[LD4]] to float
-; SSE-NEXT:    [[CVT5:%.*]] = sitofp i16 [[LD5]] to float
-; SSE-NEXT:    [[CVT6:%.*]] = sitofp i16 [[LD6]] to float
-; SSE-NEXT:    [[CVT7:%.*]] = sitofp i16 [[LD7]] to float
-; SSE-NEXT:    [[CVT8:%.*]] = sitofp i16 [[LD8]] to float
-; SSE-NEXT:    [[CVT9:%.*]] = sitofp i16 [[LD9]] to float
-; SSE-NEXT:    [[CVT10:%.*]] = sitofp i16 [[LD10]] to float
-; SSE-NEXT:    [[CVT11:%.*]] = sitofp i16 [[LD11]] to float
-; SSE-NEXT:    [[CVT12:%.*]] = sitofp i16 [[LD12]] to float
-; SSE-NEXT:    [[CVT13:%.*]] = sitofp i16 [[LD13]] to float
-; SSE-NEXT:    [[CVT14:%.*]] = sitofp i16 [[LD14]] to float
-; SSE-NEXT:    [[CVT15:%.*]] = sitofp i16 [[LD15]] to float
-; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
-; SSE-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; SSE-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
-; SSE-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
-; SSE-NEXT:    store float [[CVT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 32
-; SSE-NEXT:    store float [[CVT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
-; SSE-NEXT:    store float [[CVT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
-; SSE-NEXT:    store float [[CVT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
-; SSE-NEXT:    store float [[CVT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
-; SSE-NEXT:    store float [[CVT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
-; SSE-NEXT:    store float [[CVT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
-; SSE-NEXT:    store float [[CVT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i16 [[TMP5]] to float
+; SSE-NEXT:    [[TMP6:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i16 [[TMP6]] to float
+; SSE-NEXT:    [[TMP7:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2
+; SSE-NEXT:    [[CVT2:%.*]] = sitofp i16 [[TMP7]] to float
+; SSE-NEXT:    [[TMP8:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+; SSE-NEXT:    [[CVT3:%.*]] = sitofp i16 [[TMP8]] to float
+; SSE-NEXT:    [[TMP9:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0
+; SSE-NEXT:    [[CVT4:%.*]] = sitofp i16 [[TMP9]] to float
+; SSE-NEXT:    [[TMP10:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1
+; SSE-NEXT:    [[CVT5:%.*]] = sitofp i16 [[TMP10]] to float
+; SSE-NEXT:    [[TMP11:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
+; SSE-NEXT:    [[CVT6:%.*]] = sitofp i16 [[TMP11]] to float
+; SSE-NEXT:    [[TMP12:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+; SSE-NEXT:    [[CVT7:%.*]] = sitofp i16 [[TMP12]] to float
+; SSE-NEXT:    [[TMP13:%.*]] = extractelement <4 x i16> [[TMP3]], i32 0
+; SSE-NEXT:    [[CVT8:%.*]] = sitofp i16 [[TMP13]] to float
+; SSE-NEXT:    [[TMP14:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1
+; SSE-NEXT:    [[CVT9:%.*]] = sitofp i16 [[TMP14]] to float
+; SSE-NEXT:    [[TMP15:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2
+; SSE-NEXT:    [[CVT10:%.*]] = sitofp i16 [[TMP15]] to float
+; SSE-NEXT:    [[TMP16:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
+; SSE-NEXT:    [[CVT11:%.*]] = sitofp i16 [[TMP16]] to float
+; SSE-NEXT:    [[TMP17:%.*]] = extractelement <4 x i16> [[TMP4]], i32 0
+; SSE-NEXT:    [[CVT12:%.*]] = sitofp i16 [[TMP17]] to float
+; SSE-NEXT:    [[TMP18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 1
+; SSE-NEXT:    [[CVT13:%.*]] = sitofp i16 [[TMP18]] to float
+; SSE-NEXT:    [[TMP19:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
+; SSE-NEXT:    [[CVT14:%.*]] = sitofp i16 [[TMP19]] to float
+; SSE-NEXT:    [[TMP20:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
+; SSE-NEXT:    [[CVT15:%.*]] = sitofp i16 [[TMP20]] to float
+; SSE-NEXT:    [[TMP21:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0
+; SSE-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[CVT1]], i32 1
+; SSE-NEXT:    [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[CVT2]], i32 2
+; SSE-NEXT:    [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[CVT3]], i32 3
+; SSE-NEXT:    store <4 x float> [[TMP24]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    [[TMP25:%.*]] = insertelement <4 x float> poison, float [[CVT4]], i32 0
+; SSE-NEXT:    [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[CVT5]], i32 1
+; SSE-NEXT:    [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[CVT6]], i32 2
+; SSE-NEXT:    [[TMP28:%.*]] = insertelement <4 x float> [[TMP27]], float [[CVT7]], i32 3
+; SSE-NEXT:    store <4 x float> [[TMP28]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    [[TMP29:%.*]] = insertelement <4 x float> poison, float [[CVT8]], i32 0
+; SSE-NEXT:    [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[CVT9]], i32 1
+; SSE-NEXT:    [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[CVT10]], i32 2
+; SSE-NEXT:    [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[CVT11]], i32 3
+; SSE-NEXT:    store <4 x float> [[TMP32]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
+; SSE-NEXT:    [[TMP33:%.*]] = insertelement <4 x float> poison, float [[CVT12]], i32 0
+; SSE-NEXT:    [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[CVT13]], i32 1
+; SSE-NEXT:    [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[CVT14]], i32 2
+; SSE-NEXT:    [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[CVT15]], i32 3
+; SSE-NEXT:    store <4 x float> [[TMP36]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @sitofp_16i16_16f32(
Index: llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ-BDVER1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ-AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256DQ
 
@@ -39,6 +39,24 @@
 ; AVX256NODQ-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 ; AVX256NODQ-NEXT:    ret void
 ;
+; AVX256NODQ-BDVER1-LABEL: @sitofp_2i64_2f64(
+; AVX256NODQ-BDVER1-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-BDVER1-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; AVX256NODQ-BDVER1-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-BDVER1-NEXT:    ret void
+;
+; AVX256NODQ-AVX2-LABEL: @sitofp_2i64_2f64(
+; AVX256NODQ-AVX2-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-AVX2-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-AVX2-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; AVX256NODQ-AVX2-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-AVX2-NEXT:    ret void
+;
 ; AVX512-LABEL: @sitofp_2i64_2f64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
 ; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x double>
@@ -91,6 +109,36 @@
 ; AVX256NODQ-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
 ; AVX256NODQ-NEXT:    ret void
 ;
+; AVX256NODQ-BDVER1-LABEL: @sitofp_4i64_4f64(
+; AVX256NODQ-BDVER1-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-BDVER1-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-BDVER1-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; AVX256NODQ-BDVER1-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; AVX256NODQ-BDVER1-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to double
+; AVX256NODQ-BDVER1-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to double
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; AVX256NODQ-BDVER1-NEXT:    ret void
+;
+; AVX256NODQ-AVX2-LABEL: @sitofp_4i64_4f64(
+; AVX256NODQ-AVX2-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-AVX2-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-AVX2-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-AVX2-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-AVX2-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; AVX256NODQ-AVX2-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; AVX256NODQ-AVX2-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to double
+; AVX256NODQ-AVX2-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to double
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; AVX256NODQ-AVX2-NEXT:    ret void
+;
 ; AVX512-LABEL: @sitofp_4i64_4f64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
 ; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double>
@@ -173,6 +221,60 @@
 ; AVX256NODQ-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
 ; AVX256NODQ-NEXT:    ret void
 ;
+; AVX256NODQ-BDVER1-LABEL: @sitofp_8i64_8f64(
+; AVX256NODQ-BDVER1-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-BDVER1-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-BDVER1-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+; AVX256NODQ-BDVER1-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+; AVX256NODQ-BDVER1-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; AVX256NODQ-BDVER1-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; AVX256NODQ-BDVER1-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to double
+; AVX256NODQ-BDVER1-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to double
+; AVX256NODQ-BDVER1-NEXT:    [[CVT4:%.*]] = sitofp i64 [[LD4]] to double
+; AVX256NODQ-BDVER1-NEXT:    [[CVT5:%.*]] = sitofp i64 [[LD5]] to double
+; AVX256NODQ-BDVER1-NEXT:    [[CVT6:%.*]] = sitofp i64 [[LD6]] to double
+; AVX256NODQ-BDVER1-NEXT:    [[CVT7:%.*]] = sitofp i64 [[LD7]] to double
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; AVX256NODQ-BDVER1-NEXT:    ret void
+;
+; AVX256NODQ-AVX2-LABEL: @sitofp_8i64_8f64(
+; AVX256NODQ-AVX2-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-AVX2-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-AVX2-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-AVX2-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-AVX2-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+; AVX256NODQ-AVX2-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+; AVX256NODQ-AVX2-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+; AVX256NODQ-AVX2-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+; AVX256NODQ-AVX2-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; AVX256NODQ-AVX2-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; AVX256NODQ-AVX2-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to double
+; AVX256NODQ-AVX2-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to double
+; AVX256NODQ-AVX2-NEXT:    [[CVT4:%.*]] = sitofp i64 [[LD4]] to double
+; AVX256NODQ-AVX2-NEXT:    [[CVT5:%.*]] = sitofp i64 [[LD5]] to double
+; AVX256NODQ-AVX2-NEXT:    [[CVT6:%.*]] = sitofp i64 [[LD6]] to double
+; AVX256NODQ-AVX2-NEXT:    [[CVT7:%.*]] = sitofp i64 [[LD7]] to double
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; AVX256NODQ-AVX2-NEXT:    ret void
+;
 ; AVX512-LABEL: @sitofp_8i64_8f64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
 ; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x double>
@@ -647,6 +749,36 @@
 ; AVX256NODQ-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
 ; AVX256NODQ-NEXT:    ret void
 ;
+; AVX256NODQ-BDVER1-LABEL: @sitofp_4i64_4f32(
+; AVX256NODQ-BDVER1-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-BDVER1-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-BDVER1-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
+; AVX256NODQ-BDVER1-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
+; AVX256NODQ-BDVER1-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to float
+; AVX256NODQ-BDVER1-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to float
+; AVX256NODQ-BDVER1-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; AVX256NODQ-BDVER1-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; AVX256NODQ-BDVER1-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; AVX256NODQ-BDVER1-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; AVX256NODQ-BDVER1-NEXT:    ret void
+;
+; AVX256NODQ-AVX2-LABEL: @sitofp_4i64_4f32(
+; AVX256NODQ-AVX2-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-AVX2-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-AVX2-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-AVX2-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-AVX2-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
+; AVX256NODQ-AVX2-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
+; AVX256NODQ-AVX2-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to float
+; AVX256NODQ-AVX2-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to float
+; AVX256NODQ-AVX2-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; AVX256NODQ-AVX2-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; AVX256NODQ-AVX2-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; AVX256NODQ-AVX2-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; AVX256NODQ-AVX2-NEXT:    ret void
+;
 ; AVX512-LABEL: @sitofp_4i64_4f32(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
 ; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float>
@@ -729,6 +861,50 @@
 ; AVX256NODQ-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
 ; AVX256NODQ-NEXT:    ret void
 ;
+; AVX256NODQ-BDVER1-LABEL: @sitofp_8i64_8f32(
+; AVX256NODQ-BDVER1-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float>
+; AVX256NODQ-BDVER1-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP2]], i32 7
+; AVX256NODQ-BDVER1-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP2]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[TMP5:%.*]] = insertelement <8 x float> poison, float [[TMP4]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[TMP2]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[TMP7:%.*]] = insertelement <8 x float> [[TMP5]], float [[TMP6]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[TMP2]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[TMP9:%.*]] = insertelement <8 x float> [[TMP7]], float [[TMP8]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[TMP10:%.*]] = extractelement <8 x float> [[TMP2]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    [[TMP11:%.*]] = insertelement <8 x float> [[TMP9]], float [[TMP10]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    [[TMP12:%.*]] = extractelement <8 x float> [[TMP2]], i32 4
+; AVX256NODQ-BDVER1-NEXT:    [[TMP13:%.*]] = insertelement <8 x float> [[TMP11]], float [[TMP12]], i32 4
+; AVX256NODQ-BDVER1-NEXT:    [[TMP14:%.*]] = extractelement <8 x float> [[TMP2]], i32 5
+; AVX256NODQ-BDVER1-NEXT:    [[TMP15:%.*]] = insertelement <8 x float> [[TMP13]], float [[TMP14]], i32 5
+; AVX256NODQ-BDVER1-NEXT:    [[TMP16:%.*]] = extractelement <8 x float> [[TMP2]], i32 6
+; AVX256NODQ-BDVER1-NEXT:    [[TMP17:%.*]] = insertelement <8 x float> [[TMP15]], float [[TMP16]], i32 6
+; AVX256NODQ-BDVER1-NEXT:    [[TMP18:%.*]] = insertelement <8 x float> [[TMP17]], float [[TMP3]], i32 7
+; AVX256NODQ-BDVER1-NEXT:    store <8 x float> [[TMP18]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX256NODQ-BDVER1-NEXT:    ret void
+;
+; AVX256NODQ-AVX2-LABEL: @sitofp_8i64_8f32(
+; AVX256NODQ-AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX256NODQ-AVX2-NEXT:    [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float>
+; AVX256NODQ-AVX2-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP2]], i32 7
+; AVX256NODQ-AVX2-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP2]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[TMP5:%.*]] = insertelement <8 x float> poison, float [[TMP4]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[TMP2]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[TMP7:%.*]] = insertelement <8 x float> [[TMP5]], float [[TMP6]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[TMP2]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[TMP9:%.*]] = insertelement <8 x float> [[TMP7]], float [[TMP8]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[TMP10:%.*]] = extractelement <8 x float> [[TMP2]], i32 3
+; AVX256NODQ-AVX2-NEXT:    [[TMP11:%.*]] = insertelement <8 x float> [[TMP9]], float [[TMP10]], i32 3
+; AVX256NODQ-AVX2-NEXT:    [[TMP12:%.*]] = extractelement <8 x float> [[TMP2]], i32 4
+; AVX256NODQ-AVX2-NEXT:    [[TMP13:%.*]] = insertelement <8 x float> [[TMP11]], float [[TMP12]], i32 4
+; AVX256NODQ-AVX2-NEXT:    [[TMP14:%.*]] = extractelement <8 x float> [[TMP2]], i32 5
+; AVX256NODQ-AVX2-NEXT:    [[TMP15:%.*]] = insertelement <8 x float> [[TMP13]], float [[TMP14]], i32 5
+; AVX256NODQ-AVX2-NEXT:    [[TMP16:%.*]] = extractelement <8 x float> [[TMP2]], i32 6
+; AVX256NODQ-AVX2-NEXT:    [[TMP17:%.*]] = insertelement <8 x float> [[TMP15]], float [[TMP16]], i32 6
+; AVX256NODQ-AVX2-NEXT:    [[TMP18:%.*]] = insertelement <8 x float> [[TMP17]], float [[TMP3]], i32 7
+; AVX256NODQ-AVX2-NEXT:    store <8 x float> [[TMP18]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX256NODQ-AVX2-NEXT:    ret void
+;
 ; AVX512-LABEL: @sitofp_8i64_8f32(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
 ; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float>
@@ -917,18 +1093,20 @@
 
 define void @sitofp_4i16_4f32() #0 {
 ; SSE-LABEL: @sitofp_4i16_4f32(
-; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
-; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
-; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
-; SSE-NEXT:    [[CVT0:%.*]] = sitofp i16 [[LD0]] to float
-; SSE-NEXT:    [[CVT1:%.*]] = sitofp i16 [[LD1]] to float
-; SSE-NEXT:    [[CVT2:%.*]] = sitofp i16 [[LD2]] to float
-; SSE-NEXT:    [[CVT3:%.*]] = sitofp i16 [[LD3]] to float
-; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i16 [[TMP2]] to float
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i16 [[TMP3]] to float
+; SSE-NEXT:    [[TMP4:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2
+; SSE-NEXT:    [[CVT2:%.*]] = sitofp i16 [[TMP4]] to float
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+; SSE-NEXT:    [[CVT3:%.*]] = sitofp i16 [[TMP5]] to float
+; SSE-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0
+; SSE-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[CVT1]], i32 1
+; SSE-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[CVT2]], i32 2
+; SSE-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[CVT3]], i32 3
+; SSE-NEXT:    store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @sitofp_4i16_4f32(
@@ -954,30 +1132,34 @@
 
 define void @sitofp_8i16_8f32() #0 {
 ; SSE-LABEL: @sitofp_8i16_8f32(
-; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
-; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
-; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
-; SSE-NEXT:    [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
-; SSE-NEXT:    [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
-; SSE-NEXT:    [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
-; SSE-NEXT:    [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
-; SSE-NEXT:    [[CVT0:%.*]] = sitofp i16 [[LD0]] to float
-; SSE-NEXT:    [[CVT1:%.*]] = sitofp i16 [[LD1]] to float
-; SSE-NEXT:    [[CVT2:%.*]] = sitofp i16 [[LD2]] to float
-; SSE-NEXT:    [[CVT3:%.*]] = sitofp i16 [[LD3]] to float
-; SSE-NEXT:    [[CVT4:%.*]] = sitofp i16 [[LD4]] to float
-; SSE-NEXT:    [[CVT5:%.*]] = sitofp i16 [[LD5]] to float
-; SSE-NEXT:    [[CVT6:%.*]] = sitofp i16 [[LD6]] to float
-; SSE-NEXT:    [[CVT7:%.*]] = sitofp i16 [[LD7]] to float
-; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
-; SSE-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; SSE-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
-; SSE-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i16 [[TMP3]] to float
+; SSE-NEXT:    [[TMP4:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i16 [[TMP4]] to float
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2
+; SSE-NEXT:    [[CVT2:%.*]] = sitofp i16 [[TMP5]] to float
+; SSE-NEXT:    [[TMP6:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+; SSE-NEXT:    [[CVT3:%.*]] = sitofp i16 [[TMP6]] to float
+; SSE-NEXT:    [[TMP7:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0
+; SSE-NEXT:    [[CVT4:%.*]] = sitofp i16 [[TMP7]] to float
+; SSE-NEXT:    [[TMP8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1
+; SSE-NEXT:    [[CVT5:%.*]] = sitofp i16 [[TMP8]] to float
+; SSE-NEXT:    [[TMP9:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
+; SSE-NEXT:    [[CVT6:%.*]] = sitofp i16 [[TMP9]] to float
+; SSE-NEXT:    [[TMP10:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+; SSE-NEXT:    [[CVT7:%.*]] = sitofp i16 [[TMP10]] to float
+; SSE-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0
+; SSE-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[CVT1]], i32 1
+; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[CVT2]], i32 2
+; SSE-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[CVT3]], i32 3
+; SSE-NEXT:    store <4 x float> [[TMP14]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    [[TMP15:%.*]] = insertelement <4 x float> poison, float [[CVT4]], i32 0
+; SSE-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[CVT5]], i32 1
+; SSE-NEXT:    [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[CVT6]], i32 2
+; SSE-NEXT:    [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[CVT7]], i32 3
+; SSE-NEXT:    store <4 x float> [[TMP18]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @sitofp_8i16_8f32(
@@ -1015,54 +1197,62 @@
 
 define void @sitofp_16i16_16f32() #0 {
 ; SSE-LABEL: @sitofp_16i16_16f32(
-; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
-; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
-; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
-; SSE-NEXT:    [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
-; SSE-NEXT:    [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
-; SSE-NEXT:    [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
-; SSE-NEXT:    [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
-; SSE-NEXT:    [[LD8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8), align 16
-; SSE-NEXT:    [[LD9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 9), align 2
-; SSE-NEXT:    [[LD10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 10), align 4
-; SSE-NEXT:    [[LD11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 11), align 2
-; SSE-NEXT:    [[LD12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12), align 8
-; SSE-NEXT:    [[LD13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 13), align 2
-; SSE-NEXT:    [[LD14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 14), align 4
-; SSE-NEXT:    [[LD15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 15), align 2
-; SSE-NEXT:    [[CVT0:%.*]] = sitofp i16 [[LD0]] to float
-; SSE-NEXT:    [[CVT1:%.*]] = sitofp i16 [[LD1]] to float
-; SSE-NEXT:    [[CVT2:%.*]] = sitofp i16 [[LD2]] to float
-; SSE-NEXT:    [[CVT3:%.*]] = sitofp i16 [[LD3]] to float
-; SSE-NEXT:    [[CVT4:%.*]] = sitofp i16 [[LD4]] to float
-; SSE-NEXT:    [[CVT5:%.*]] = sitofp i16 [[LD5]] to float
-; SSE-NEXT:    [[CVT6:%.*]] = sitofp i16 [[LD6]] to float
-; SSE-NEXT:    [[CVT7:%.*]] = sitofp i16 [[LD7]] to float
-; SSE-NEXT:    [[CVT8:%.*]] = sitofp i16 [[LD8]] to float
-; SSE-NEXT:    [[CVT9:%.*]] = sitofp i16 [[LD9]] to float
-; SSE-NEXT:    [[CVT10:%.*]] = sitofp i16 [[LD10]] to float
-; SSE-NEXT:    [[CVT11:%.*]] = sitofp i16 [[LD11]] to float
-; SSE-NEXT:    [[CVT12:%.*]] = sitofp i16 [[LD12]] to float
-; SSE-NEXT:    [[CVT13:%.*]] = sitofp i16 [[LD13]] to float
-; SSE-NEXT:    [[CVT14:%.*]] = sitofp i16 [[LD14]] to float
-; SSE-NEXT:    [[CVT15:%.*]] = sitofp i16 [[LD15]] to float
-; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
-; SSE-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; SSE-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
-; SSE-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
-; SSE-NEXT:    store float [[CVT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 32
-; SSE-NEXT:    store float [[CVT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
-; SSE-NEXT:    store float [[CVT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
-; SSE-NEXT:    store float [[CVT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
-; SSE-NEXT:    store float [[CVT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
-; SSE-NEXT:    store float [[CVT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
-; SSE-NEXT:    store float [[CVT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
-; SSE-NEXT:    store float [[CVT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i16 [[TMP5]] to float
+; SSE-NEXT:    [[TMP6:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i16 [[TMP6]] to float
+; SSE-NEXT:    [[TMP7:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2
+; SSE-NEXT:    [[CVT2:%.*]] = sitofp i16 [[TMP7]] to float
+; SSE-NEXT:    [[TMP8:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+; SSE-NEXT:    [[CVT3:%.*]] = sitofp i16 [[TMP8]] to float
+; SSE-NEXT:    [[TMP9:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0
+; SSE-NEXT:    [[CVT4:%.*]] = sitofp i16 [[TMP9]] to float
+; SSE-NEXT:    [[TMP10:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1
+; SSE-NEXT:    [[CVT5:%.*]] = sitofp i16 [[TMP10]] to float
+; SSE-NEXT:    [[TMP11:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
+; SSE-NEXT:    [[CVT6:%.*]] = sitofp i16 [[TMP11]] to float
+; SSE-NEXT:    [[TMP12:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+; SSE-NEXT:    [[CVT7:%.*]] = sitofp i16 [[TMP12]] to float
+; SSE-NEXT:    [[TMP13:%.*]] = extractelement <4 x i16> [[TMP3]], i32 0
+; SSE-NEXT:    [[CVT8:%.*]] = sitofp i16 [[TMP13]] to float
+; SSE-NEXT:    [[TMP14:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1
+; SSE-NEXT:    [[CVT9:%.*]] = sitofp i16 [[TMP14]] to float
+; SSE-NEXT:    [[TMP15:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2
+; SSE-NEXT:    [[CVT10:%.*]] = sitofp i16 [[TMP15]] to float
+; SSE-NEXT:    [[TMP16:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
+; SSE-NEXT:    [[CVT11:%.*]] = sitofp i16 [[TMP16]] to float
+; SSE-NEXT:    [[TMP17:%.*]] = extractelement <4 x i16> [[TMP4]], i32 0
+; SSE-NEXT:    [[CVT12:%.*]] = sitofp i16 [[TMP17]] to float
+; SSE-NEXT:    [[TMP18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 1
+; SSE-NEXT:    [[CVT13:%.*]] = sitofp i16 [[TMP18]] to float
+; SSE-NEXT:    [[TMP19:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
+; SSE-NEXT:    [[CVT14:%.*]] = sitofp i16 [[TMP19]] to float
+; SSE-NEXT:    [[TMP20:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
+; SSE-NEXT:    [[CVT15:%.*]] = sitofp i16 [[TMP20]] to float
+; SSE-NEXT:    [[TMP21:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0
+; SSE-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[CVT1]], i32 1
+; SSE-NEXT:    [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[CVT2]], i32 2
+; SSE-NEXT:    [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[CVT3]], i32 3
+; SSE-NEXT:    store <4 x float> [[TMP24]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    [[TMP25:%.*]] = insertelement <4 x float> poison, float [[CVT4]], i32 0
+; SSE-NEXT:    [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[CVT5]], i32 1
+; SSE-NEXT:    [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[CVT6]], i32 2
+; SSE-NEXT:    [[TMP28:%.*]] = insertelement <4 x float> [[TMP27]], float [[CVT7]], i32 3
+; SSE-NEXT:    store <4 x float> [[TMP28]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    [[TMP29:%.*]] = insertelement <4 x float> poison, float [[CVT8]], i32 0
+; SSE-NEXT:    [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[CVT9]], i32 1
+; SSE-NEXT:    [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[CVT10]], i32 2
+; SSE-NEXT:    [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[CVT11]], i32 3
+; SSE-NEXT:    store <4 x float> [[TMP32]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
+; SSE-NEXT:    [[TMP33:%.*]] = insertelement <4 x float> poison, float [[CVT12]], i32 0
+; SSE-NEXT:    [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[CVT13]], i32 1
+; SSE-NEXT:    [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[CVT14]], i32 2
+; SSE-NEXT:    [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[CVT15]], i32 3
+; SSE-NEXT:    store <4 x float> [[TMP36]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @sitofp_16i16_16f32(
Index: llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
@@ -5,18 +5,20 @@
 ; CHECK-LABEL: @rftbsub(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2
-; CHECK-NEXT:    [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = or i64 2, 1
-; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8
-; CHECK-NEXT:    [[ADD16:%.*]] = fadd double [[TMP2]], undef
+; CHECK-NEXT:    [[TMP0:%.*]] = or i64 2, 1
+; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; CHECK-NEXT:    [[ADD16:%.*]] = fadd double [[TMP3]], undef
 ; CHECK-NEXT:    [[MUL18:%.*]] = fmul double undef, [[ADD16]]
 ; CHECK-NEXT:    [[ADD19:%.*]] = fadd double undef, [[MUL18]]
 ; CHECK-NEXT:    [[SUB22:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]]
-; CHECK-NEXT:    store double [[SUB25]], double* [[ARRAYIDX6]], align 8
-; CHECK-NEXT:    [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]]
-; CHECK-NEXT:    store double [[SUB29]], double* [[ARRAYIDX12]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[ADD19]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[SUB22]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8
 ; CHECK-NEXT:    unreachable
 ;
 entry:
Index: llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ-BDVER1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ-AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256DQ
 
@@ -144,6 +144,24 @@
 ; AVX256NODQ-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 ; AVX256NODQ-NEXT:    ret void
 ;
+; AVX256NODQ-BDVER1-LABEL: @uitofp_2i32_2f64(
+; AVX256NODQ-BDVER1-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+; AVX256NODQ-BDVER1-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+; AVX256NODQ-BDVER1-NEXT:    [[CVT0:%.*]] = uitofp i32 [[LD0]] to double
+; AVX256NODQ-BDVER1-NEXT:    [[CVT1:%.*]] = uitofp i32 [[LD1]] to double
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-BDVER1-NEXT:    ret void
+;
+; AVX256NODQ-AVX2-LABEL: @uitofp_2i32_2f64(
+; AVX256NODQ-AVX2-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+; AVX256NODQ-AVX2-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+; AVX256NODQ-AVX2-NEXT:    [[CVT0:%.*]] = uitofp i32 [[LD0]] to double
+; AVX256NODQ-AVX2-NEXT:    [[CVT1:%.*]] = uitofp i32 [[LD1]] to double
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-AVX2-NEXT:    ret void
+;
 ; AVX512-LABEL: @uitofp_2i32_2f64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64
 ; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double>
@@ -417,6 +435,24 @@
 ; AVX256NODQ-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 ; AVX256NODQ-NEXT:    ret void
 ;
+; AVX256NODQ-BDVER1-LABEL: @uitofp_2i8_2f64(
+; AVX256NODQ-BDVER1-NEXT:    [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+; AVX256NODQ-BDVER1-NEXT:    [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+; AVX256NODQ-BDVER1-NEXT:    [[CVT0:%.*]] = uitofp i8 [[LD0]] to double
+; AVX256NODQ-BDVER1-NEXT:    [[CVT1:%.*]] = uitofp i8 [[LD1]] to double
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-BDVER1-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-BDVER1-NEXT:    ret void
+;
+; AVX256NODQ-AVX2-LABEL: @uitofp_2i8_2f64(
+; AVX256NODQ-AVX2-NEXT:    [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+; AVX256NODQ-AVX2-NEXT:    [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+; AVX256NODQ-AVX2-NEXT:    [[CVT0:%.*]] = uitofp i8 [[LD0]] to double
+; AVX256NODQ-AVX2-NEXT:    [[CVT1:%.*]] = uitofp i8 [[LD1]] to double
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-AVX2-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-AVX2-NEXT:    ret void
+;
 ; AVX512-LABEL: @uitofp_2i8_2f64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64
 ; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double>
@@ -599,6 +635,36 @@
 ; AVX256NODQ-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
 ; AVX256NODQ-NEXT:    ret void
 ;
+; AVX256NODQ-BDVER1-LABEL: @uitofp_4i64_4f32(
+; AVX256NODQ-BDVER1-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-BDVER1-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-BDVER1-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-BDVER1-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
+; AVX256NODQ-BDVER1-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
+; AVX256NODQ-BDVER1-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
+; AVX256NODQ-BDVER1-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
+; AVX256NODQ-BDVER1-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; AVX256NODQ-BDVER1-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; AVX256NODQ-BDVER1-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; AVX256NODQ-BDVER1-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; AVX256NODQ-BDVER1-NEXT:    ret void
+;
+; AVX256NODQ-AVX2-LABEL: @uitofp_4i64_4f32(
+; AVX256NODQ-AVX2-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-AVX2-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-AVX2-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-AVX2-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-AVX2-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
+; AVX256NODQ-AVX2-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
+; AVX256NODQ-AVX2-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
+; AVX256NODQ-AVX2-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
+; AVX256NODQ-AVX2-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; AVX256NODQ-AVX2-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; AVX256NODQ-AVX2-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; AVX256NODQ-AVX2-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; AVX256NODQ-AVX2-NEXT:    ret void
+;
 ; AVX512-LABEL: @uitofp_4i64_4f32(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
 ; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
@@ -681,6 +747,50 @@
 ; AVX256NODQ-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
 ; AVX256NODQ-NEXT:    ret void
 ;
+; AVX256NODQ-BDVER1-LABEL: @uitofp_8i64_8f32(
+; AVX256NODQ-BDVER1-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX256NODQ-BDVER1-NEXT:    [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float>
+; AVX256NODQ-BDVER1-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP2]], i32 7
+; AVX256NODQ-BDVER1-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP2]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[TMP5:%.*]] = insertelement <8 x float> poison, float [[TMP4]], i32 0
+; AVX256NODQ-BDVER1-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[TMP2]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[TMP7:%.*]] = insertelement <8 x float> [[TMP5]], float [[TMP6]], i32 1
+; AVX256NODQ-BDVER1-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[TMP2]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[TMP9:%.*]] = insertelement <8 x float> [[TMP7]], float [[TMP8]], i32 2
+; AVX256NODQ-BDVER1-NEXT:    [[TMP10:%.*]] = extractelement <8 x float> [[TMP2]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    [[TMP11:%.*]] = insertelement <8 x float> [[TMP9]], float [[TMP10]], i32 3
+; AVX256NODQ-BDVER1-NEXT:    [[TMP12:%.*]] = extractelement <8 x float> [[TMP2]], i32 4
+; AVX256NODQ-BDVER1-NEXT:    [[TMP13:%.*]] = insertelement <8 x float> [[TMP11]], float [[TMP12]], i32 4
+; AVX256NODQ-BDVER1-NEXT:    [[TMP14:%.*]] = extractelement <8 x float> [[TMP2]], i32 5
+; AVX256NODQ-BDVER1-NEXT:    [[TMP15:%.*]] = insertelement <8 x float> [[TMP13]], float [[TMP14]], i32 5
+; AVX256NODQ-BDVER1-NEXT:    [[TMP16:%.*]] = extractelement <8 x float> [[TMP2]], i32 6
+; AVX256NODQ-BDVER1-NEXT:    [[TMP17:%.*]] = insertelement <8 x float> [[TMP15]], float [[TMP16]], i32 6
+; AVX256NODQ-BDVER1-NEXT:    [[TMP18:%.*]] = insertelement <8 x float> [[TMP17]], float [[TMP3]], i32 7
+; AVX256NODQ-BDVER1-NEXT:    store <8 x float> [[TMP18]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX256NODQ-BDVER1-NEXT:    ret void
+;
+; AVX256NODQ-AVX2-LABEL: @uitofp_8i64_8f32(
+; AVX256NODQ-AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX256NODQ-AVX2-NEXT:    [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float>
+; AVX256NODQ-AVX2-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP2]], i32 7
+; AVX256NODQ-AVX2-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP2]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[TMP5:%.*]] = insertelement <8 x float> poison, float [[TMP4]], i32 0
+; AVX256NODQ-AVX2-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[TMP2]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[TMP7:%.*]] = insertelement <8 x float> [[TMP5]], float [[TMP6]], i32 1
+; AVX256NODQ-AVX2-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[TMP2]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[TMP9:%.*]] = insertelement <8 x float> [[TMP7]], float [[TMP8]], i32 2
+; AVX256NODQ-AVX2-NEXT:    [[TMP10:%.*]] = extractelement <8 x float> [[TMP2]], i32 3
+; AVX256NODQ-AVX2-NEXT:    [[TMP11:%.*]] = insertelement <8 x float> [[TMP9]], float [[TMP10]], i32 3
+; AVX256NODQ-AVX2-NEXT:    [[TMP12:%.*]] = extractelement <8 x float> [[TMP2]], i32 4
+; AVX256NODQ-AVX2-NEXT:    [[TMP13:%.*]] = insertelement <8 x float> [[TMP11]], float [[TMP12]], i32 4
+; AVX256NODQ-AVX2-NEXT:    [[TMP14:%.*]] = extractelement <8 x float> [[TMP2]], i32 5
+; AVX256NODQ-AVX2-NEXT:    [[TMP15:%.*]] = insertelement <8 x float> [[TMP13]], float [[TMP14]], i32 5
+; AVX256NODQ-AVX2-NEXT:    [[TMP16:%.*]] = extractelement <8 x float> [[TMP2]], i32 6
+; AVX256NODQ-AVX2-NEXT:    [[TMP17:%.*]] = insertelement <8 x float> [[TMP15]], float [[TMP16]], i32 6
+; AVX256NODQ-AVX2-NEXT:    [[TMP18:%.*]] = insertelement <8 x float> [[TMP17]], float [[TMP3]], i32 7
+; AVX256NODQ-AVX2-NEXT:    store <8 x float> [[TMP18]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX256NODQ-AVX2-NEXT:    ret void
+;
 ; AVX512-LABEL: @uitofp_8i64_8f32(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
 ; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float>
@@ -869,18 +979,20 @@
 
 define void @uitofp_4i16_4f32() #0 {
 ; SSE-LABEL: @uitofp_4i16_4f32(
-; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
-; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
-; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
-; SSE-NEXT:    [[CVT0:%.*]] = uitofp i16 [[LD0]] to float
-; SSE-NEXT:    [[CVT1:%.*]] = uitofp i16 [[LD1]] to float
-; SSE-NEXT:    [[CVT2:%.*]] = uitofp i16 [[LD2]] to float
-; SSE-NEXT:    [[CVT3:%.*]] = uitofp i16 [[LD3]] to float
-; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0
+; SSE-NEXT:    [[CVT0:%.*]] = uitofp i16 [[TMP2]] to float
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1
+; SSE-NEXT:    [[CVT1:%.*]] = uitofp i16 [[TMP3]] to float
+; SSE-NEXT:    [[TMP4:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2
+; SSE-NEXT:    [[CVT2:%.*]] = uitofp i16 [[TMP4]] to float
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+; SSE-NEXT:    [[CVT3:%.*]] = uitofp i16 [[TMP5]] to float
+; SSE-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0
+; SSE-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[CVT1]], i32 1
+; SSE-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[CVT2]], i32 2
+; SSE-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[CVT3]], i32 3
+; SSE-NEXT:    store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @uitofp_4i16_4f32(
@@ -906,30 +1018,34 @@
 
 define void @uitofp_8i16_8f32() #0 {
 ; SSE-LABEL: @uitofp_8i16_8f32(
-; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
-; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
-; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
-; SSE-NEXT:    [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
-; SSE-NEXT:    [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
-; SSE-NEXT:    [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
-; SSE-NEXT:    [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
-; SSE-NEXT:    [[CVT0:%.*]] = uitofp i16 [[LD0]] to float
-; SSE-NEXT:    [[CVT1:%.*]] = uitofp i16 [[LD1]] to float
-; SSE-NEXT:    [[CVT2:%.*]] = uitofp i16 [[LD2]] to float
-; SSE-NEXT:    [[CVT3:%.*]] = uitofp i16 [[LD3]] to float
-; SSE-NEXT:    [[CVT4:%.*]] = uitofp i16 [[LD4]] to float
-; SSE-NEXT:    [[CVT5:%.*]] = uitofp i16 [[LD5]] to float
-; SSE-NEXT:    [[CVT6:%.*]] = uitofp i16 [[LD6]] to float
-; SSE-NEXT:    [[CVT7:%.*]] = uitofp i16 [[LD7]] to float
-; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
-; SSE-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; SSE-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
-; SSE-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0
+; SSE-NEXT:    [[CVT0:%.*]] = uitofp i16 [[TMP3]] to float
+; SSE-NEXT:    [[TMP4:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1
+; SSE-NEXT:    [[CVT1:%.*]] = uitofp i16 [[TMP4]] to float
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2
+; SSE-NEXT:    [[CVT2:%.*]] = uitofp i16 [[TMP5]] to float
+; SSE-NEXT:    [[TMP6:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+; SSE-NEXT:    [[CVT3:%.*]] = uitofp i16 [[TMP6]] to float
+; SSE-NEXT:    [[TMP7:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0
+; SSE-NEXT:    [[CVT4:%.*]] = uitofp i16 [[TMP7]] to float
+; SSE-NEXT:    [[TMP8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1
+; SSE-NEXT:    [[CVT5:%.*]] = uitofp i16 [[TMP8]] to float
+; SSE-NEXT:    [[TMP9:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
+; SSE-NEXT:    [[CVT6:%.*]] = uitofp i16 [[TMP9]] to float
+; SSE-NEXT:    [[TMP10:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+; SSE-NEXT:    [[CVT7:%.*]] = uitofp i16 [[TMP10]] to float
+; SSE-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0
+; SSE-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[CVT1]], i32 1
+; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[CVT2]], i32 2
+; SSE-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[CVT3]], i32 3
+; SSE-NEXT:    store <4 x float> [[TMP14]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    [[TMP15:%.*]] = insertelement <4 x float> poison, float [[CVT4]], i32 0
+; SSE-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[CVT5]], i32 1
+; SSE-NEXT:    [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[CVT6]], i32 2
+; SSE-NEXT:    [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[CVT7]], i32 3
+; SSE-NEXT:    store <4 x float> [[TMP18]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @uitofp_8i16_8f32(
@@ -967,54 +1083,62 @@
 
 define void @uitofp_16i16_16f32() #0 {
 ; SSE-LABEL: @uitofp_16i16_16f32(
-; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
-; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
-; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
-; SSE-NEXT:    [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
-; SSE-NEXT:    [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
-; SSE-NEXT:    [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
-; SSE-NEXT:    [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
-; SSE-NEXT:    [[LD8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8), align 16
-; SSE-NEXT:    [[LD9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 9), align 2
-; SSE-NEXT:    [[LD10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 10), align 4
-; SSE-NEXT:    [[LD11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 11), align 2
-; SSE-NEXT:    [[LD12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12), align 8
-; SSE-NEXT:    [[LD13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 13), align 2
-; SSE-NEXT:    [[LD14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 14), align 4
-; SSE-NEXT:    [[LD15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 15), align 2
-; SSE-NEXT:    [[CVT0:%.*]] = uitofp i16 [[LD0]] to float
-; SSE-NEXT:    [[CVT1:%.*]] = uitofp i16 [[LD1]] to float
-; SSE-NEXT:    [[CVT2:%.*]] = uitofp i16 [[LD2]] to float
-; SSE-NEXT:    [[CVT3:%.*]] = uitofp i16 [[LD3]] to float
-; SSE-NEXT:    [[CVT4:%.*]] = uitofp i16 [[LD4]] to float
-; SSE-NEXT:    [[CVT5:%.*]] = uitofp i16 [[LD5]] to float
-; SSE-NEXT:    [[CVT6:%.*]] = uitofp i16 [[LD6]] to float
-; SSE-NEXT:    [[CVT7:%.*]] = uitofp i16 [[LD7]] to float
-; SSE-NEXT:    [[CVT8:%.*]] = uitofp i16 [[LD8]] to float
-; SSE-NEXT:    [[CVT9:%.*]] = uitofp i16 [[LD9]] to float
-; SSE-NEXT:    [[CVT10:%.*]] = uitofp i16 [[LD10]] to float
-; SSE-NEXT:    [[CVT11:%.*]] = uitofp i16 [[LD11]] to float
-; SSE-NEXT:    [[CVT12:%.*]] = uitofp i16 [[LD12]] to float
-; SSE-NEXT:    [[CVT13:%.*]] = uitofp i16 [[LD13]] to float
-; SSE-NEXT:    [[CVT14:%.*]] = uitofp i16 [[LD14]] to float
-; SSE-NEXT:    [[CVT15:%.*]] = uitofp i16 [[LD15]] to float
-; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
-; SSE-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; SSE-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
-; SSE-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
-; SSE-NEXT:    store float [[CVT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 32
-; SSE-NEXT:    store float [[CVT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
-; SSE-NEXT:    store float [[CVT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
-; SSE-NEXT:    store float [[CVT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
-; SSE-NEXT:    store float [[CVT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
-; SSE-NEXT:    store float [[CVT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
-; SSE-NEXT:    store float [[CVT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
-; SSE-NEXT:    store float [[CVT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0
+; SSE-NEXT:    [[CVT0:%.*]] = uitofp i16 [[TMP5]] to float
+; SSE-NEXT:    [[TMP6:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1
+; SSE-NEXT:    [[CVT1:%.*]] = uitofp i16 [[TMP6]] to float
+; SSE-NEXT:    [[TMP7:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2
+; SSE-NEXT:    [[CVT2:%.*]] = uitofp i16 [[TMP7]] to float
+; SSE-NEXT:    [[TMP8:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+; SSE-NEXT:    [[CVT3:%.*]] = uitofp i16 [[TMP8]] to float
+; SSE-NEXT:    [[TMP9:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0
+; SSE-NEXT:    [[CVT4:%.*]] = uitofp i16 [[TMP9]] to float
+; SSE-NEXT:    [[TMP10:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1
+; SSE-NEXT:    [[CVT5:%.*]] = uitofp i16 [[TMP10]] to float
+; SSE-NEXT:    [[TMP11:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
+; SSE-NEXT:    [[CVT6:%.*]] = uitofp i16 [[TMP11]] to float
+; SSE-NEXT:    [[TMP12:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+; SSE-NEXT:    [[CVT7:%.*]] = uitofp i16 [[TMP12]] to float
+; SSE-NEXT:    [[TMP13:%.*]] = extractelement <4 x i16> [[TMP3]], i32 0
+; SSE-NEXT:    [[CVT8:%.*]] = uitofp i16 [[TMP13]] to float
+; SSE-NEXT:    [[TMP14:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1
+; SSE-NEXT:    [[CVT9:%.*]] = uitofp i16 [[TMP14]] to float
+; SSE-NEXT:    [[TMP15:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2
+; SSE-NEXT:    [[CVT10:%.*]] = uitofp i16 [[TMP15]] to float
+; SSE-NEXT:    [[TMP16:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
+; SSE-NEXT:    [[CVT11:%.*]] = uitofp i16 [[TMP16]] to float
+; SSE-NEXT:    [[TMP17:%.*]] = extractelement <4 x i16> [[TMP4]], i32 0
+; SSE-NEXT:    [[CVT12:%.*]] = uitofp i16 [[TMP17]] to float
+; SSE-NEXT:    [[TMP18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 1
+; SSE-NEXT:    [[CVT13:%.*]] = uitofp i16 [[TMP18]] to float
+; SSE-NEXT:    [[TMP19:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
+; SSE-NEXT:    [[CVT14:%.*]] = uitofp i16 [[TMP19]] to float
+; SSE-NEXT:    [[TMP20:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
+; SSE-NEXT:    [[CVT15:%.*]] = uitofp i16 [[TMP20]] to float
+; SSE-NEXT:    [[TMP21:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0
+; SSE-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[CVT1]], i32 1
+; SSE-NEXT:    [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[CVT2]], i32 2
+; SSE-NEXT:    [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[CVT3]], i32 3
+; SSE-NEXT:    store <4 x float> [[TMP24]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    [[TMP25:%.*]] = insertelement <4 x float> poison, float [[CVT4]], i32 0
+; SSE-NEXT:    [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[CVT5]], i32 1
+; SSE-NEXT:    [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[CVT6]], i32 2
+; SSE-NEXT:    [[TMP28:%.*]] = insertelement <4 x float> [[TMP27]], float [[CVT7]], i32 3
+; SSE-NEXT:    store <4 x float> [[TMP28]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    [[TMP29:%.*]] = insertelement <4 x float> poison, float [[CVT8]], i32 0
+; SSE-NEXT:    [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[CVT9]], i32 1
+; SSE-NEXT:    [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[CVT10]], i32 2
+; SSE-NEXT:    [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[CVT11]], i32 3
+; SSE-NEXT:    store <4 x float> [[TMP32]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
+; SSE-NEXT:    [[TMP33:%.*]] = insertelement <4 x float> poison, float [[CVT12]], i32 0
+; SSE-NEXT:    [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[CVT13]], i32 1
+; SSE-NEXT:    [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[CVT14]], i32 2
+; SSE-NEXT:    [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[CVT15]], i32 3
+; SSE-NEXT:    store <4 x float> [[TMP36]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @uitofp_16i16_16f32(