Index: include/llvm/Transforms/Vectorize/SLPVectorizer.h =================================================================== --- include/llvm/Transforms/Vectorize/SLPVectorizer.h +++ include/llvm/Transforms/Vectorize/SLPVectorizer.h @@ -59,8 +59,8 @@ struct SLPVectorizerPass : public PassInfoMixin { using StoreList = SmallVector; using StoreListMap = MapVector; - using WeakTrackingVHList = SmallVector; - using WeakTrackingVHListMap = MapVector; + using GEPList = SmallVector; + using GEPListMap = MapVector; ScalarEvolution *SE = nullptr; TargetTransformInfo *TTI = nullptr; @@ -131,7 +131,7 @@ /// Tries to vectorize constructs started from CmpInst, InsertValueInst or /// InsertElementInst instructions. - bool vectorizeSimpleInstructions(SmallVectorImpl &Instructions, + bool vectorizeSimpleInstructions(SmallVectorImpl &Instructions, BasicBlock *BB, slpvectorizer::BoUpSLP &R); /// \brief Scan the basic block and look for patterns that are likely to start @@ -147,7 +147,7 @@ StoreListMap Stores; /// The getelementptr instructions in a basic block organized by base pointer. - WeakTrackingVHListMap GEPs; + GEPListMap GEPs; }; } // end namespace llvm Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -627,6 +627,14 @@ OptimizationRemarkEmitter *getORE() { return ORE; } + /// Checks if the instruction is marked for deletion. + bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); } + + /// Marks values for later deletion. + void eraseInstructions(ArrayRef AV); + + ~BoUpSLP(); + private: struct TreeEntry; @@ -814,14 +822,12 @@ /// AliasCache, which can happen if a new instruction is allocated at the /// same address as a previously deleted instruction. void eraseInstruction(Instruction *I) { - I->removeFromParent(); - I->dropAllReferences(); - DeletedInstructions.emplace_back(I); + DeletedInstructions.insert(I); } /// Temporary store for deleted instructions. Instructions will be deleted /// eventually when the BoUpSLP is destructed. - SmallVector DeletedInstructions; + SmallPtrSet DeletedInstructions; /// A list of values that need to extracted out of the tree. /// This list holds pairs of (Internal Scalar : External User). External User @@ -1303,6 +1309,24 @@ } // end namespace llvm +BoUpSLP::~BoUpSLP() { + std::for_each(DeletedInstructions.begin(), DeletedInstructions.end(), + [](Instruction *I) { I->dropAllReferences(); }); + std::for_each(DeletedInstructions.begin(), DeletedInstructions.end(), + [](Instruction *I) { + assert(I->use_empty() && + "trying to erase instruction with users."); + I->eraseFromParent(); + }); +} + +void BoUpSLP::eraseInstructions(ArrayRef AV) { + std::for_each(AV.begin(), AV.end(), [this](Value *V) { + if (auto *I = dyn_cast(V)) + eraseInstruction(I); + }); +} + void BoUpSLP::buildTree(ArrayRef Roots, ArrayRef UserIgnoreLst) { ExtraValueToDebugLocsMap ExternallyUsedValues; @@ -2688,7 +2712,7 @@ // Generate the 'InsertElement' instruction. 
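(Illustrative aside, not part of the patch.) The hunks above replace eager erasure with deferred deletion: eraseInstruction() now only records the instruction in a set, isDeleted() lets later code skip dead instructions, and the new ~BoUpSLP() destructor frees everything in two passes (drop all references first, then erase), so mutual uses among doomed instructions never trip the use-count assertion. Below is a minimal standalone C++ model of that scheme; Node, DeferredDeleter, markForDeletion and NumUses are made-up stand-ins for Instruction, BoUpSLP, eraseInstruction and the use list, not the patch's real types.

#include <cassert>
#include <set>
#include <vector>

struct Node {
  std::vector<Node *> Operands; // outgoing references ("uses" of other nodes)
  int NumUses = 0;              // how many nodes still reference this one
};

class DeferredDeleter {
  std::set<Node *> Doomed; // plays the role of BoUpSLP::DeletedInstructions

public:
  // Like the new eraseInstruction(): only mark, do not touch the graph yet.
  void markForDeletion(Node *N) { Doomed.insert(N); }

  // Like BoUpSLP::isDeleted(): lets later passes skip dead nodes.
  bool isDeleted(Node *N) const { return Doomed.count(N) != 0; }

  // Like ~BoUpSLP(): first drop references between doomed nodes, then free
  // them, so that references among dead nodes never dangle.
  ~DeferredDeleter() {
    for (Node *N : Doomed) {
      for (Node *Op : N->Operands)
        --Op->NumUses;
      N->Operands.clear();
    }
    for (Node *N : Doomed) {
      assert(N->NumUses == 0 && "trying to erase a node with users");
      delete N;
    }
  }
};
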
for (unsigned i = 0; i < Ty->getNumElements(); ++i) { Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); - if (Instruction *Insrt = dyn_cast(Vec)) { + if (auto *Insrt = dyn_cast(Vec)) { GatherSeq.insert(Insrt); CSEBlocks.insert(Insrt->getParent()); @@ -3277,20 +3301,18 @@ for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; +#ifndef NDEBUG Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { -#ifndef NDEBUG for (User *U : Scalar->users()) { DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); - // It is legal to replace users in the ignorelist by undef. + // It is legal to delete users in the ignorelist. assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && - "Replacing out-of-tree value with undef"); + "Deleting out-of-tree value"); } -#endif - Value *Undef = UndefValue::get(Ty); - Scalar->replaceAllUsesWith(Undef); } +#endif DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); eraseInstruction(cast(Scalar)); } @@ -3305,10 +3327,8 @@ DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() << " gather sequences instructions.\n"); // LICM InsertElementInst sequences. - for (Instruction *it : GatherSeq) { - InsertElementInst *Insert = dyn_cast(it); - - if (!Insert) + for (auto *Insert : GatherSeq) { + if (isDeleted(Insert)) continue; // Check if this block is inside a loop. @@ -3362,6 +3382,8 @@ // For all instructions in blocks containing gather sequences: for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { Instruction *In = &*it++; + if (isDeleted(In)) + continue; if (!isa(In) && !isa(In)) continue; @@ -4237,19 +4259,6 @@ return Changed; } -/// \brief Check that the Values in the slice in VL array are still existent in -/// the WeakTrackingVH array. -/// Vectorization of part of the VL array may cause later values in the VL array -/// to become invalid. We track when this has happened in the WeakTrackingVH -/// array. -static bool hasValueBeenRAUWed(ArrayRef VL, - ArrayRef VH, unsigned SliceBegin, - unsigned SliceSize) { - VL = VL.slice(SliceBegin, SliceSize); - VH = VH.slice(SliceBegin, SliceSize); - return !std::equal(VL.begin(), VL.end(), VH.begin()); -} - bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, unsigned VecRegSize) { unsigned ChainLen = Chain.size(); @@ -4261,22 +4270,21 @@ if (!isPowerOf2_32(Sz) || VF < 2) return false; - // Keep track of values that were deleted by vectorizing in the loop below. - SmallVector TrackValues(Chain.begin(), Chain.end()); - bool Changed = false; // Look for profitable vectorizable trees at all offsets, starting at zero. for (unsigned i = 0, e = ChainLen; i < e; ++i) { if (i + VF > e) break; + ArrayRef Operands = Chain.slice(i, VF); // Check that a previous iteration of this loop did not delete the Value. - if (hasValueBeenRAUWed(Chain, TrackValues, i, VF)) + if (llvm::any_of(Operands, [&R](Value *V) { + auto *I = dyn_cast(V); + return I && R.isDeleted(I); + })) continue; - DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i << "\n"); - ArrayRef Operands = Chain.slice(i, VF); R.buildTree(Operands); if (R.isTreeTinyAndNotFullyVectorizable()) @@ -4454,9 +4462,6 @@ bool Changed = false; - // Keep track of values that were deleted by vectorizing in the loop below. 
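(Illustrative aside, not part of the patch.) The vectorizeStoreChain hunk above swaps the old WeakTrackingVH tracking for a direct query: before analyzing a slice of the chain, it skips the slice if any element was already marked deleted by an earlier, overlapping vectorization. A standalone sketch of that check, with ValueId and the container types as simplified stand-ins for llvm::Value * and ArrayRef:

#include <algorithm>
#include <cstddef>
#include <unordered_set>
#include <vector>

using ValueId = int; // stand-in for llvm::Value *

// True if any element of Chain[Begin, Begin + Size) was already erased by an
// earlier vectorization attempt. Callers are expected to guarantee that
// Begin + Size does not run past the end of the chain.
bool sliceHasDeletedValue(const std::vector<ValueId> &Chain, std::size_t Begin,
                          std::size_t Size,
                          const std::unordered_set<ValueId> &Deleted) {
  auto First = Chain.begin() + Begin;
  return std::any_of(First, First + Size,
                     [&Deleted](ValueId V) { return Deleted.count(V) != 0; });
}
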
- SmallVector TrackValues(VL.begin(), VL.end()); - unsigned NextInst = 0, MaxInst = VL.size(); for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) { @@ -4477,13 +4482,16 @@ if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2) break; + ArrayRef Ops = VL.slice(I, OpsWidth); // Check that a previous iteration of this loop did not delete the Value. - if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth)) + if (llvm::any_of(Ops, [&R](Value *V) { + auto *I = dyn_cast(V); + return I && R.isDeleted(I); + })) continue; DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " << "\n"); - ArrayRef Ops = VL.slice(I, OpsWidth); ArrayRef BuildVectorSlice; if (!BuildVector.empty()) @@ -4654,7 +4662,9 @@ /// *p = /// class HorizontalReduction { - SmallVector ReductionOps; + using ReductionOpsType = SmallVector; + using ReductionOpsListType = SmallVector; + ReductionOpsListType ReductionOps; SmallVector ReducedVals; // Use map vector to make stable output. MapVector ExtraArgs; @@ -4695,6 +4705,37 @@ (Kind == RK_UMin || Kind == RK_UMax))); } + /// Creates reduction operation with the current opcode. + Value *createOp(IRBuilder<> &Builder, const Twine &Name) const { + assert(isVectorizable() && + "Expected add|fadd or min/max reduction operation."); + Value *Cmp; + switch (Kind) { + case RK_Arithmetic: + return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS, + Name); + case RK_Min: + Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS) + : Builder.CreateFCmpOLT(LHS, RHS); + break; + case RK_Max: + Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSGT(LHS, RHS) + : Builder.CreateFCmpOGT(LHS, RHS); + break; + case RK_UMin: + assert(Opcode == Instruction::ICmp && "Expected integer types."); + Cmp = Builder.CreateICmpULT(LHS, RHS); + break; + case RK_UMax: + assert(Opcode == Instruction::ICmp && "Expected integer types."); + Cmp = Builder.CreateICmpUGT(LHS, RHS); + break; + case RK_None: + llvm_unreachable("Unknown reduction operation."); + } + return Builder.CreateSelect(Cmp, LHS, RHS, Name); + } + public: explicit OperationData() = default; @@ -4766,6 +4807,44 @@ llvm_unreachable("Reduction kind is not set"); } + void initReductionOps(ReductionOpsListType &ReductionOps) { + assert(Kind != RK_None && !!*this && LHS && RHS && + "Expected reduction operation."); + switch (Kind) { + case RK_Arithmetic: + ReductionOps.assign(1, ReductionOpsType()); + break; + case RK_Min: + case RK_UMin: + case RK_Max: + case RK_UMax: + ReductionOps.assign(2, ReductionOpsType()); + break; + case RK_None: + llvm_unreachable("Reduction kind is not set"); + } + } + /// Add all reduction operations for the reduction instruction \p I. + void addReductionOps(Instruction *I, ReductionOpsListType &ReductionOps) { + assert(Kind != RK_None && !!*this && LHS && RHS && + "Expected reduction operation."); + switch (Kind) { + case RK_Arithmetic: + ReductionOps[0].emplace_back(I); + break; + case RK_Min: + case RK_UMin: + case RK_Max: + case RK_UMax: + if (cast(I)->getCondition()->hasOneUse()) + ReductionOps[0].emplace_back(cast(I)->getCondition()); + ReductionOps[1].emplace_back(I); + break; + case RK_None: + llvm_unreachable("Reduction kind is not set"); + } + } + /// Checks if instruction is associative and can be vectorized. bool isAssociative(Instruction *I) const { assert(Kind != RK_None && *this && LHS && RHS && @@ -4834,36 +4913,57 @@ llvm_unreachable("Reduction kind is not set"); } - /// Creates reduction operation with the current opcode. 
- Value *createOp(IRBuilder<> &Builder, const Twine &Name = "") const { + /// Creates reduction operation with the current opcode with the IR flags + /// from \p ReductionOps. + Value *createOp(IRBuilder<> &Builder, const Twine &Name, + const ReductionOpsListType &ReductionOps) const { assert(isVectorizable() && "Expected add|fadd or min/max reduction operation."); - Value *Cmp; + auto *Op = createOp(Builder, Name); switch (Kind) { case RK_Arithmetic: - return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS, - Name); + propagateIRFlags(Op, ReductionOps[0]); + return Op; case RK_Min: - Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS) - : Builder.CreateFCmpOLT(LHS, RHS); - break; case RK_Max: - Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSGT(LHS, RHS) - : Builder.CreateFCmpOGT(LHS, RHS); - break; case RK_UMin: - assert(Opcode == Instruction::ICmp && "Expected integer types."); - Cmp = Builder.CreateICmpULT(LHS, RHS); + case RK_UMax: + propagateIRFlags(cast(Op)->getCondition(), ReductionOps[0]); + propagateIRFlags(Op, ReductionOps[1]); + return Op; + break; + case RK_None: break; + } + llvm_unreachable("Unknown reduction operation."); + } + + /// Creates reduction operation with the current opcode with the IR flags + /// from \p I. + Value *createOp(IRBuilder<> &Builder, const Twine &Name, + Instruction *I) const { + assert(isVectorizable() && + "Expected add|fadd or min/max reduction operation."); + auto *Op = createOp(Builder, Name); + switch (Kind) { + case RK_Arithmetic: + propagateIRFlags(Op, I); + return Op; + case RK_Min: + case RK_Max: + case RK_UMin: case RK_UMax: - assert(Opcode == Instruction::ICmp && "Expected integer types."); - Cmp = Builder.CreateICmpUGT(LHS, RHS); + propagateIRFlags(cast(Op)->getCondition(), + cast(I)->getCondition()); + propagateIRFlags(Op, I); + return Op; break; case RK_None: - llvm_unreachable("Unknown reduction operation."); + break; } - return Builder.CreateSelect(Cmp, LHS, RHS, Name); + llvm_unreachable("Unknown reduction operation."); } + TargetTransformInfo::ReductionFlags getFlags() const { TargetTransformInfo::ReductionFlags Flags; Flags.NoNaN = NoNaN; @@ -5000,6 +5100,7 @@ SmallVector, 32> Stack; Stack.push_back(std::make_pair(B, ReductionData.getFirstOperandIndex())); const unsigned NUses = ReductionData.getRequiredNumberOfUses(); + ReductionData.initReductionOps(ReductionOps); while (!Stack.empty()) { Instruction *TreeN = Stack.back().first; unsigned EdgeToVist = Stack.back().second++; @@ -5025,7 +5126,7 @@ markExtraArg(Stack[Stack.size() - 2], TreeN); ExtraArgs.erase(TreeN); } else - ReductionOps.push_back(TreeN); + ReductionData.addReductionOps(TreeN, ReductionOps); } // Retract. Stack.pop_back(); @@ -5110,14 +5211,18 @@ BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues; // The same extra argument may be used several time, so log each attempt // to use it. 
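(Illustrative aside, not part of the patch.) The next hunk flattens the per-class reduction-op lists (one list for add/fadd, two lists for the compare and the select of min/max reductions) into a single ignore list handed to buildTree(). A minimal standalone model of that flattening step, using simplified stand-in types rather than the patch's real ones:

#include <vector>

using ValuePtr = const void *;                  // stand-in for llvm::Value *
using ReductionOpsType = std::vector<ValuePtr>; // one class of reduction ops
using ReductionOpsListType = std::vector<ReductionOpsType>;

// Concatenate the per-class lists into the flat list used as the set of
// users that may legally be deleted after vectorization.
std::vector<ValuePtr>
flattenReductionOps(const ReductionOpsListType &ReductionOps) {
  std::vector<ValuePtr> IgnoreList;
  for (const ReductionOpsType &Ops : ReductionOps)
    IgnoreList.insert(IgnoreList.end(), Ops.begin(), Ops.end());
  return IgnoreList;
}
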
+ SmallVector IgnoreList; + for (auto &V : ReductionOps) + IgnoreList.append(V.begin(), V.end()); for (auto &Pair : ExtraArgs) ExternallyUsedValues[Pair.second].push_back(Pair.first); while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth); - V.buildTree(VL, ExternallyUsedValues, ReductionOps); + + V.buildTree(VL, ExternallyUsedValues, IgnoreList); if (V.shouldReorder()) { SmallVector Reversed(VL.rbegin(), VL.rend()); - V.buildTree(Reversed, ExternallyUsedValues, ReductionOps); + V.buildTree(Reversed, ExternallyUsedValues, IgnoreList); } if (V.isTreeTinyAndNotFullyVectorizable()) break; @@ -5145,14 +5250,14 @@ // Emit a reduction. Value *ReducedSubTree = - emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps, TTI); + emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); if (VectorizedTree) { Builder.SetCurrentDebugLocation(Loc); OperationData VectReductionData(ReductionData.getOpcode(), VectorizedTree, ReducedSubTree, ReductionData.getKind()); - VectorizedTree = VectReductionData.createOp(Builder, "op.rdx"); - propagateIRFlags(VectorizedTree, ReductionOps); + VectorizedTree = + VectReductionData.createOp(Builder, "op.rdx", ReductionOps); } else VectorizedTree = ReducedSubTree; i += ReduxWidth; @@ -5167,8 +5272,7 @@ OperationData VectReductionData(ReductionData.getOpcode(), VectorizedTree, I, ReductionData.getKind()); - VectorizedTree = VectReductionData.createOp(Builder); - propagateIRFlags(VectorizedTree, ReductionOps); + VectorizedTree = VectReductionData.createOp(Builder, "", ReductionOps); } for (auto &Pair : ExternallyUsedValues) { assert(!Pair.second.empty() && @@ -5179,12 +5283,14 @@ OperationData VectReductionData(ReductionData.getOpcode(), VectorizedTree, Pair.first, ReductionData.getKind()); - VectorizedTree = VectReductionData.createOp(Builder, "op.extra"); - propagateIRFlags(VectorizedTree, I); + VectorizedTree = VectReductionData.createOp(Builder, "op.extra", I); } } // Update users. ReductionRoot->replaceAllUsesWith(VectorizedTree); + // Mark all scalar reduction ops for deletion, they are replaced by the + // vector reductions. + V.eraseInstructions(IgnoreList); } return VectorizedTree != nullptr; } @@ -5264,8 +5370,7 @@ /// \brief Emit a horizontal reduction of the vectorized value. Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder, - unsigned ReduxWidth, ArrayRef RedOps, - const TargetTransformInfo *TTI) { + unsigned ReduxWidth, const TargetTransformInfo *TTI) { assert(VectorizedValue && "Need to have a vectorized tree node"); assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); @@ -5273,7 +5378,7 @@ if (!IsPairwiseReduction) return createSimpleTargetReduction( Builder, TTI, ReductionData.getOpcode(), VectorizedValue, - ReductionData.getFlags(), RedOps); + ReductionData.getFlags(), ReductionOps.back()); Value *TmpVec = VectorizedValue; for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) { @@ -5289,8 +5394,7 @@ "rdx.shuf.r"); OperationData VectReductionData(ReductionData.getOpcode(), LeftShuf, RightShuf, ReductionData.getKind()); - TmpVec = VectReductionData.createOp(Builder, "op.rdx"); - propagateIRFlags(TmpVec, RedOps); + TmpVec = VectReductionData.createOp(Builder, "op.rdx", ReductionOps); } // The result is in the first element of the vector. @@ -5438,18 +5542,13 @@ // horizontal reduction. // Interrupt the process if the Root instruction itself was vectorized or all // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized. 
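(Illustrative aside, not part of the patch.) The following hunk changes the horizontal-reduction search so the worklist holds plain Instruction pointers instead of WeakTrackingVH handles, which is safe because operands marked as deleted are simply never pushed. A standalone model of that traversal, with Inst, walkOperandTree, and the depth limit of 8 as made-up stand-ins for Instruction, the search loop, and RecursionMaxDepth:

#include <set>
#include <utility>
#include <vector>

struct Inst {
  std::vector<Inst *> Operands;
};

void walkOperandTree(Inst *Root, const std::set<Inst *> &Deleted) {
  std::vector<std::pair<Inst *, unsigned>> Stack;
  std::set<Inst *> Visited;
  Visited.insert(Root);
  Stack.emplace_back(Root, 0u);
  while (!Stack.empty()) {
    Inst *I = Stack.back().first;
    unsigned Level = Stack.back().second;
    Stack.pop_back();
    if (Level > 8) // rough stand-in for RecursionMaxDepth
      continue;
    // ... try to vectorize starting at I here ...
    for (Inst *Op : I->Operands)
      if (Visited.insert(Op).second && !Deleted.count(Op))
        Stack.emplace_back(Op, Level + 1);
  }
}
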
- SmallVector, 8> Stack(1, {Root, 0}); + SmallVector, 8> Stack(1, {Root, 0}); SmallSet VisitedInstrs; bool Res = false; while (!Stack.empty()) { - Value *V; + Instruction *Inst; unsigned Level; - std::tie(V, Level) = Stack.pop_back_val(); - if (!V) - continue; - auto *Inst = dyn_cast(V); - if (!Inst) - continue; + std::tie(Inst, Level) = Stack.pop_back_val(); auto *BI = dyn_cast(Inst); auto *SI = dyn_cast(Inst); if (BI || SI) { @@ -5490,8 +5589,8 @@ for (auto *Op : Inst->operand_values()) if (VisitedInstrs.insert(Op).second) if (auto *I = dyn_cast(Op)) - if (!isa(I) && I->getParent() == BB) - Stack.emplace_back(Op, Level); + if (!isa(I) && !R.isDeleted(I) && I->getParent() == BB) + Stack.emplace_back(I, Level); } return Res; } @@ -5556,11 +5655,10 @@ } bool SLPVectorizerPass::vectorizeSimpleInstructions( - SmallVectorImpl &Instructions, BasicBlock *BB, BoUpSLP &R) { + SmallVectorImpl &Instructions, BasicBlock *BB, BoUpSLP &R) { bool OpsChanged = false; - for (auto &VH : reverse(Instructions)) { - auto *I = dyn_cast_or_null(VH); - if (!I) + for (auto *I : reverse(Instructions)) { + if (R.isDeleted(I)) continue; if (auto *LastInsertValue = dyn_cast(I)) OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); @@ -5589,7 +5687,7 @@ if (!P) break; - if (!VisitedInstrs.count(P)) + if (!VisitedInstrs.count(P) && !R.isDeleted(P)) Incoming.push_back(P); } @@ -5632,9 +5730,12 @@ VisitedInstrs.clear(); - SmallVector PostProcessInstructions; + SmallVector PostProcessInstructions; SmallDenseSet KeyNodes; for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) { + // Skip instructions marked for the deletion. + if (R.isDeleted(&*it)) + continue; // We may go through BB multiple times so skip the one we have checked. if (!VisitedInstrs.insert(&*it).second) { if (it->use_empty() && KeyNodes.count(&*it) > 0 && @@ -5728,10 +5829,10 @@ SetVector Candidates(GEPList.begin(), GEPList.end()); // Some of the candidates may have already been vectorized after we - // initially collected them. If so, the WeakTrackingVHs will have - // nullified the - // values, so remove them from the set of candidates. - Candidates.remove(nullptr); + // initially collected them. If so, they are marked as deleted, so remove + // them from the set of candidates. + Candidates.remove_if( + [&R](Value *I) { return R.isDeleted(cast(I)); }); // Remove from the set of candidates all pairs of getelementptrs with // constant differences. Such getelementptrs are likely not good @@ -5739,18 +5840,18 @@ // computed from the other. We also ensure all candidate getelementptr // indices are unique. 
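(Illustrative aside, not part of the patch.) The loop below prunes GEP candidates whose addresses differ by a compile-time constant, since one such pointer is cheap to rederive from the other. A deliberately rough standalone model: addresses are a symbolic base plus a constant offset, and sharing a base stands in for the isa-SCEVConstant test on getMinusSCEV in the real code; all names here are invented for illustration.

#include <cstddef>
#include <set>
#include <utility>
#include <vector>

using Addr = std::pair<int /*BaseId*/, long /*Offset*/>;

// Candidates holds indices into GEPs; remove every pair whose difference is a
// known constant, mirroring the shape of the loop that follows.
void removeConstantDiffPairs(const std::vector<Addr> &GEPs,
                             std::set<std::size_t> &Candidates) {
  for (std::size_t I = 0, E = GEPs.size(); I < E && Candidates.size() > 1;
       ++I) {
    if (!Candidates.count(I))
      continue;
    for (std::size_t J = I + 1; J < E && Candidates.size() > 1; ++J) {
      if (GEPs[I].first == GEPs[J].first) { // constant difference
        Candidates.erase(I);
        Candidates.erase(J);
      }
    }
  }
}
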
for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) { - auto *GEPI = cast(GEPList[I]); + auto *GEPI = GEPList[I]; if (!Candidates.count(GEPI)) continue; auto *SCEVI = SE->getSCEV(GEPList[I]); for (int J = I + 1; J < E && Candidates.size() > 1; ++J) { - auto *GEPJ = cast(GEPList[J]); + auto *GEPJ = GEPList[J]; auto *SCEVJ = SE->getSCEV(GEPList[J]); if (isa(SE->getMinusSCEV(SCEVI, SCEVJ))) { - Candidates.remove(GEPList[I]); - Candidates.remove(GEPList[J]); + Candidates.remove(GEPI); + Candidates.remove(GEPJ); } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) { - Candidates.remove(GEPList[J]); + Candidates.remove(GEPJ); } } } Index: test/Transforms/SLPVectorizer/AArch64/gather-root.ll =================================================================== --- test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -15,18 +15,10 @@ ; DEFAULT-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer ; DEFAULT-NEXT: br label [[FOR_BODY:%.*]] ; DEFAULT: for.body: -; DEFAULT-NEXT: [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; DEFAULT-NEXT: [[TMP17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; DEFAULT-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> , <8 x i32> -; DEFAULT-NEXT: [[TMP20:%.*]] = add i32 [[TMP17]], undef -; DEFAULT-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], undef -; DEFAULT-NEXT: [[TMP24:%.*]] = add i32 [[TMP22]], undef -; DEFAULT-NEXT: [[TMP26:%.*]] = add i32 [[TMP24]], undef -; DEFAULT-NEXT: [[TMP28:%.*]] = add i32 [[TMP26]], undef -; DEFAULT-NEXT: [[TMP30:%.*]] = add i32 [[TMP28]], undef -; DEFAULT-NEXT: [[TMP32:%.*]] = add i32 [[TMP30]], undef ; DEFAULT-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP2]]) -; DEFAULT-NEXT: [[BIN_EXTRA]] = add i32 [[TMP3]], [[TMP17]] -; DEFAULT-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], undef +; DEFAULT-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], [[TMP17]] ; DEFAULT-NEXT: br label [[FOR_BODY]] ; ; GATHER-LABEL: @PR28330( @@ -47,22 +39,15 @@ ; GATHER-NEXT: [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0 ; GATHER-NEXT: br label [[FOR_BODY:%.*]] ; GATHER: for.body: -; GATHER-NEXT: [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; GATHER-NEXT: [[TMP17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; GATHER-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> , <2 x i32> ; GATHER-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 -; GATHER-NEXT: [[TMP20:%.*]] = add i32 [[TMP17]], [[TMP3]] ; GATHER-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 -; GATHER-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], [[TMP4]] ; GATHER-NEXT: [[TMP23:%.*]] = select i1 [[TMP5]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP24:%.*]] = add i32 [[TMP22]], [[TMP23]] ; GATHER-NEXT: [[TMP25:%.*]] = select i1 [[TMP7]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP26:%.*]] = add i32 [[TMP24]], [[TMP25]] ; GATHER-NEXT: [[TMP27:%.*]] = select i1 [[TMP9]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP28:%.*]] = add i32 [[TMP26]], [[TMP27]] ; GATHER-NEXT: [[TMP29:%.*]] = select i1 [[TMP11]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP30:%.*]] = add i32 [[TMP28]], [[TMP29]] ; GATHER-NEXT: [[TMP31:%.*]] = select i1 [[TMP13]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP32:%.*]] = add i32 [[TMP30]], [[TMP31]] ; GATHER-NEXT: [[TMP33:%.*]] = select i1 [[TMP15]], i32 -720, i32 -80 ; GATHER-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> undef, 
i32 [[TMP3]], i32 0 ; GATHER-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP4]], i32 1 @@ -73,8 +58,7 @@ ; GATHER-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[TMP31]], i32 6 ; GATHER-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP33]], i32 7 ; GATHER-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP12]]) -; GATHER-NEXT: [[BIN_EXTRA]] = add i32 [[TMP13]], [[TMP17]] -; GATHER-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], [[TMP33]] +; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP13]], [[TMP17]] ; GATHER-NEXT: br label [[FOR_BODY]] ; ; MAX-COST-LABEL: @PR28330( @@ -163,18 +147,10 @@ ; DEFAULT-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer ; DEFAULT-NEXT: br label [[FOR_BODY:%.*]] ; DEFAULT: for.body: -; DEFAULT-NEXT: [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; DEFAULT-NEXT: [[TMP17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; DEFAULT-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> , <8 x i32> -; DEFAULT-NEXT: [[TMP20:%.*]] = add i32 -5, undef -; DEFAULT-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], undef -; DEFAULT-NEXT: [[TMP24:%.*]] = add i32 [[TMP22]], undef -; DEFAULT-NEXT: [[TMP26:%.*]] = add i32 [[TMP24]], undef -; DEFAULT-NEXT: [[TMP28:%.*]] = add i32 [[TMP26]], undef -; DEFAULT-NEXT: [[TMP30:%.*]] = add i32 [[TMP28]], undef -; DEFAULT-NEXT: [[TMP32:%.*]] = add i32 [[TMP30]], undef ; DEFAULT-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP2]]) -; DEFAULT-NEXT: [[BIN_EXTRA]] = add i32 [[TMP3]], -5 -; DEFAULT-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], undef +; DEFAULT-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], -5 ; DEFAULT-NEXT: br label [[FOR_BODY]] ; ; GATHER-LABEL: @PR32038( @@ -195,22 +171,15 @@ ; GATHER-NEXT: [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0 ; GATHER-NEXT: br label [[FOR_BODY:%.*]] ; GATHER: for.body: -; GATHER-NEXT: [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; GATHER-NEXT: [[TMP17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; GATHER-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> , <2 x i32> ; GATHER-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 -; GATHER-NEXT: [[TMP20:%.*]] = add i32 -5, [[TMP3]] ; GATHER-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 -; GATHER-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], [[TMP4]] ; GATHER-NEXT: [[TMP23:%.*]] = select i1 [[TMP5]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP24:%.*]] = add i32 [[TMP22]], [[TMP23]] ; GATHER-NEXT: [[TMP25:%.*]] = select i1 [[TMP7]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP26:%.*]] = add i32 [[TMP24]], [[TMP25]] ; GATHER-NEXT: [[TMP27:%.*]] = select i1 [[TMP9]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP28:%.*]] = add i32 [[TMP26]], [[TMP27]] ; GATHER-NEXT: [[TMP29:%.*]] = select i1 [[TMP11]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP30:%.*]] = add i32 [[TMP28]], [[TMP29]] ; GATHER-NEXT: [[TMP31:%.*]] = select i1 [[TMP13]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP32:%.*]] = add i32 [[TMP30]], [[TMP31]] ; GATHER-NEXT: [[TMP33:%.*]] = select i1 [[TMP15]], i32 -720, i32 -80 ; GATHER-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0 ; GATHER-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP4]], i32 1 @@ -221,8 +190,7 @@ ; GATHER-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[TMP31]], i32 6 ; GATHER-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 
[[TMP33]], i32 7 ; GATHER-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP12]]) -; GATHER-NEXT: [[BIN_EXTRA]] = add i32 [[TMP13]], -5 -; GATHER-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], [[TMP33]] +; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP13]], -5 ; GATHER-NEXT: br label [[FOR_BODY]] ; ; MAX-COST-LABEL: @PR32038( @@ -230,9 +198,9 @@ ; MAX-COST-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1 ; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer ; MAX-COST-NEXT: [[TMP4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 -; MAX-COST-NEXT: [[TMPP5:%.*]] = icmp eq i8 [[TMP4]], 0 +; MAX-COST-NEXT: [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0 ; MAX-COST-NEXT: [[TMP6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 -; MAX-COST-NEXT: [[TMPP7:%.*]] = icmp eq i8 [[TMP6]], 0 +; MAX-COST-NEXT: [[TMP7:%.*]] = icmp eq i8 [[TMP6]], 0 ; MAX-COST-NEXT: [[TMP8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 ; MAX-COST-NEXT: [[TMP9:%.*]] = icmp eq i8 [[TMP8]], 0 ; MAX-COST-NEXT: [[TMP10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 @@ -248,23 +216,17 @@ ; MAX-COST-NEXT: [[TMP3:%.*]] = insertelement <4 x i1> undef, i1 [[TMP2]], i32 0 ; MAX-COST-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 ; MAX-COST-NEXT: [[TMP5:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[TMP4]], i32 1 -; MAX-COST-NEXT: [[TMP6:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[TMPP5]], i32 2 -; MAX-COST-NEXT: [[TMP7:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[TMPP7]], i32 3 +; MAX-COST-NEXT: [[TMP6:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[TMP5]], i32 2 +; MAX-COST-NEXT: [[TMP7:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[TMP7]], i32 3 ; MAX-COST-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> , <4 x i32> -; MAX-COST-NEXT: [[TMP20:%.*]] = add i32 -5, undef -; MAX-COST-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], undef -; MAX-COST-NEXT: [[TMP24:%.*]] = add i32 [[TMP22]], undef -; MAX-COST-NEXT: [[TMP26:%.*]] = add i32 [[TMP24]], undef ; MAX-COST-NEXT: [[TMP27:%.*]] = select i1 [[TMP9]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[TMP28:%.*]] = add i32 [[TMP26]], [[TMP27]] ; MAX-COST-NEXT: [[TMP29:%.*]] = select i1 [[TMP11]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP8]]) ; MAX-COST-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP27]] ; MAX-COST-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP29]] -; MAX-COST-NEXT: [[BIN_EXTRA:%.*]] = add i32 [[TMP11]], -5 -; MAX-COST-NEXT: [[TMP30:%.*]] = add i32 [[TMP28]], [[TMP29]] +; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP11]], -5 ; MAX-COST-NEXT: [[TMP31:%.*]] = select i1 [[TMP13]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[TMP32:%.*]] = add i32 [[BIN_EXTRA]], [[TMP31]] +; MAX-COST-NEXT: [[TMP32:%.*]] = add i32 [[OP_EXTRA]], [[TMP31]] ; MAX-COST-NEXT: [[TMP33:%.*]] = select i1 [[TMP15]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[TMP34]] = add i32 [[TMP32]], [[TMP33]] ; MAX-COST-NEXT: br label [[FOR_BODY]] Index: test/Transforms/SLPVectorizer/X86/PR31847.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/X86/PR31847.ll @@ -0,0 +1,151 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: 
opt -slp-vectorizer -S -o - -mtriple=i386 -mcpu=haswell < %s | FileCheck %s +target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" + +@shift = common local_unnamed_addr global [10 x i32] zeroinitializer, align 4 +@data = common local_unnamed_addr global [10 x i8*] zeroinitializer, align 4 + +define void @flat(i32 %intensity) { +; CHECK-LABEL: @flat( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([10 x i32], [10 x i32]* @shift, i32 0, i32 0), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([10 x i32], [10 x i32]* @shift, i32 0, i32 1), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i8*, i8** getelementptr inbounds ([10 x i8*], [10 x i8*]* @data, i32 0, i32 0), align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i8*, i8** getelementptr inbounds ([10 x i8*], [10 x i8*]* @data, i32 0, i32 1), align 4 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 1, [[TMP0]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 [[SHR]] +; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 1, [[TMP1]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 [[SHR1]] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[D1_DATA_046:%.*]] = phi i8* [ [[TMP3]], [[ENTRY:%.*]] ], [ [[ADD_PTR23_1:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[Y_045:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_1:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i8> undef, i8 [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i8> [[TMP6]], i8 [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = zext <2 x i8> [[TMP7]] to <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = add nsw <2 x i32> , [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp sgt <2 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = sub nsw <2 x i32> , [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP10]], <2 x i32> [[TMP9]], <2 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[IDX_NEG:%.*]] = sub nsw i32 0, [[ADD]] +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[D1_DATA_046]], i32 [[IDX_NEG]] +; CHECK-NEXT: [[TMP15:%.*]] = load i8, i8* [[ADD_PTR]], align 1 +; CHECK-NEXT: [[CONV15:%.*]] = zext i8 [[TMP15]] to i32 +; CHECK-NEXT: [[ADD16:%.*]] = add nsw i32 [[CONV15]], [[INTENSITY:%.*]] +; CHECK-NEXT: [[CONV17:%.*]] = trunc i32 [[ADD16]] to i8 +; CHECK-NEXT: store i8 [[CONV17]], i8* [[ADD_PTR]], align 1 +; CHECK-NEXT: [[ADD_PTR18:%.*]] = getelementptr inbounds i8, i8* [[D1_DATA_046]], i32 [[ADD]] +; CHECK-NEXT: [[TMP16:%.*]] = load i8, i8* [[ADD_PTR18]], align 1 +; CHECK-NEXT: [[NOT_TOBOOL:%.*]] = icmp eq i8 [[TMP16]], 0 +; CHECK-NEXT: [[CONV21:%.*]] = zext i1 [[NOT_TOBOOL]] to i8 +; CHECK-NEXT: store i8 [[CONV21]], i8* [[ADD_PTR18]], align 1 +; CHECK-NEXT: [[ADD_PTR23:%.*]] = getelementptr inbounds i8, i8* [[D1_DATA_046]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP18:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP18]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP17]], i32 1 +; CHECK-NEXT: [[TMP21:%.*]] = 
zext <2 x i8> [[TMP20]] to <2 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = add nsw <2 x i32> , [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = icmp sgt <2 x i32> [[TMP22]], +; CHECK-NEXT: [[TMP24:%.*]] = sub nsw <2 x i32> , [[TMP21]] +; CHECK-NEXT: [[TMP25:%.*]] = select <2 x i1> [[TMP23]], <2 x i32> [[TMP22]], <2 x i32> [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x i32> [[TMP25]], i32 0 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x i32> [[TMP25]], i32 1 +; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP26]], [[TMP27]] +; CHECK-NEXT: [[IDX_NEG_1:%.*]] = sub nsw i32 0, [[ADD_1]] +; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR23]], i32 [[IDX_NEG_1]] +; CHECK-NEXT: [[TMP28:%.*]] = load i8, i8* [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[CONV15_1:%.*]] = zext i8 [[TMP28]] to i32 +; CHECK-NEXT: [[ADD16_1:%.*]] = add nsw i32 [[CONV15_1]], [[INTENSITY]] +; CHECK-NEXT: [[CONV17_1:%.*]] = trunc i32 [[ADD16_1]] to i8 +; CHECK-NEXT: store i8 [[CONV17_1]], i8* [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[ADD_PTR18_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR23]], i32 [[ADD_1]] +; CHECK-NEXT: [[TMP29:%.*]] = load i8, i8* [[ADD_PTR18_1]], align 1 +; CHECK-NEXT: [[NOT_TOBOOL_1:%.*]] = icmp eq i8 [[TMP29]], 0 +; CHECK-NEXT: [[CONV21_1:%.*]] = zext i1 [[NOT_TOBOOL_1]] to i8 +; CHECK-NEXT: store i8 [[CONV21_1]], i8* [[ADD_PTR18_1]], align 1 +; CHECK-NEXT: [[ADD_PTR23_1]] = getelementptr inbounds i8, i8* [[ADD_PTR23]], i32 [[TMP1]] +; CHECK-NEXT: [[INC_1]] = add nsw i32 [[Y_045]], 2 +; CHECK-NEXT: [[EXITCOND_1:%.*]] = icmp eq i32 [[INC_1]], 128 +; CHECK-NEXT: br i1 [[EXITCOND_1]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; +entry: + %0 = load i32, i32* getelementptr inbounds ([10 x i32], [10 x i32]* @shift, i32 0, i32 0), align 4 + %1 = load i32, i32* getelementptr inbounds ([10 x i32], [10 x i32]* @shift, i32 0, i32 1), align 4 + %2 = load i8*, i8** getelementptr inbounds ([10 x i8*], [10 x i8*]* @data, i32 0, i32 0), align 4 + %3 = load i8*, i8** getelementptr inbounds ([10 x i8*], [10 x i8*]* @data, i32 0, i32 1), align 4 + %shr = lshr i32 1, %0 + %arrayidx = getelementptr inbounds i8, i8* %2, i32 %shr + %shr1 = lshr i32 1, %1 + %arrayidx2 = getelementptr inbounds i8, i8* %3, i32 %shr1 + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %d1_data.046 = phi i8* [ %3, %entry ], [ %add.ptr23.1, %for.body ] + %y.045 = phi i32 [ 0, %entry ], [ %inc.1, %for.body ] + %4 = load i8, i8* %arrayidx, align 1 + %conv = zext i8 %4 to i32 + %sub = add nsw i32 %conv, -128 + %5 = load i8, i8* %arrayidx2, align 1 + %conv3 = zext i8 %5 to i32 + %sub4 = add nsw i32 %conv3, -128 + %cmp5 = icmp sgt i32 %sub, -1 + %sub7 = sub nsw i32 128, %conv + %cond = select i1 %cmp5, i32 %sub, i32 %sub7 + %cmp8 = icmp sgt i32 %sub4, -1 + %sub12 = sub nsw i32 128, %conv3 + %cond14 = select i1 %cmp8, i32 %sub4, i32 %sub12 + %add = add nsw i32 %cond14, %cond + %idx.neg = sub nsw i32 0, %add + %add.ptr = getelementptr inbounds i8, i8* %d1_data.046, i32 %idx.neg + %6 = load i8, i8* %add.ptr, align 1 + %conv15 = zext i8 %6 to i32 + %add16 = add nsw i32 %conv15, %intensity + %conv17 = trunc i32 %add16 to i8 + store i8 %conv17, i8* %add.ptr, align 1 + %add.ptr18 = getelementptr inbounds i8, i8* %d1_data.046, i32 %add + %7 = load i8, i8* %add.ptr18, align 1 + %not.tobool = icmp eq i8 %7, 0 + %conv21 = zext i1 %not.tobool to i8 + store i8 %conv21, i8* %add.ptr18, align 1 + %add.ptr23 = getelementptr inbounds i8, i8* %d1_data.046, i32 %1 + %8 = load 
i8, i8* %arrayidx, align 1 + %conv.1 = zext i8 %8 to i32 + %sub.1 = add nsw i32 %conv.1, -128 + %9 = load i8, i8* %arrayidx2, align 1 + %conv3.1 = zext i8 %9 to i32 + %sub4.1 = add nsw i32 %conv3.1, -128 + %cmp5.1 = icmp sgt i32 %sub.1, -1 + %sub7.1 = sub nsw i32 128, %conv.1 + %cond.1 = select i1 %cmp5.1, i32 %sub.1, i32 %sub7.1 + %cmp8.1 = icmp sgt i32 %sub4.1, -1 + %sub12.1 = sub nsw i32 128, %conv3.1 + %cond14.1 = select i1 %cmp8.1, i32 %sub4.1, i32 %sub12.1 + %add.1 = add nsw i32 %cond14.1, %cond.1 + %idx.neg.1 = sub nsw i32 0, %add.1 + %add.ptr.1 = getelementptr inbounds i8, i8* %add.ptr23, i32 %idx.neg.1 + %10 = load i8, i8* %add.ptr.1, align 1 + %conv15.1 = zext i8 %10 to i32 + %add16.1 = add nsw i32 %conv15.1, %intensity + %conv17.1 = trunc i32 %add16.1 to i8 + store i8 %conv17.1, i8* %add.ptr.1, align 1 + %add.ptr18.1 = getelementptr inbounds i8, i8* %add.ptr23, i32 %add.1 + %11 = load i8, i8* %add.ptr18.1, align 1 + %not.tobool.1 = icmp eq i8 %11, 0 + %conv21.1 = zext i1 %not.tobool.1 to i8 + store i8 %conv21.1, i8* %add.ptr18.1, align 1 + %add.ptr23.1 = getelementptr inbounds i8, i8* %add.ptr23, i32 %1 + %inc.1 = add nsw i32 %y.045, 2 + %exitcond.1 = icmp eq i32 %inc.1, 128 + br i1 %exitcond.1, label %for.cond.cleanup, label %for.body +} Index: test/Transforms/SLPVectorizer/X86/horizontal-list.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -100,16 +100,8 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16 ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float undef, [[CONV]] -; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float undef, [[ADD]] -; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]] -; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]] ; CHECK-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 ; CHECK-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float -; CHECK-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]] -; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float undef, [[ADD7]] -; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float undef, [[ADD19]] -; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float undef, [[ADD19_1]] ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP3]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> @@ -119,7 +111,6 @@ ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 ; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]] ; CHECK-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]] -; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float undef, [[ADD19_2]] ; CHECK-NEXT: store float [[OP_EXTRA5]], float* @res, align 4 ; CHECK-NEXT: ret float [[OP_EXTRA5]] ; @@ -131,16 +122,8 @@ ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16 ; THRESHOLD-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16 ; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float undef, [[CONV]] -; 
THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float undef, [[ADD]] -; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]] -; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]] ; THRESHOLD-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 ; THRESHOLD-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float -; THRESHOLD-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]] -; THRESHOLD-NEXT: [[ADD19:%.*]] = fadd fast float undef, [[ADD7]] -; THRESHOLD-NEXT: [[ADD19_1:%.*]] = fadd fast float undef, [[ADD19]] -; THRESHOLD-NEXT: [[ADD19_2:%.*]] = fadd fast float undef, [[ADD19_1]] ; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> undef, <8 x i32> ; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP3]], [[RDX_SHUF]] ; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> @@ -150,7 +133,6 @@ ; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 ; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]] ; THRESHOLD-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]] -; THRESHOLD-NEXT: [[ADD19_3:%.*]] = fadd fast float undef, [[ADD19_2]] ; THRESHOLD-NEXT: store float [[OP_EXTRA5]], float* @res, align 4 ; THRESHOLD-NEXT: ret float [[OP_EXTRA5]] ; @@ -205,17 +187,14 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float undef, undef -; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float undef, [[TMP4]] ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = fadd fast float undef, [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[CONV]], [[TMP6]] -; CHECK-NEXT: store float [[TMP8]], float* @res, align 4 -; CHECK-NEXT: ret float [[TMP8]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] +; CHECK-NEXT: store float [[TMP5]], float* @res, align 4 +; CHECK-NEXT: ret float [[TMP5]] ; ; THRESHOLD-LABEL: @bazzz( ; THRESHOLD-NEXT: entry: @@ -224,17 +203,14 @@ ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 ; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 ; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP4:%.*]] = fadd fast float undef, undef -; THRESHOLD-NEXT: [[TMP5:%.*]] = fadd fast float undef, [[TMP4]] ; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> ; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]] ; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> ; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] -; 
THRESHOLD-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 -; THRESHOLD-NEXT: [[TMP7:%.*]] = fadd fast float undef, [[TMP5]] -; THRESHOLD-NEXT: [[TMP8:%.*]] = fmul fast float [[CONV]], [[TMP6]] -; THRESHOLD-NEXT: store float [[TMP8]], float* @res, align 4 -; THRESHOLD-NEXT: ret float [[TMP8]] +; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 +; THRESHOLD-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] +; THRESHOLD-NEXT: store float [[TMP5]], float* @res, align 4 +; THRESHOLD-NEXT: ret float [[TMP5]] ; entry: %0 = load i32, i32* @n, align 4 @@ -267,16 +243,13 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float undef, undef -; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float undef, [[TMP4]] ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = fadd fast float undef, [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[CONV]], [[TMP6]] -; CHECK-NEXT: [[CONV4:%.*]] = fptosi float [[TMP8]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] +; CHECK-NEXT: [[CONV4:%.*]] = fptosi float [[TMP5]] to i32 ; CHECK-NEXT: store i32 [[CONV4]], i32* @n, align 4 ; CHECK-NEXT: ret i32 [[CONV4]] ; @@ -287,16 +260,13 @@ ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 ; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 ; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP4:%.*]] = fadd fast float undef, undef -; THRESHOLD-NEXT: [[TMP5:%.*]] = fadd fast float undef, [[TMP4]] ; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> ; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]] ; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> ; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] -; THRESHOLD-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 -; THRESHOLD-NEXT: [[TMP7:%.*]] = fadd fast float undef, [[TMP5]] -; THRESHOLD-NEXT: [[TMP8:%.*]] = fmul fast float [[CONV]], [[TMP6]] -; THRESHOLD-NEXT: [[CONV4:%.*]] = fptosi float [[TMP8]] to i32 +; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 +; THRESHOLD-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] +; THRESHOLD-NEXT: [[CONV4:%.*]] = fptosi float [[TMP5]] to i32 ; THRESHOLD-NEXT: store i32 [[CONV4]], i32* @n, align 4 ; THRESHOLD-NEXT: ret i32 [[CONV4]] ; @@ -330,50 +300,30 @@ ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 ; CHECK-NEXT: [[TMP1:%.*]] = 
load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 ; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 -; CHECK-NEXT: [[CMP4:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float undef, float undef -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 -; CHECK-NEXT: [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], [[TMP5]] -; CHECK-NEXT: [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float undef -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 -; CHECK-NEXT: [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], [[TMP6]] ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x float> [[TMP2]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP2]], <4 x float> [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0 -; CHECK-NEXT: [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float undef -; CHECK-NEXT: store float [[TMP7]], float* @res, align 4 -; CHECK-NEXT: ret float [[TMP7]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0 +; CHECK-NEXT: store float [[TMP3]], float* @res, align 4 +; CHECK-NEXT: ret float [[TMP3]] ; ; THRESHOLD-LABEL: @bar( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 ; THRESHOLD-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]] -; THRESHOLD-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 -; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 -; THRESHOLD-NEXT: [[CMP4:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]] -; THRESHOLD-NEXT: [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float undef, float undef -; THRESHOLD-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 -; THRESHOLD-NEXT: [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], [[TMP5]] -; THRESHOLD-NEXT: [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float undef -; THRESHOLD-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 -; THRESHOLD-NEXT: [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], [[TMP6]] ; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> ; THRESHOLD-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x float> [[TMP2]], [[RDX_SHUF]] ; THRESHOLD-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP2]], <4 x float> [[RDX_SHUF]] ; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> undef, <4 x i32> ; THRESHOLD-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp 
fast ogt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] ; THRESHOLD-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]] -; THRESHOLD-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0 -; THRESHOLD-NEXT: [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float undef -; THRESHOLD-NEXT: store float [[TMP7]], float* @res, align 4 -; THRESHOLD-NEXT: ret float [[TMP7]] +; THRESHOLD-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0 +; THRESHOLD-NEXT: store float [[TMP3]], float* @res, align 4 +; THRESHOLD-NEXT: ret float [[TMP3]] ; entry: %0 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16 @@ -418,21 +368,6 @@ ; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 15 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <16 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float undef, undef -; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]] -; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]] -; CHECK-NEXT: [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]] -; CHECK-NEXT: [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]] -; CHECK-NEXT: [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]] -; CHECK-NEXT: [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]] -; CHECK-NEXT: [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]] -; CHECK-NEXT: [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]] -; CHECK-NEXT: [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]] -; CHECK-NEXT: [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]] -; CHECK-NEXT: [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]] -; CHECK-NEXT: [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]] -; CHECK-NEXT: [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]] -; CHECK-NEXT: [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]] ; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16 ; CHECK-NEXT: [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 17 ; CHECK-NEXT: [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 18 @@ -467,37 +402,6 @@ ; CHECK-NEXT: [[ARRAYIDX_47:%.*]] = getelementptr inbounds float, float* [[X]], i64 47 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <32 x float>* ; CHECK-NEXT: [[TMP3:%.*]] = load <32 x float>, <32 x float>* [[TMP2]], align 4 -; CHECK-NEXT: [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]] -; CHECK-NEXT: [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]] -; CHECK-NEXT: [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]] -; CHECK-NEXT: [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]] -; CHECK-NEXT: [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]] -; CHECK-NEXT: [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]] -; CHECK-NEXT: [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]] -; CHECK-NEXT: [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]] -; CHECK-NEXT: [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]] -; CHECK-NEXT: [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]] -; CHECK-NEXT: [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]] -; CHECK-NEXT: [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]] -; CHECK-NEXT: [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]] -; CHECK-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]] -; CHECK-NEXT: [[ADD_30:%.*]] = fadd fast float undef, [[ADD_29]] -; CHECK-NEXT: 
[[ADD_31:%.*]] = fadd fast float undef, [[ADD_30]] -; CHECK-NEXT: [[ADD_32:%.*]] = fadd fast float undef, [[ADD_31]] -; CHECK-NEXT: [[ADD_33:%.*]] = fadd fast float undef, [[ADD_32]] -; CHECK-NEXT: [[ADD_34:%.*]] = fadd fast float undef, [[ADD_33]] -; CHECK-NEXT: [[ADD_35:%.*]] = fadd fast float undef, [[ADD_34]] -; CHECK-NEXT: [[ADD_36:%.*]] = fadd fast float undef, [[ADD_35]] -; CHECK-NEXT: [[ADD_37:%.*]] = fadd fast float undef, [[ADD_36]] -; CHECK-NEXT: [[ADD_38:%.*]] = fadd fast float undef, [[ADD_37]] -; CHECK-NEXT: [[ADD_39:%.*]] = fadd fast float undef, [[ADD_38]] -; CHECK-NEXT: [[ADD_40:%.*]] = fadd fast float undef, [[ADD_39]] -; CHECK-NEXT: [[ADD_41:%.*]] = fadd fast float undef, [[ADD_40]] -; CHECK-NEXT: [[ADD_42:%.*]] = fadd fast float undef, [[ADD_41]] -; CHECK-NEXT: [[ADD_43:%.*]] = fadd fast float undef, [[ADD_42]] -; CHECK-NEXT: [[ADD_44:%.*]] = fadd fast float undef, [[ADD_43]] -; CHECK-NEXT: [[ADD_45:%.*]] = fadd fast float undef, [[ADD_44]] -; CHECK-NEXT: [[ADD_46:%.*]] = fadd fast float undef, [[ADD_45]] ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP3]], <32 x float> undef, <32 x i32> ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <32 x float> [[TMP3]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> @@ -519,7 +423,6 @@ ; CHECK-NEXT: [[BIN_RDX16:%.*]] = fadd fast <16 x float> [[BIN_RDX14]], [[RDX_SHUF15]] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x float> [[BIN_RDX16]], i32 0 ; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[ADD_47:%.*]] = fadd fast float undef, [[ADD_46]] ; CHECK-NEXT: ret float [[OP_RDX]] ; ; THRESHOLD-LABEL: @f( @@ -541,21 +444,6 @@ ; THRESHOLD-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 15 ; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <16 x float>* ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[TMP0]], align 4 -; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float undef, undef -; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]] -; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]] -; THRESHOLD-NEXT: [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]] -; THRESHOLD-NEXT: [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]] -; THRESHOLD-NEXT: [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]] -; THRESHOLD-NEXT: [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]] -; THRESHOLD-NEXT: [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]] -; THRESHOLD-NEXT: [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]] -; THRESHOLD-NEXT: [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]] -; THRESHOLD-NEXT: [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]] -; THRESHOLD-NEXT: [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]] -; THRESHOLD-NEXT: [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]] -; THRESHOLD-NEXT: [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]] -; THRESHOLD-NEXT: [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]] ; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16 ; THRESHOLD-NEXT: [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 17 ; THRESHOLD-NEXT: [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 18 @@ -590,37 +478,6 @@ ; THRESHOLD-NEXT: [[ARRAYIDX_47:%.*]] = getelementptr inbounds float, float* [[X]], i64 47 ; THRESHOLD-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <32 x float>* ; THRESHOLD-NEXT: [[TMP3:%.*]] = load <32 x float>, <32 x float>* [[TMP2]], align 4 -; 
THRESHOLD-NEXT: [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]] -; THRESHOLD-NEXT: [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]] -; THRESHOLD-NEXT: [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]] -; THRESHOLD-NEXT: [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]] -; THRESHOLD-NEXT: [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]] -; THRESHOLD-NEXT: [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]] -; THRESHOLD-NEXT: [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]] -; THRESHOLD-NEXT: [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]] -; THRESHOLD-NEXT: [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]] -; THRESHOLD-NEXT: [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]] -; THRESHOLD-NEXT: [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]] -; THRESHOLD-NEXT: [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]] -; THRESHOLD-NEXT: [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]] -; THRESHOLD-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]] -; THRESHOLD-NEXT: [[ADD_30:%.*]] = fadd fast float undef, [[ADD_29]] -; THRESHOLD-NEXT: [[ADD_31:%.*]] = fadd fast float undef, [[ADD_30]] -; THRESHOLD-NEXT: [[ADD_32:%.*]] = fadd fast float undef, [[ADD_31]] -; THRESHOLD-NEXT: [[ADD_33:%.*]] = fadd fast float undef, [[ADD_32]] -; THRESHOLD-NEXT: [[ADD_34:%.*]] = fadd fast float undef, [[ADD_33]] -; THRESHOLD-NEXT: [[ADD_35:%.*]] = fadd fast float undef, [[ADD_34]] -; THRESHOLD-NEXT: [[ADD_36:%.*]] = fadd fast float undef, [[ADD_35]] -; THRESHOLD-NEXT: [[ADD_37:%.*]] = fadd fast float undef, [[ADD_36]] -; THRESHOLD-NEXT: [[ADD_38:%.*]] = fadd fast float undef, [[ADD_37]] -; THRESHOLD-NEXT: [[ADD_39:%.*]] = fadd fast float undef, [[ADD_38]] -; THRESHOLD-NEXT: [[ADD_40:%.*]] = fadd fast float undef, [[ADD_39]] -; THRESHOLD-NEXT: [[ADD_41:%.*]] = fadd fast float undef, [[ADD_40]] -; THRESHOLD-NEXT: [[ADD_42:%.*]] = fadd fast float undef, [[ADD_41]] -; THRESHOLD-NEXT: [[ADD_43:%.*]] = fadd fast float undef, [[ADD_42]] -; THRESHOLD-NEXT: [[ADD_44:%.*]] = fadd fast float undef, [[ADD_43]] -; THRESHOLD-NEXT: [[ADD_45:%.*]] = fadd fast float undef, [[ADD_44]] -; THRESHOLD-NEXT: [[ADD_46:%.*]] = fadd fast float undef, [[ADD_45]] ; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP3]], <32 x float> undef, <32 x i32> ; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <32 x float> [[TMP3]], [[RDX_SHUF]] ; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> @@ -642,7 +499,6 @@ ; THRESHOLD-NEXT: [[BIN_RDX16:%.*]] = fadd fast <16 x float> [[BIN_RDX14]], [[RDX_SHUF15]] ; THRESHOLD-NEXT: [[TMP5:%.*]] = extractelement <16 x float> [[BIN_RDX16]], i32 0 ; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]] -; THRESHOLD-NEXT: [[ADD_47:%.*]] = fadd fast float undef, [[ADD_46]] ; THRESHOLD-NEXT: ret float [[OP_RDX]] ; entry: @@ -829,37 +685,6 @@ ; CHECK-NEXT: [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <32 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float undef, [[CONV]] -; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float undef, [[ADD]] -; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]] -; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]] -; CHECK-NEXT: [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]] -; CHECK-NEXT: [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]] -; CHECK-NEXT: [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]] -; 
CHECK-NEXT: [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]] -; CHECK-NEXT: [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]] -; CHECK-NEXT: [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]] -; CHECK-NEXT: [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]] -; CHECK-NEXT: [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]] -; CHECK-NEXT: [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]] -; CHECK-NEXT: [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]] -; CHECK-NEXT: [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]] -; CHECK-NEXT: [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]] -; CHECK-NEXT: [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]] -; CHECK-NEXT: [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]] -; CHECK-NEXT: [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]] -; CHECK-NEXT: [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]] -; CHECK-NEXT: [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]] -; CHECK-NEXT: [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]] -; CHECK-NEXT: [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]] -; CHECK-NEXT: [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]] -; CHECK-NEXT: [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]] -; CHECK-NEXT: [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]] -; CHECK-NEXT: [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]] -; CHECK-NEXT: [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]] -; CHECK-NEXT: [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]] -; CHECK-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]] -; CHECK-NEXT: [[ADD_30:%.*]] = fadd fast float undef, [[ADD_29]] ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP1]], <32 x float> undef, <32 x i32> ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <32 x float> [[TMP1]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> @@ -872,7 +697,6 @@ ; CHECK-NEXT: [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]] ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0 ; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]] -; CHECK-NEXT: [[ADD_31:%.*]] = fadd fast float undef, [[ADD_30]] ; CHECK-NEXT: ret float [[OP_EXTRA]] ; ; THRESHOLD-LABEL: @f1( @@ -912,37 +736,6 @@ ; THRESHOLD-NEXT: [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31 ; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <32 x float>* ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4 -; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float undef, [[CONV]] -; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float undef, [[ADD]] -; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]] -; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]] -; THRESHOLD-NEXT: [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]] -; THRESHOLD-NEXT: [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]] -; THRESHOLD-NEXT: [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]] -; THRESHOLD-NEXT: [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]] -; THRESHOLD-NEXT: [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]] -; THRESHOLD-NEXT: [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]] -; THRESHOLD-NEXT: [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]] -; THRESHOLD-NEXT: [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]] -; THRESHOLD-NEXT: [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]] -; THRESHOLD-NEXT: [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]] -; THRESHOLD-NEXT: [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]] -; THRESHOLD-NEXT: 
[[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]] -; THRESHOLD-NEXT: [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]] -; THRESHOLD-NEXT: [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]] -; THRESHOLD-NEXT: [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]] -; THRESHOLD-NEXT: [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]] -; THRESHOLD-NEXT: [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]] -; THRESHOLD-NEXT: [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]] -; THRESHOLD-NEXT: [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]] -; THRESHOLD-NEXT: [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]] -; THRESHOLD-NEXT: [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]] -; THRESHOLD-NEXT: [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]] -; THRESHOLD-NEXT: [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]] -; THRESHOLD-NEXT: [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]] -; THRESHOLD-NEXT: [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]] -; THRESHOLD-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]] -; THRESHOLD-NEXT: [[ADD_30:%.*]] = fadd fast float undef, [[ADD_29]] ; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP1]], <32 x float> undef, <32 x i32> ; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <32 x float> [[TMP1]], [[RDX_SHUF]] ; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> @@ -955,7 +748,6 @@ ; THRESHOLD-NEXT: [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]] ; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0 ; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]] -; THRESHOLD-NEXT: [[ADD_31:%.*]] = fadd fast float undef, [[ADD_30]] ; THRESHOLD-NEXT: ret float [[OP_EXTRA]] ; entry: @@ -1066,17 +858,12 @@ ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2 ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4 -; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP1]], [[TMP0]] ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4 ; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5 ; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_2]] to <4 x float>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4 -; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]] -; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]] -; CHECK-NEXT: [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]] -; CHECK-NEXT: [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]] ; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 8 ; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 9 @@ -1087,14 +874,6 @@ ; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 14 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>* ; CHECK-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4 -; CHECK-NEXT: [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]] -; CHECK-NEXT: [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]] -; CHECK-NEXT: [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]] -; 
CHECK-NEXT: [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]] -; CHECK-NEXT: [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]] -; CHECK-NEXT: [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]] -; CHECK-NEXT: [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]] -; CHECK-NEXT: [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]] ; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 15 ; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 16 ; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 17 @@ -1113,21 +892,6 @@ ; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>* ; CHECK-NEXT: [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4 -; CHECK-NEXT: [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]] -; CHECK-NEXT: [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]] -; CHECK-NEXT: [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]] -; CHECK-NEXT: [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]] -; CHECK-NEXT: [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]] -; CHECK-NEXT: [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]] -; CHECK-NEXT: [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]] -; CHECK-NEXT: [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]] -; CHECK-NEXT: [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]] -; CHECK-NEXT: [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]] -; CHECK-NEXT: [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]] -; CHECK-NEXT: [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]] -; CHECK-NEXT: [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]] -; CHECK-NEXT: [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]] -; CHECK-NEXT: [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]] ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> undef, <16 x i32> ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x float> [[TMP7]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> @@ -1153,7 +917,6 @@ ; CHECK-NEXT: [[OP_RDX17:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]] ; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[OP_RDX17]], [[TMP1]] ; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]] -; CHECK-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]] ; CHECK-NEXT: ret float [[TMP12]] ; ; THRESHOLD-LABEL: @loadadd31( @@ -1162,17 +925,12 @@ ; THRESHOLD-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2 ; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4 -; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP1]], [[TMP0]] ; THRESHOLD-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3 ; THRESHOLD-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4 ; THRESHOLD-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5 ; THRESHOLD-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6 ; THRESHOLD-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_2]] to <4 x float>* ; THRESHOLD-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4 -; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]] -; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]] -; THRESHOLD-NEXT: [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]] -; 
THRESHOLD-NEXT: [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]] ; THRESHOLD-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; THRESHOLD-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 8 ; THRESHOLD-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 9 @@ -1183,14 +941,6 @@ ; THRESHOLD-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 14 ; THRESHOLD-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>* ; THRESHOLD-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4 -; THRESHOLD-NEXT: [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]] -; THRESHOLD-NEXT: [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]] -; THRESHOLD-NEXT: [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]] -; THRESHOLD-NEXT: [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]] -; THRESHOLD-NEXT: [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]] -; THRESHOLD-NEXT: [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]] -; THRESHOLD-NEXT: [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]] -; THRESHOLD-NEXT: [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]] ; THRESHOLD-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 15 ; THRESHOLD-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 16 ; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 17 @@ -1209,21 +959,6 @@ ; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30 ; THRESHOLD-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>* ; THRESHOLD-NEXT: [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4 -; THRESHOLD-NEXT: [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]] -; THRESHOLD-NEXT: [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]] -; THRESHOLD-NEXT: [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]] -; THRESHOLD-NEXT: [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]] -; THRESHOLD-NEXT: [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]] -; THRESHOLD-NEXT: [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]] -; THRESHOLD-NEXT: [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]] -; THRESHOLD-NEXT: [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]] -; THRESHOLD-NEXT: [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]] -; THRESHOLD-NEXT: [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]] -; THRESHOLD-NEXT: [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]] -; THRESHOLD-NEXT: [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]] -; THRESHOLD-NEXT: [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]] -; THRESHOLD-NEXT: [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]] -; THRESHOLD-NEXT: [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]] ; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> undef, <16 x i32> ; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x float> [[TMP7]], [[RDX_SHUF]] ; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> @@ -1249,7 +984,6 @@ ; THRESHOLD-NEXT: [[OP_RDX17:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]] ; THRESHOLD-NEXT: [[TMP11:%.*]] = fadd fast float [[OP_RDX17]], [[TMP1]] ; THRESHOLD-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]] -; THRESHOLD-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]] ; THRESHOLD-NEXT: ret float [[TMP12]] ; entry: @@ -1360,14 +1094,6 @@ ; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; CHECK-NEXT: [[TMP0:%.*]] = 
bitcast float* [[X]] to <8 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float undef, [[ADD]] -; CHECK-NEXT: [[ADD4:%.*]] = fadd fast float undef, [[ADD1]] -; CHECK-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD4]], [[CONV]] -; CHECK-NEXT: [[ADD4_1:%.*]] = fadd fast float undef, [[ADD5]] -; CHECK-NEXT: [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_1]] -; CHECK-NEXT: [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]] -; CHECK-NEXT: [[ADD4_4:%.*]] = fadd fast float undef, [[ADD4_3]] -; CHECK-NEXT: [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]] ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> @@ -1377,7 +1103,6 @@ ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 ; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] ; CHECK-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] -; CHECK-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]] ; CHECK-NEXT: ret float [[OP_EXTRA5]] ; ; THRESHOLD-LABEL: @extra_args( @@ -1394,14 +1119,6 @@ ; THRESHOLD-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>* ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 -; THRESHOLD-NEXT: [[ADD1:%.*]] = fadd fast float undef, [[ADD]] -; THRESHOLD-NEXT: [[ADD4:%.*]] = fadd fast float undef, [[ADD1]] -; THRESHOLD-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD4]], [[CONV]] -; THRESHOLD-NEXT: [[ADD4_1:%.*]] = fadd fast float undef, [[ADD5]] -; THRESHOLD-NEXT: [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_1]] -; THRESHOLD-NEXT: [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]] -; THRESHOLD-NEXT: [[ADD4_4:%.*]] = fadd fast float undef, [[ADD4_3]] -; THRESHOLD-NEXT: [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]] ; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> ; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]] ; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> @@ -1411,7 +1128,6 @@ ; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 ; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] ; THRESHOLD-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] -; THRESHOLD-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]] ; THRESHOLD-NEXT: ret float [[OP_EXTRA5]] ; entry: @@ -1460,16 +1176,6 @@ ; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float undef, [[ADD]] -; CHECK-NEXT: [[ADD4:%.*]] = fadd fast float undef, [[ADD1]] -; CHECK-NEXT: [[ADD41:%.*]] = fadd fast float [[ADD4]], 5.000000e+00 -; CHECK-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD41]], [[CONV]] -; CHECK-NEXT: [[ADD4_1:%.*]] = fadd fast float undef, [[ADD5]] -; CHECK-NEXT: [[ADD4_11:%.*]] = fadd fast float [[ADD4_1]], 5.000000e+00 -; CHECK-NEXT: [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_11]] -; CHECK-NEXT: [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]] -; CHECK-NEXT: 
[[ADD4_4:%.*]] = fadd fast float undef, [[ADD4_3]] -; CHECK-NEXT: [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]] ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> @@ -1481,7 +1187,6 @@ ; CHECK-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], 5.000000e+00 ; CHECK-NEXT: [[OP_EXTRA6:%.*]] = fadd fast float [[OP_EXTRA5]], 5.000000e+00 ; CHECK-NEXT: [[OP_EXTRA7:%.*]] = fadd fast float [[OP_EXTRA6]], [[CONV]] -; CHECK-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]] ; CHECK-NEXT: ret float [[OP_EXTRA7]] ; ; THRESHOLD-LABEL: @extra_args_same_several_times( @@ -1498,16 +1203,6 @@ ; THRESHOLD-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>* ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 -; THRESHOLD-NEXT: [[ADD1:%.*]] = fadd fast float undef, [[ADD]] -; THRESHOLD-NEXT: [[ADD4:%.*]] = fadd fast float undef, [[ADD1]] -; THRESHOLD-NEXT: [[ADD41:%.*]] = fadd fast float [[ADD4]], 5.000000e+00 -; THRESHOLD-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD41]], [[CONV]] -; THRESHOLD-NEXT: [[ADD4_1:%.*]] = fadd fast float undef, [[ADD5]] -; THRESHOLD-NEXT: [[ADD4_11:%.*]] = fadd fast float [[ADD4_1]], 5.000000e+00 -; THRESHOLD-NEXT: [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_11]] -; THRESHOLD-NEXT: [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]] -; THRESHOLD-NEXT: [[ADD4_4:%.*]] = fadd fast float undef, [[ADD4_3]] -; THRESHOLD-NEXT: [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]] ; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> ; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]] ; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> @@ -1519,7 +1214,6 @@ ; THRESHOLD-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], 5.000000e+00 ; THRESHOLD-NEXT: [[OP_EXTRA6:%.*]] = fadd fast float [[OP_EXTRA5]], 5.000000e+00 ; THRESHOLD-NEXT: [[OP_EXTRA7:%.*]] = fadd fast float [[OP_EXTRA6]], [[CONV]] -; THRESHOLD-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]] ; THRESHOLD-NEXT: ret float [[OP_EXTRA7]] ; entry: @@ -1572,14 +1266,6 @@ ; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float undef, [[ADD]] -; CHECK-NEXT: [[ADD4:%.*]] = fadd fast float undef, [[ADD1]] -; CHECK-NEXT: [[ADD4_1:%.*]] = fadd fast float undef, [[ADD4]] -; CHECK-NEXT: [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_1]] -; CHECK-NEXT: [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]] -; CHECK-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD4_3]], [[CONV]] -; CHECK-NEXT: [[ADD4_4:%.*]] = fadd fast float undef, [[ADD5]] -; CHECK-NEXT: [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]] ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> @@ -1589,7 +1275,6 @@ ; CHECK-NEXT: [[TMP2:%.*]] = extractelement 
<8 x float> [[BIN_RDX4]], i32 0 ; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] ; CHECK-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] -; CHECK-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]] ; CHECK-NEXT: ret float [[OP_EXTRA5]] ; ; THRESHOLD-LABEL: @extra_args_no_replace( @@ -1608,14 +1293,6 @@ ; THRESHOLD-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>* ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 -; THRESHOLD-NEXT: [[ADD1:%.*]] = fadd fast float undef, [[ADD]] -; THRESHOLD-NEXT: [[ADD4:%.*]] = fadd fast float undef, [[ADD1]] -; THRESHOLD-NEXT: [[ADD4_1:%.*]] = fadd fast float undef, [[ADD4]] -; THRESHOLD-NEXT: [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_1]] -; THRESHOLD-NEXT: [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]] -; THRESHOLD-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD4_3]], [[CONV]] -; THRESHOLD-NEXT: [[ADD4_4:%.*]] = fadd fast float undef, [[ADD5]] -; THRESHOLD-NEXT: [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]] ; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> ; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]] ; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> @@ -1625,7 +1302,6 @@ ; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 ; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] ; THRESHOLD-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] -; THRESHOLD-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]] ; THRESHOLD-NEXT: ret float [[OP_EXTRA5]] ; entry: @@ -1676,10 +1352,6 @@ ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <4 x i32> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32> -; CHECK-NEXT: [[R1:%.*]] = add nuw i32 [[ARG]], undef -; CHECK-NEXT: [[R2:%.*]] = add nsw i32 [[R1]], undef -; CHECK-NEXT: [[R3:%.*]] = add nsw i32 [[R2]], undef -; CHECK-NEXT: [[R4:%.*]] = add nsw i32 [[R3]], undef ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP11]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> @@ -1687,7 +1359,6 @@ ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 ; CHECK-NEXT: [[OP_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]] ; CHECK-NEXT: [[OP_EXTRA3:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP9]] -; CHECK-NEXT: [[R5:%.*]] = add nsw i32 [[R4]], undef ; CHECK-NEXT: ret i32 [[OP_EXTRA3]] ; ; THRESHOLD-LABEL: @wobble( @@ -1704,10 +1375,6 @@ ; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 ; THRESHOLD-NEXT: [[TMP10:%.*]] = icmp eq <4 x i32> [[TMP8]], zeroinitializer ; THRESHOLD-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32> -; THRESHOLD-NEXT: [[R1:%.*]] = add nuw i32 [[ARG]], undef -; THRESHOLD-NEXT: [[R2:%.*]] = add nsw i32 [[R1]], undef -; THRESHOLD-NEXT: [[R3:%.*]] = add nsw i32 [[R2]], undef -; THRESHOLD-NEXT: [[R4:%.*]] = add nsw i32 [[R3]], undef ; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <4 x i32> ; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP11]], [[RDX_SHUF]] ; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = 
shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> @@ -1715,7 +1382,6 @@ ; THRESHOLD-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 ; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]] ; THRESHOLD-NEXT: [[OP_EXTRA3:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP9]] -; THRESHOLD-NEXT: [[R5:%.*]] = add nsw i32 [[R4]], undef ; THRESHOLD-NEXT: ret i32 [[OP_EXTRA3]] ; bb: Index: test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -36,27 +36,6 @@ ; ; AVX-LABEL: @maxi8( ; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16 -; AVX-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP2]], i32 0 -; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP2]], i32 1 -; AVX-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]] -; AVX-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 undef, i32 undef -; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP2]], i32 2 -; AVX-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]] -; AVX-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 undef -; AVX-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP2]], i32 3 -; AVX-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -; AVX-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef -; AVX-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP2]], i32 4 -; AVX-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -; AVX-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef -; AVX-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP2]], i32 5 -; AVX-NEXT: [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]] -; AVX-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef -; AVX-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP2]], i32 6 -; AVX-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]] -; AVX-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 undef -; AVX-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP2]], i32 7 -; AVX-NEXT: [[TMP23:%.*]] = icmp sgt i32 [[TMP21]], [[TMP22]] ; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> ; AVX-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[RDX_SHUF]] ; AVX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i32> [[TMP2]], <8 x i32> [[RDX_SHUF]] @@ -66,33 +45,11 @@ ; AVX-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> undef, <8 x i32> ; AVX-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <8 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]] ; AVX-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> [[RDX_SHUF4]] -; AVX-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[RDX_MINMAX_SELECT6]], i32 0 -; AVX-NEXT: [[TMP25:%.*]] = select i1 [[TMP23]], i32 [[TMP21]], i32 undef -; AVX-NEXT: ret i32 [[TMP24]] +; AVX-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[RDX_MINMAX_SELECT6]], i32 0 +; AVX-NEXT: ret i32 [[TMP3]] ; ; AVX2-LABEL: @maxi8( ; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16 -; AVX2-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP2]], i32 0 -; AVX2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP2]], i32 1 -; AVX2-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]] -; AVX2-NEXT: [[TMP6:%.*]] = select 
i1 [[TMP5]], i32 undef, i32 undef -; AVX2-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP2]], i32 2 -; AVX2-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]] -; AVX2-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 undef -; AVX2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP2]], i32 3 -; AVX2-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -; AVX2-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef -; AVX2-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP2]], i32 4 -; AVX2-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -; AVX2-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef -; AVX2-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP2]], i32 5 -; AVX2-NEXT: [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]] -; AVX2-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef -; AVX2-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP2]], i32 6 -; AVX2-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]] -; AVX2-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 undef -; AVX2-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP2]], i32 7 -; AVX2-NEXT: [[TMP23:%.*]] = icmp sgt i32 [[TMP21]], [[TMP22]] ; AVX2-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> ; AVX2-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[RDX_SHUF]] ; AVX2-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i32> [[TMP2]], <8 x i32> [[RDX_SHUF]] @@ -102,33 +59,11 @@ ; AVX2-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> undef, <8 x i32> ; AVX2-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <8 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]] ; AVX2-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> [[RDX_SHUF4]] -; AVX2-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[RDX_MINMAX_SELECT6]], i32 0 -; AVX2-NEXT: [[TMP25:%.*]] = select i1 [[TMP23]], i32 [[TMP21]], i32 undef -; AVX2-NEXT: ret i32 [[TMP24]] +; AVX2-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[RDX_MINMAX_SELECT6]], i32 0 +; AVX2-NEXT: ret i32 [[TMP3]] ; ; SKX-LABEL: @maxi8( ; SKX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16 -; SKX-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP2]], i32 0 -; SKX-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP2]], i32 1 -; SKX-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]] -; SKX-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 undef, i32 undef -; SKX-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP2]], i32 2 -; SKX-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]] -; SKX-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 undef -; SKX-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP2]], i32 3 -; SKX-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -; SKX-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef -; SKX-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP2]], i32 4 -; SKX-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -; SKX-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef -; SKX-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP2]], i32 5 -; SKX-NEXT: [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]] -; SKX-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef -; SKX-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP2]], i32 6 -; SKX-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]] -; SKX-NEXT: 
[[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 undef -; SKX-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP2]], i32 7 -; SKX-NEXT: [[TMP23:%.*]] = icmp sgt i32 [[TMP21]], [[TMP22]] ; SKX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> ; SKX-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[RDX_SHUF]] ; SKX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i32> [[TMP2]], <8 x i32> [[RDX_SHUF]] @@ -138,9 +73,8 @@ ; SKX-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> undef, <8 x i32> ; SKX-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <8 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]] ; SKX-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> [[RDX_SHUF4]] -; SKX-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[RDX_MINMAX_SELECT6]], i32 0 -; SKX-NEXT: [[TMP25:%.*]] = select i1 [[TMP23]], i32 [[TMP21]], i32 undef -; SKX-NEXT: ret i32 [[TMP24]] +; SKX-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[RDX_MINMAX_SELECT6]], i32 0 +; SKX-NEXT: ret i32 [[TMP3]] ; %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 @@ -169,101 +103,24 @@ define i32 @maxi16(i32) { ; CHECK-LABEL: @maxi16( -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8 -; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4 -; CHECK-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 -; CHECK-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 -; CHECK-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; CHECK-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]] -; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; CHECK-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16 -; CHECK-NEXT: [[TMP25:%.*]] 
= icmp sgt i32 [[TMP23]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]] -; CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4 -; CHECK-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]] -; CHECK-NEXT: [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8 -; CHECK-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]] -; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]] -; CHECK-NEXT: [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4 -; CHECK-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]] -; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]] -; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16 -; CHECK-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]] -; CHECK-NEXT: [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4 -; CHECK-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] -; CHECK-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]] -; CHECK-NEXT: [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8 -; CHECK-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] -; CHECK-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]] -; CHECK-NEXT: [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4 -; CHECK-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] -; CHECK-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]] -; CHECK-NEXT: ret i32 [[TMP47]] +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr to <16 x i32>*), align 16 +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> undef, <16 x i32> +; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <16 x i32> [[TMP2]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x i32> [[TMP2]], <16 x i32> [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT]], <16 x i32> undef, <16 x i32> +; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP2]], <16 x i32> [[RDX_MINMAX_SELECT]], <16 x i32> [[RDX_SHUF1]] +; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT3]], <16 x i32> undef, <16 x i32> +; CHECK-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP5]], <16 x i32> [[RDX_MINMAX_SELECT3]], <16 x i32> [[RDX_SHUF4]] +; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT6]], <16 x i32> undef, <16 x i32> +; CHECK-NEXT: [[RDX_MINMAX_CMP8:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x i32> [[RDX_MINMAX_SELECT6]], <16 x i32> [[RDX_SHUF7]] +; CHECK-NEXT: [[TMP3:%.*]] = 
extractelement <16 x i32> [[RDX_MINMAX_SELECT9]], i32 0 +; CHECK-NEXT: ret i32 [[TMP3]] ; ; AVX-LABEL: @maxi16( ; AVX-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr to <16 x i32>*), align 16 -; AVX-NEXT: [[TMP3:%.*]] = extractelement <16 x i32> [[TMP2]], i32 0 -; AVX-NEXT: [[TMP4:%.*]] = extractelement <16 x i32> [[TMP2]], i32 1 -; AVX-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]] -; AVX-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 undef, i32 undef -; AVX-NEXT: [[TMP7:%.*]] = extractelement <16 x i32> [[TMP2]], i32 2 -; AVX-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]] -; AVX-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 undef -; AVX-NEXT: [[TMP10:%.*]] = extractelement <16 x i32> [[TMP2]], i32 3 -; AVX-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -; AVX-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef -; AVX-NEXT: [[TMP13:%.*]] = extractelement <16 x i32> [[TMP2]], i32 4 -; AVX-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -; AVX-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef -; AVX-NEXT: [[TMP16:%.*]] = extractelement <16 x i32> [[TMP2]], i32 5 -; AVX-NEXT: [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]] -; AVX-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef -; AVX-NEXT: [[TMP19:%.*]] = extractelement <16 x i32> [[TMP2]], i32 6 -; AVX-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]] -; AVX-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 undef -; AVX-NEXT: [[TMP22:%.*]] = extractelement <16 x i32> [[TMP2]], i32 7 -; AVX-NEXT: [[TMP23:%.*]] = icmp sgt i32 [[TMP21]], [[TMP22]] -; AVX-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP21]], i32 undef -; AVX-NEXT: [[TMP25:%.*]] = extractelement <16 x i32> [[TMP2]], i32 8 -; AVX-NEXT: [[TMP26:%.*]] = icmp sgt i32 [[TMP24]], [[TMP25]] -; AVX-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP24]], i32 undef -; AVX-NEXT: [[TMP28:%.*]] = extractelement <16 x i32> [[TMP2]], i32 9 -; AVX-NEXT: [[TMP29:%.*]] = icmp sgt i32 [[TMP27]], [[TMP28]] -; AVX-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP27]], i32 undef -; AVX-NEXT: [[TMP31:%.*]] = extractelement <16 x i32> [[TMP2]], i32 10 -; AVX-NEXT: [[TMP32:%.*]] = icmp sgt i32 [[TMP30]], [[TMP31]] -; AVX-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP30]], i32 undef -; AVX-NEXT: [[TMP34:%.*]] = extractelement <16 x i32> [[TMP2]], i32 11 -; AVX-NEXT: [[TMP35:%.*]] = icmp sgt i32 [[TMP33]], [[TMP34]] -; AVX-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[TMP33]], i32 undef -; AVX-NEXT: [[TMP37:%.*]] = extractelement <16 x i32> [[TMP2]], i32 12 -; AVX-NEXT: [[TMP38:%.*]] = icmp sgt i32 [[TMP36]], [[TMP37]] -; AVX-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[TMP36]], i32 undef -; AVX-NEXT: [[TMP40:%.*]] = extractelement <16 x i32> [[TMP2]], i32 13 -; AVX-NEXT: [[TMP41:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]] -; AVX-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i32 [[TMP39]], i32 undef -; AVX-NEXT: [[TMP43:%.*]] = extractelement <16 x i32> [[TMP2]], i32 14 -; AVX-NEXT: [[TMP44:%.*]] = icmp sgt i32 [[TMP42]], [[TMP43]] -; AVX-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP42]], i32 undef -; AVX-NEXT: [[TMP46:%.*]] = extractelement <16 x i32> [[TMP2]], i32 15 -; AVX-NEXT: [[TMP47:%.*]] = icmp sgt i32 [[TMP45]], [[TMP46]] ; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> undef, <16 x i32> ; AVX-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <16 x i32> [[TMP2]], [[RDX_SHUF]] ; AVX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = 
select <16 x i1> [[RDX_MINMAX_CMP]], <16 x i32> [[TMP2]], <16 x i32> [[RDX_SHUF]] @@ -276,57 +133,11 @@ ; AVX-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT6]], <16 x i32> undef, <16 x i32> ; AVX-NEXT: [[RDX_MINMAX_CMP8:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]] ; AVX-NEXT: [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x i32> [[RDX_MINMAX_SELECT6]], <16 x i32> [[RDX_SHUF7]] -; AVX-NEXT: [[TMP48:%.*]] = extractelement <16 x i32> [[RDX_MINMAX_SELECT9]], i32 0 -; AVX-NEXT: [[TMP49:%.*]] = select i1 [[TMP47]], i32 [[TMP45]], i32 undef -; AVX-NEXT: ret i32 [[TMP48]] +; AVX-NEXT: [[TMP3:%.*]] = extractelement <16 x i32> [[RDX_MINMAX_SELECT9]], i32 0 +; AVX-NEXT: ret i32 [[TMP3]] ; ; AVX2-LABEL: @maxi16( ; AVX2-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr to <16 x i32>*), align 16 -; AVX2-NEXT: [[TMP3:%.*]] = extractelement <16 x i32> [[TMP2]], i32 0 -; AVX2-NEXT: [[TMP4:%.*]] = extractelement <16 x i32> [[TMP2]], i32 1 -; AVX2-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]] -; AVX2-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 undef, i32 undef -; AVX2-NEXT: [[TMP7:%.*]] = extractelement <16 x i32> [[TMP2]], i32 2 -; AVX2-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]] -; AVX2-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 undef -; AVX2-NEXT: [[TMP10:%.*]] = extractelement <16 x i32> [[TMP2]], i32 3 -; AVX2-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -; AVX2-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef -; AVX2-NEXT: [[TMP13:%.*]] = extractelement <16 x i32> [[TMP2]], i32 4 -; AVX2-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -; AVX2-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef -; AVX2-NEXT: [[TMP16:%.*]] = extractelement <16 x i32> [[TMP2]], i32 5 -; AVX2-NEXT: [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]] -; AVX2-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef -; AVX2-NEXT: [[TMP19:%.*]] = extractelement <16 x i32> [[TMP2]], i32 6 -; AVX2-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]] -; AVX2-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 undef -; AVX2-NEXT: [[TMP22:%.*]] = extractelement <16 x i32> [[TMP2]], i32 7 -; AVX2-NEXT: [[TMP23:%.*]] = icmp sgt i32 [[TMP21]], [[TMP22]] -; AVX2-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP21]], i32 undef -; AVX2-NEXT: [[TMP25:%.*]] = extractelement <16 x i32> [[TMP2]], i32 8 -; AVX2-NEXT: [[TMP26:%.*]] = icmp sgt i32 [[TMP24]], [[TMP25]] -; AVX2-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP24]], i32 undef -; AVX2-NEXT: [[TMP28:%.*]] = extractelement <16 x i32> [[TMP2]], i32 9 -; AVX2-NEXT: [[TMP29:%.*]] = icmp sgt i32 [[TMP27]], [[TMP28]] -; AVX2-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP27]], i32 undef -; AVX2-NEXT: [[TMP31:%.*]] = extractelement <16 x i32> [[TMP2]], i32 10 -; AVX2-NEXT: [[TMP32:%.*]] = icmp sgt i32 [[TMP30]], [[TMP31]] -; AVX2-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP30]], i32 undef -; AVX2-NEXT: [[TMP34:%.*]] = extractelement <16 x i32> [[TMP2]], i32 11 -; AVX2-NEXT: [[TMP35:%.*]] = icmp sgt i32 [[TMP33]], [[TMP34]] -; AVX2-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[TMP33]], i32 undef -; AVX2-NEXT: [[TMP37:%.*]] = extractelement <16 x i32> [[TMP2]], i32 12 -; AVX2-NEXT: [[TMP38:%.*]] = icmp sgt i32 [[TMP36]], [[TMP37]] -; AVX2-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[TMP36]], i32 undef -; AVX2-NEXT: [[TMP40:%.*]] = extractelement <16 x i32> [[TMP2]], 
i32 13 -; AVX2-NEXT: [[TMP41:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]] -; AVX2-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i32 [[TMP39]], i32 undef -; AVX2-NEXT: [[TMP43:%.*]] = extractelement <16 x i32> [[TMP2]], i32 14 -; AVX2-NEXT: [[TMP44:%.*]] = icmp sgt i32 [[TMP42]], [[TMP43]] -; AVX2-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP42]], i32 undef -; AVX2-NEXT: [[TMP46:%.*]] = extractelement <16 x i32> [[TMP2]], i32 15 -; AVX2-NEXT: [[TMP47:%.*]] = icmp sgt i32 [[TMP45]], [[TMP46]] ; AVX2-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> undef, <16 x i32> ; AVX2-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <16 x i32> [[TMP2]], [[RDX_SHUF]] ; AVX2-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x i32> [[TMP2]], <16 x i32> [[RDX_SHUF]] @@ -339,57 +150,11 @@ ; AVX2-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT6]], <16 x i32> undef, <16 x i32> ; AVX2-NEXT: [[RDX_MINMAX_CMP8:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]] ; AVX2-NEXT: [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x i32> [[RDX_MINMAX_SELECT6]], <16 x i32> [[RDX_SHUF7]] -; AVX2-NEXT: [[TMP48:%.*]] = extractelement <16 x i32> [[RDX_MINMAX_SELECT9]], i32 0 -; AVX2-NEXT: [[TMP49:%.*]] = select i1 [[TMP47]], i32 [[TMP45]], i32 undef -; AVX2-NEXT: ret i32 [[TMP48]] +; AVX2-NEXT: [[TMP3:%.*]] = extractelement <16 x i32> [[RDX_MINMAX_SELECT9]], i32 0 +; AVX2-NEXT: ret i32 [[TMP3]] ; ; SKX-LABEL: @maxi16( ; SKX-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr to <16 x i32>*), align 16 -; SKX-NEXT: [[TMP3:%.*]] = extractelement <16 x i32> [[TMP2]], i32 0 -; SKX-NEXT: [[TMP4:%.*]] = extractelement <16 x i32> [[TMP2]], i32 1 -; SKX-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]] -; SKX-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 undef, i32 undef -; SKX-NEXT: [[TMP7:%.*]] = extractelement <16 x i32> [[TMP2]], i32 2 -; SKX-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]] -; SKX-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 undef -; SKX-NEXT: [[TMP10:%.*]] = extractelement <16 x i32> [[TMP2]], i32 3 -; SKX-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -; SKX-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef -; SKX-NEXT: [[TMP13:%.*]] = extractelement <16 x i32> [[TMP2]], i32 4 -; SKX-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -; SKX-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef -; SKX-NEXT: [[TMP16:%.*]] = extractelement <16 x i32> [[TMP2]], i32 5 -; SKX-NEXT: [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]] -; SKX-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef -; SKX-NEXT: [[TMP19:%.*]] = extractelement <16 x i32> [[TMP2]], i32 6 -; SKX-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]] -; SKX-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 undef -; SKX-NEXT: [[TMP22:%.*]] = extractelement <16 x i32> [[TMP2]], i32 7 -; SKX-NEXT: [[TMP23:%.*]] = icmp sgt i32 [[TMP21]], [[TMP22]] -; SKX-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP21]], i32 undef -; SKX-NEXT: [[TMP25:%.*]] = extractelement <16 x i32> [[TMP2]], i32 8 -; SKX-NEXT: [[TMP26:%.*]] = icmp sgt i32 [[TMP24]], [[TMP25]] -; SKX-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP24]], i32 undef -; SKX-NEXT: [[TMP28:%.*]] = extractelement <16 x i32> [[TMP2]], i32 9 -; SKX-NEXT: [[TMP29:%.*]] = icmp sgt i32 [[TMP27]], [[TMP28]] -; SKX-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP27]], i32 undef -; SKX-NEXT: 
[[TMP31:%.*]] = extractelement <16 x i32> [[TMP2]], i32 10 -; SKX-NEXT: [[TMP32:%.*]] = icmp sgt i32 [[TMP30]], [[TMP31]] -; SKX-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP30]], i32 undef -; SKX-NEXT: [[TMP34:%.*]] = extractelement <16 x i32> [[TMP2]], i32 11 -; SKX-NEXT: [[TMP35:%.*]] = icmp sgt i32 [[TMP33]], [[TMP34]] -; SKX-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[TMP33]], i32 undef -; SKX-NEXT: [[TMP37:%.*]] = extractelement <16 x i32> [[TMP2]], i32 12 -; SKX-NEXT: [[TMP38:%.*]] = icmp sgt i32 [[TMP36]], [[TMP37]] -; SKX-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[TMP36]], i32 undef -; SKX-NEXT: [[TMP40:%.*]] = extractelement <16 x i32> [[TMP2]], i32 13 -; SKX-NEXT: [[TMP41:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]] -; SKX-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i32 [[TMP39]], i32 undef -; SKX-NEXT: [[TMP43:%.*]] = extractelement <16 x i32> [[TMP2]], i32 14 -; SKX-NEXT: [[TMP44:%.*]] = icmp sgt i32 [[TMP42]], [[TMP43]] -; SKX-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP42]], i32 undef -; SKX-NEXT: [[TMP46:%.*]] = extractelement <16 x i32> [[TMP2]], i32 15 -; SKX-NEXT: [[TMP47:%.*]] = icmp sgt i32 [[TMP45]], [[TMP46]] ; SKX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> undef, <16 x i32> ; SKX-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <16 x i32> [[TMP2]], [[RDX_SHUF]] ; SKX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x i32> [[TMP2]], <16 x i32> [[RDX_SHUF]] @@ -402,9 +167,8 @@ ; SKX-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT6]], <16 x i32> undef, <16 x i32> ; SKX-NEXT: [[RDX_MINMAX_CMP8:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]] ; SKX-NEXT: [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x i32> [[RDX_MINMAX_SELECT6]], <16 x i32> [[RDX_SHUF7]] -; SKX-NEXT: [[TMP48:%.*]] = extractelement <16 x i32> [[RDX_MINMAX_SELECT9]], i32 0 -; SKX-NEXT: [[TMP49:%.*]] = select i1 [[TMP47]], i32 [[TMP45]], i32 undef -; SKX-NEXT: ret i32 [[TMP48]] +; SKX-NEXT: [[TMP3:%.*]] = extractelement <16 x i32> [[RDX_MINMAX_SELECT9]], i32 0 +; SKX-NEXT: ret i32 [[TMP3]] ; %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 @@ -458,99 +222,6 @@ define i32 @maxi32(i32) { ; CHECK-LABEL: @maxi32( ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <32 x i32> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <32 x i32> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 undef, i32 undef -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <32 x i32> [[TMP2]], i32 2 -; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 undef -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <32 x i32> [[TMP2]], i32 3 -; CHECK-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <32 x i32> [[TMP2]], i32 4 -; CHECK-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <32 x i32> [[TMP2]], i32 5 -; CHECK-NEXT: [[TMP17:%.*]] = icmp sgt i32 
[[TMP15]], [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <32 x i32> [[TMP2]], i32 6 -; CHECK-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 undef -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <32 x i32> [[TMP2]], i32 7 -; CHECK-NEXT: [[TMP23:%.*]] = icmp sgt i32 [[TMP21]], [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP21]], i32 undef -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <32 x i32> [[TMP2]], i32 8 -; CHECK-NEXT: [[TMP26:%.*]] = icmp sgt i32 [[TMP24]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP24]], i32 undef -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <32 x i32> [[TMP2]], i32 9 -; CHECK-NEXT: [[TMP29:%.*]] = icmp sgt i32 [[TMP27]], [[TMP28]] -; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP27]], i32 undef -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <32 x i32> [[TMP2]], i32 10 -; CHECK-NEXT: [[TMP32:%.*]] = icmp sgt i32 [[TMP30]], [[TMP31]] -; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP30]], i32 undef -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <32 x i32> [[TMP2]], i32 11 -; CHECK-NEXT: [[TMP35:%.*]] = icmp sgt i32 [[TMP33]], [[TMP34]] -; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[TMP33]], i32 undef -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <32 x i32> [[TMP2]], i32 12 -; CHECK-NEXT: [[TMP38:%.*]] = icmp sgt i32 [[TMP36]], [[TMP37]] -; CHECK-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[TMP36]], i32 undef -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <32 x i32> [[TMP2]], i32 13 -; CHECK-NEXT: [[TMP41:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]] -; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i32 [[TMP39]], i32 undef -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <32 x i32> [[TMP2]], i32 14 -; CHECK-NEXT: [[TMP44:%.*]] = icmp sgt i32 [[TMP42]], [[TMP43]] -; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP42]], i32 undef -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <32 x i32> [[TMP2]], i32 15 -; CHECK-NEXT: [[TMP47:%.*]] = icmp sgt i32 [[TMP45]], [[TMP46]] -; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], i32 [[TMP45]], i32 undef -; CHECK-NEXT: [[TMP49:%.*]] = extractelement <32 x i32> [[TMP2]], i32 16 -; CHECK-NEXT: [[TMP50:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]] -; CHECK-NEXT: [[TMP51:%.*]] = select i1 [[TMP50]], i32 [[TMP48]], i32 undef -; CHECK-NEXT: [[TMP52:%.*]] = extractelement <32 x i32> [[TMP2]], i32 17 -; CHECK-NEXT: [[TMP53:%.*]] = icmp sgt i32 [[TMP51]], [[TMP52]] -; CHECK-NEXT: [[TMP54:%.*]] = select i1 [[TMP53]], i32 [[TMP51]], i32 undef -; CHECK-NEXT: [[TMP55:%.*]] = extractelement <32 x i32> [[TMP2]], i32 18 -; CHECK-NEXT: [[TMP56:%.*]] = icmp sgt i32 [[TMP54]], [[TMP55]] -; CHECK-NEXT: [[TMP57:%.*]] = select i1 [[TMP56]], i32 [[TMP54]], i32 undef -; CHECK-NEXT: [[TMP58:%.*]] = extractelement <32 x i32> [[TMP2]], i32 19 -; CHECK-NEXT: [[TMP59:%.*]] = icmp sgt i32 [[TMP57]], [[TMP58]] -; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 [[TMP57]], i32 undef -; CHECK-NEXT: [[TMP61:%.*]] = extractelement <32 x i32> [[TMP2]], i32 20 -; CHECK-NEXT: [[TMP62:%.*]] = icmp sgt i32 [[TMP60]], [[TMP61]] -; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP62]], i32 [[TMP60]], i32 undef -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <32 x i32> [[TMP2]], i32 21 -; CHECK-NEXT: [[TMP65:%.*]] = icmp sgt i32 [[TMP63]], [[TMP64]] -; CHECK-NEXT: [[TMP66:%.*]] = select i1 [[TMP65]], i32 [[TMP63]], i32 undef -; 
CHECK-NEXT: [[TMP67:%.*]] = extractelement <32 x i32> [[TMP2]], i32 22 -; CHECK-NEXT: [[TMP68:%.*]] = icmp sgt i32 [[TMP66]], [[TMP67]] -; CHECK-NEXT: [[TMP69:%.*]] = select i1 [[TMP68]], i32 [[TMP66]], i32 undef -; CHECK-NEXT: [[TMP70:%.*]] = extractelement <32 x i32> [[TMP2]], i32 23 -; CHECK-NEXT: [[TMP71:%.*]] = icmp sgt i32 [[TMP69]], [[TMP70]] -; CHECK-NEXT: [[TMP72:%.*]] = select i1 [[TMP71]], i32 [[TMP69]], i32 undef -; CHECK-NEXT: [[TMP73:%.*]] = extractelement <32 x i32> [[TMP2]], i32 24 -; CHECK-NEXT: [[TMP74:%.*]] = icmp sgt i32 [[TMP72]], [[TMP73]] -; CHECK-NEXT: [[TMP75:%.*]] = select i1 [[TMP74]], i32 [[TMP72]], i32 undef -; CHECK-NEXT: [[TMP76:%.*]] = extractelement <32 x i32> [[TMP2]], i32 25 -; CHECK-NEXT: [[TMP77:%.*]] = icmp sgt i32 [[TMP75]], [[TMP76]] -; CHECK-NEXT: [[TMP78:%.*]] = select i1 [[TMP77]], i32 [[TMP75]], i32 undef -; CHECK-NEXT: [[TMP79:%.*]] = extractelement <32 x i32> [[TMP2]], i32 26 -; CHECK-NEXT: [[TMP80:%.*]] = icmp sgt i32 [[TMP78]], [[TMP79]] -; CHECK-NEXT: [[TMP81:%.*]] = select i1 [[TMP80]], i32 [[TMP78]], i32 undef -; CHECK-NEXT: [[TMP82:%.*]] = extractelement <32 x i32> [[TMP2]], i32 27 -; CHECK-NEXT: [[TMP83:%.*]] = icmp sgt i32 [[TMP81]], [[TMP82]] -; CHECK-NEXT: [[TMP84:%.*]] = select i1 [[TMP83]], i32 [[TMP81]], i32 undef -; CHECK-NEXT: [[TMP85:%.*]] = extractelement <32 x i32> [[TMP2]], i32 28 -; CHECK-NEXT: [[TMP86:%.*]] = icmp sgt i32 [[TMP84]], [[TMP85]] -; CHECK-NEXT: [[TMP87:%.*]] = select i1 [[TMP86]], i32 [[TMP84]], i32 undef -; CHECK-NEXT: [[TMP88:%.*]] = extractelement <32 x i32> [[TMP2]], i32 29 -; CHECK-NEXT: [[TMP89:%.*]] = icmp sgt i32 [[TMP87]], [[TMP88]] -; CHECK-NEXT: [[TMP90:%.*]] = select i1 [[TMP89]], i32 [[TMP87]], i32 undef -; CHECK-NEXT: [[TMP91:%.*]] = extractelement <32 x i32> [[TMP2]], i32 30 -; CHECK-NEXT: [[TMP92:%.*]] = icmp sgt i32 [[TMP90]], [[TMP91]] -; CHECK-NEXT: [[TMP93:%.*]] = select i1 [[TMP92]], i32 [[TMP90]], i32 undef -; CHECK-NEXT: [[TMP94:%.*]] = extractelement <32 x i32> [[TMP2]], i32 31 -; CHECK-NEXT: [[TMP95:%.*]] = icmp sgt i32 [[TMP93]], [[TMP94]] ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32> ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]] @@ -566,105 +237,11 @@ ; CHECK-NEXT: [[RDX_SHUF10:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT9]], <32 x i32> undef, <32 x i32> ; CHECK-NEXT: [[RDX_MINMAX_CMP11:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT9]], [[RDX_SHUF10]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT12:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP11]], <32 x i32> [[RDX_MINMAX_SELECT9]], <32 x i32> [[RDX_SHUF10]] -; CHECK-NEXT: [[TMP96:%.*]] = extractelement <32 x i32> [[RDX_MINMAX_SELECT12]], i32 0 -; CHECK-NEXT: [[TMP97:%.*]] = select i1 [[TMP95]], i32 [[TMP93]], i32 undef -; CHECK-NEXT: ret i32 [[TMP96]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <32 x i32> [[RDX_MINMAX_SELECT12]], i32 0 +; CHECK-NEXT: ret i32 [[TMP3]] ; ; AVX-LABEL: @maxi32( ; AVX-NEXT: [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16 -; AVX-NEXT: [[TMP3:%.*]] = extractelement <32 x i32> [[TMP2]], i32 0 -; AVX-NEXT: [[TMP4:%.*]] = extractelement <32 x i32> [[TMP2]], i32 1 -; AVX-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]] -; AVX-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 undef, i32 undef -; AVX-NEXT: [[TMP7:%.*]] = extractelement <32 x i32> [[TMP2]], i32 2 -; 
AVX-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]] -; AVX-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 undef -; AVX-NEXT: [[TMP10:%.*]] = extractelement <32 x i32> [[TMP2]], i32 3 -; AVX-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -; AVX-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef -; AVX-NEXT: [[TMP13:%.*]] = extractelement <32 x i32> [[TMP2]], i32 4 -; AVX-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -; AVX-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef -; AVX-NEXT: [[TMP16:%.*]] = extractelement <32 x i32> [[TMP2]], i32 5 -; AVX-NEXT: [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]] -; AVX-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef -; AVX-NEXT: [[TMP19:%.*]] = extractelement <32 x i32> [[TMP2]], i32 6 -; AVX-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]] -; AVX-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 undef -; AVX-NEXT: [[TMP22:%.*]] = extractelement <32 x i32> [[TMP2]], i32 7 -; AVX-NEXT: [[TMP23:%.*]] = icmp sgt i32 [[TMP21]], [[TMP22]] -; AVX-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP21]], i32 undef -; AVX-NEXT: [[TMP25:%.*]] = extractelement <32 x i32> [[TMP2]], i32 8 -; AVX-NEXT: [[TMP26:%.*]] = icmp sgt i32 [[TMP24]], [[TMP25]] -; AVX-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP24]], i32 undef -; AVX-NEXT: [[TMP28:%.*]] = extractelement <32 x i32> [[TMP2]], i32 9 -; AVX-NEXT: [[TMP29:%.*]] = icmp sgt i32 [[TMP27]], [[TMP28]] -; AVX-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP27]], i32 undef -; AVX-NEXT: [[TMP31:%.*]] = extractelement <32 x i32> [[TMP2]], i32 10 -; AVX-NEXT: [[TMP32:%.*]] = icmp sgt i32 [[TMP30]], [[TMP31]] -; AVX-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP30]], i32 undef -; AVX-NEXT: [[TMP34:%.*]] = extractelement <32 x i32> [[TMP2]], i32 11 -; AVX-NEXT: [[TMP35:%.*]] = icmp sgt i32 [[TMP33]], [[TMP34]] -; AVX-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[TMP33]], i32 undef -; AVX-NEXT: [[TMP37:%.*]] = extractelement <32 x i32> [[TMP2]], i32 12 -; AVX-NEXT: [[TMP38:%.*]] = icmp sgt i32 [[TMP36]], [[TMP37]] -; AVX-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[TMP36]], i32 undef -; AVX-NEXT: [[TMP40:%.*]] = extractelement <32 x i32> [[TMP2]], i32 13 -; AVX-NEXT: [[TMP41:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]] -; AVX-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i32 [[TMP39]], i32 undef -; AVX-NEXT: [[TMP43:%.*]] = extractelement <32 x i32> [[TMP2]], i32 14 -; AVX-NEXT: [[TMP44:%.*]] = icmp sgt i32 [[TMP42]], [[TMP43]] -; AVX-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP42]], i32 undef -; AVX-NEXT: [[TMP46:%.*]] = extractelement <32 x i32> [[TMP2]], i32 15 -; AVX-NEXT: [[TMP47:%.*]] = icmp sgt i32 [[TMP45]], [[TMP46]] -; AVX-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], i32 [[TMP45]], i32 undef -; AVX-NEXT: [[TMP49:%.*]] = extractelement <32 x i32> [[TMP2]], i32 16 -; AVX-NEXT: [[TMP50:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]] -; AVX-NEXT: [[TMP51:%.*]] = select i1 [[TMP50]], i32 [[TMP48]], i32 undef -; AVX-NEXT: [[TMP52:%.*]] = extractelement <32 x i32> [[TMP2]], i32 17 -; AVX-NEXT: [[TMP53:%.*]] = icmp sgt i32 [[TMP51]], [[TMP52]] -; AVX-NEXT: [[TMP54:%.*]] = select i1 [[TMP53]], i32 [[TMP51]], i32 undef -; AVX-NEXT: [[TMP55:%.*]] = extractelement <32 x i32> [[TMP2]], i32 18 -; AVX-NEXT: [[TMP56:%.*]] = icmp sgt i32 [[TMP54]], [[TMP55]] -; AVX-NEXT: [[TMP57:%.*]] = select i1 [[TMP56]], i32 [[TMP54]], i32 undef -; AVX-NEXT: [[TMP58:%.*]] = extractelement <32 x i32> [[TMP2]], i32 19 -; 
AVX-NEXT: [[TMP59:%.*]] = icmp sgt i32 [[TMP57]], [[TMP58]] -; AVX-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 [[TMP57]], i32 undef -; AVX-NEXT: [[TMP61:%.*]] = extractelement <32 x i32> [[TMP2]], i32 20 -; AVX-NEXT: [[TMP62:%.*]] = icmp sgt i32 [[TMP60]], [[TMP61]] -; AVX-NEXT: [[TMP63:%.*]] = select i1 [[TMP62]], i32 [[TMP60]], i32 undef -; AVX-NEXT: [[TMP64:%.*]] = extractelement <32 x i32> [[TMP2]], i32 21 -; AVX-NEXT: [[TMP65:%.*]] = icmp sgt i32 [[TMP63]], [[TMP64]] -; AVX-NEXT: [[TMP66:%.*]] = select i1 [[TMP65]], i32 [[TMP63]], i32 undef -; AVX-NEXT: [[TMP67:%.*]] = extractelement <32 x i32> [[TMP2]], i32 22 -; AVX-NEXT: [[TMP68:%.*]] = icmp sgt i32 [[TMP66]], [[TMP67]] -; AVX-NEXT: [[TMP69:%.*]] = select i1 [[TMP68]], i32 [[TMP66]], i32 undef -; AVX-NEXT: [[TMP70:%.*]] = extractelement <32 x i32> [[TMP2]], i32 23 -; AVX-NEXT: [[TMP71:%.*]] = icmp sgt i32 [[TMP69]], [[TMP70]] -; AVX-NEXT: [[TMP72:%.*]] = select i1 [[TMP71]], i32 [[TMP69]], i32 undef -; AVX-NEXT: [[TMP73:%.*]] = extractelement <32 x i32> [[TMP2]], i32 24 -; AVX-NEXT: [[TMP74:%.*]] = icmp sgt i32 [[TMP72]], [[TMP73]] -; AVX-NEXT: [[TMP75:%.*]] = select i1 [[TMP74]], i32 [[TMP72]], i32 undef -; AVX-NEXT: [[TMP76:%.*]] = extractelement <32 x i32> [[TMP2]], i32 25 -; AVX-NEXT: [[TMP77:%.*]] = icmp sgt i32 [[TMP75]], [[TMP76]] -; AVX-NEXT: [[TMP78:%.*]] = select i1 [[TMP77]], i32 [[TMP75]], i32 undef -; AVX-NEXT: [[TMP79:%.*]] = extractelement <32 x i32> [[TMP2]], i32 26 -; AVX-NEXT: [[TMP80:%.*]] = icmp sgt i32 [[TMP78]], [[TMP79]] -; AVX-NEXT: [[TMP81:%.*]] = select i1 [[TMP80]], i32 [[TMP78]], i32 undef -; AVX-NEXT: [[TMP82:%.*]] = extractelement <32 x i32> [[TMP2]], i32 27 -; AVX-NEXT: [[TMP83:%.*]] = icmp sgt i32 [[TMP81]], [[TMP82]] -; AVX-NEXT: [[TMP84:%.*]] = select i1 [[TMP83]], i32 [[TMP81]], i32 undef -; AVX-NEXT: [[TMP85:%.*]] = extractelement <32 x i32> [[TMP2]], i32 28 -; AVX-NEXT: [[TMP86:%.*]] = icmp sgt i32 [[TMP84]], [[TMP85]] -; AVX-NEXT: [[TMP87:%.*]] = select i1 [[TMP86]], i32 [[TMP84]], i32 undef -; AVX-NEXT: [[TMP88:%.*]] = extractelement <32 x i32> [[TMP2]], i32 29 -; AVX-NEXT: [[TMP89:%.*]] = icmp sgt i32 [[TMP87]], [[TMP88]] -; AVX-NEXT: [[TMP90:%.*]] = select i1 [[TMP89]], i32 [[TMP87]], i32 undef -; AVX-NEXT: [[TMP91:%.*]] = extractelement <32 x i32> [[TMP2]], i32 30 -; AVX-NEXT: [[TMP92:%.*]] = icmp sgt i32 [[TMP90]], [[TMP91]] -; AVX-NEXT: [[TMP93:%.*]] = select i1 [[TMP92]], i32 [[TMP90]], i32 undef -; AVX-NEXT: [[TMP94:%.*]] = extractelement <32 x i32> [[TMP2]], i32 31 -; AVX-NEXT: [[TMP95:%.*]] = icmp sgt i32 [[TMP93]], [[TMP94]] ; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32> ; AVX-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]] ; AVX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]] @@ -680,105 +257,11 @@ ; AVX-NEXT: [[RDX_SHUF10:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT9]], <32 x i32> undef, <32 x i32> ; AVX-NEXT: [[RDX_MINMAX_CMP11:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT9]], [[RDX_SHUF10]] ; AVX-NEXT: [[RDX_MINMAX_SELECT12:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP11]], <32 x i32> [[RDX_MINMAX_SELECT9]], <32 x i32> [[RDX_SHUF10]] -; AVX-NEXT: [[TMP96:%.*]] = extractelement <32 x i32> [[RDX_MINMAX_SELECT12]], i32 0 -; AVX-NEXT: [[TMP97:%.*]] = select i1 [[TMP95]], i32 [[TMP93]], i32 undef -; AVX-NEXT: ret i32 [[TMP96]] +; AVX-NEXT: [[TMP3:%.*]] = extractelement <32 x i32> [[RDX_MINMAX_SELECT12]], i32 0 +; AVX-NEXT: ret i32 
[[TMP3]] ; ; AVX2-LABEL: @maxi32( ; AVX2-NEXT: [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16 -; AVX2-NEXT: [[TMP3:%.*]] = extractelement <32 x i32> [[TMP2]], i32 0 -; AVX2-NEXT: [[TMP4:%.*]] = extractelement <32 x i32> [[TMP2]], i32 1 -; AVX2-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]] -; AVX2-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 undef, i32 undef -; AVX2-NEXT: [[TMP7:%.*]] = extractelement <32 x i32> [[TMP2]], i32 2 -; AVX2-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]] -; AVX2-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 undef -; AVX2-NEXT: [[TMP10:%.*]] = extractelement <32 x i32> [[TMP2]], i32 3 -; AVX2-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -; AVX2-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef -; AVX2-NEXT: [[TMP13:%.*]] = extractelement <32 x i32> [[TMP2]], i32 4 -; AVX2-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -; AVX2-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef -; AVX2-NEXT: [[TMP16:%.*]] = extractelement <32 x i32> [[TMP2]], i32 5 -; AVX2-NEXT: [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]] -; AVX2-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef -; AVX2-NEXT: [[TMP19:%.*]] = extractelement <32 x i32> [[TMP2]], i32 6 -; AVX2-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]] -; AVX2-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 undef -; AVX2-NEXT: [[TMP22:%.*]] = extractelement <32 x i32> [[TMP2]], i32 7 -; AVX2-NEXT: [[TMP23:%.*]] = icmp sgt i32 [[TMP21]], [[TMP22]] -; AVX2-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP21]], i32 undef -; AVX2-NEXT: [[TMP25:%.*]] = extractelement <32 x i32> [[TMP2]], i32 8 -; AVX2-NEXT: [[TMP26:%.*]] = icmp sgt i32 [[TMP24]], [[TMP25]] -; AVX2-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP24]], i32 undef -; AVX2-NEXT: [[TMP28:%.*]] = extractelement <32 x i32> [[TMP2]], i32 9 -; AVX2-NEXT: [[TMP29:%.*]] = icmp sgt i32 [[TMP27]], [[TMP28]] -; AVX2-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP27]], i32 undef -; AVX2-NEXT: [[TMP31:%.*]] = extractelement <32 x i32> [[TMP2]], i32 10 -; AVX2-NEXT: [[TMP32:%.*]] = icmp sgt i32 [[TMP30]], [[TMP31]] -; AVX2-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP30]], i32 undef -; AVX2-NEXT: [[TMP34:%.*]] = extractelement <32 x i32> [[TMP2]], i32 11 -; AVX2-NEXT: [[TMP35:%.*]] = icmp sgt i32 [[TMP33]], [[TMP34]] -; AVX2-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[TMP33]], i32 undef -; AVX2-NEXT: [[TMP37:%.*]] = extractelement <32 x i32> [[TMP2]], i32 12 -; AVX2-NEXT: [[TMP38:%.*]] = icmp sgt i32 [[TMP36]], [[TMP37]] -; AVX2-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[TMP36]], i32 undef -; AVX2-NEXT: [[TMP40:%.*]] = extractelement <32 x i32> [[TMP2]], i32 13 -; AVX2-NEXT: [[TMP41:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]] -; AVX2-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i32 [[TMP39]], i32 undef -; AVX2-NEXT: [[TMP43:%.*]] = extractelement <32 x i32> [[TMP2]], i32 14 -; AVX2-NEXT: [[TMP44:%.*]] = icmp sgt i32 [[TMP42]], [[TMP43]] -; AVX2-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP42]], i32 undef -; AVX2-NEXT: [[TMP46:%.*]] = extractelement <32 x i32> [[TMP2]], i32 15 -; AVX2-NEXT: [[TMP47:%.*]] = icmp sgt i32 [[TMP45]], [[TMP46]] -; AVX2-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], i32 [[TMP45]], i32 undef -; AVX2-NEXT: [[TMP49:%.*]] = extractelement <32 x i32> [[TMP2]], i32 16 -; AVX2-NEXT: [[TMP50:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]] -; AVX2-NEXT: [[TMP51:%.*]] = select 
i1 [[TMP50]], i32 [[TMP48]], i32 undef -; AVX2-NEXT: [[TMP52:%.*]] = extractelement <32 x i32> [[TMP2]], i32 17 -; AVX2-NEXT: [[TMP53:%.*]] = icmp sgt i32 [[TMP51]], [[TMP52]] -; AVX2-NEXT: [[TMP54:%.*]] = select i1 [[TMP53]], i32 [[TMP51]], i32 undef -; AVX2-NEXT: [[TMP55:%.*]] = extractelement <32 x i32> [[TMP2]], i32 18 -; AVX2-NEXT: [[TMP56:%.*]] = icmp sgt i32 [[TMP54]], [[TMP55]] -; AVX2-NEXT: [[TMP57:%.*]] = select i1 [[TMP56]], i32 [[TMP54]], i32 undef -; AVX2-NEXT: [[TMP58:%.*]] = extractelement <32 x i32> [[TMP2]], i32 19 -; AVX2-NEXT: [[TMP59:%.*]] = icmp sgt i32 [[TMP57]], [[TMP58]] -; AVX2-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 [[TMP57]], i32 undef -; AVX2-NEXT: [[TMP61:%.*]] = extractelement <32 x i32> [[TMP2]], i32 20 -; AVX2-NEXT: [[TMP62:%.*]] = icmp sgt i32 [[TMP60]], [[TMP61]] -; AVX2-NEXT: [[TMP63:%.*]] = select i1 [[TMP62]], i32 [[TMP60]], i32 undef -; AVX2-NEXT: [[TMP64:%.*]] = extractelement <32 x i32> [[TMP2]], i32 21 -; AVX2-NEXT: [[TMP65:%.*]] = icmp sgt i32 [[TMP63]], [[TMP64]] -; AVX2-NEXT: [[TMP66:%.*]] = select i1 [[TMP65]], i32 [[TMP63]], i32 undef -; AVX2-NEXT: [[TMP67:%.*]] = extractelement <32 x i32> [[TMP2]], i32 22 -; AVX2-NEXT: [[TMP68:%.*]] = icmp sgt i32 [[TMP66]], [[TMP67]] -; AVX2-NEXT: [[TMP69:%.*]] = select i1 [[TMP68]], i32 [[TMP66]], i32 undef -; AVX2-NEXT: [[TMP70:%.*]] = extractelement <32 x i32> [[TMP2]], i32 23 -; AVX2-NEXT: [[TMP71:%.*]] = icmp sgt i32 [[TMP69]], [[TMP70]] -; AVX2-NEXT: [[TMP72:%.*]] = select i1 [[TMP71]], i32 [[TMP69]], i32 undef -; AVX2-NEXT: [[TMP73:%.*]] = extractelement <32 x i32> [[TMP2]], i32 24 -; AVX2-NEXT: [[TMP74:%.*]] = icmp sgt i32 [[TMP72]], [[TMP73]] -; AVX2-NEXT: [[TMP75:%.*]] = select i1 [[TMP74]], i32 [[TMP72]], i32 undef -; AVX2-NEXT: [[TMP76:%.*]] = extractelement <32 x i32> [[TMP2]], i32 25 -; AVX2-NEXT: [[TMP77:%.*]] = icmp sgt i32 [[TMP75]], [[TMP76]] -; AVX2-NEXT: [[TMP78:%.*]] = select i1 [[TMP77]], i32 [[TMP75]], i32 undef -; AVX2-NEXT: [[TMP79:%.*]] = extractelement <32 x i32> [[TMP2]], i32 26 -; AVX2-NEXT: [[TMP80:%.*]] = icmp sgt i32 [[TMP78]], [[TMP79]] -; AVX2-NEXT: [[TMP81:%.*]] = select i1 [[TMP80]], i32 [[TMP78]], i32 undef -; AVX2-NEXT: [[TMP82:%.*]] = extractelement <32 x i32> [[TMP2]], i32 27 -; AVX2-NEXT: [[TMP83:%.*]] = icmp sgt i32 [[TMP81]], [[TMP82]] -; AVX2-NEXT: [[TMP84:%.*]] = select i1 [[TMP83]], i32 [[TMP81]], i32 undef -; AVX2-NEXT: [[TMP85:%.*]] = extractelement <32 x i32> [[TMP2]], i32 28 -; AVX2-NEXT: [[TMP86:%.*]] = icmp sgt i32 [[TMP84]], [[TMP85]] -; AVX2-NEXT: [[TMP87:%.*]] = select i1 [[TMP86]], i32 [[TMP84]], i32 undef -; AVX2-NEXT: [[TMP88:%.*]] = extractelement <32 x i32> [[TMP2]], i32 29 -; AVX2-NEXT: [[TMP89:%.*]] = icmp sgt i32 [[TMP87]], [[TMP88]] -; AVX2-NEXT: [[TMP90:%.*]] = select i1 [[TMP89]], i32 [[TMP87]], i32 undef -; AVX2-NEXT: [[TMP91:%.*]] = extractelement <32 x i32> [[TMP2]], i32 30 -; AVX2-NEXT: [[TMP92:%.*]] = icmp sgt i32 [[TMP90]], [[TMP91]] -; AVX2-NEXT: [[TMP93:%.*]] = select i1 [[TMP92]], i32 [[TMP90]], i32 undef -; AVX2-NEXT: [[TMP94:%.*]] = extractelement <32 x i32> [[TMP2]], i32 31 -; AVX2-NEXT: [[TMP95:%.*]] = icmp sgt i32 [[TMP93]], [[TMP94]] ; AVX2-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32> ; AVX2-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]] ; AVX2-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]] @@ -794,105 +277,11 @@ ; AVX2-NEXT: [[RDX_SHUF10:%.*]] = shufflevector <32 x i32> 
[[RDX_MINMAX_SELECT9]], <32 x i32> undef, <32 x i32> ; AVX2-NEXT: [[RDX_MINMAX_CMP11:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT9]], [[RDX_SHUF10]] ; AVX2-NEXT: [[RDX_MINMAX_SELECT12:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP11]], <32 x i32> [[RDX_MINMAX_SELECT9]], <32 x i32> [[RDX_SHUF10]] -; AVX2-NEXT: [[TMP96:%.*]] = extractelement <32 x i32> [[RDX_MINMAX_SELECT12]], i32 0 -; AVX2-NEXT: [[TMP97:%.*]] = select i1 [[TMP95]], i32 [[TMP93]], i32 undef -; AVX2-NEXT: ret i32 [[TMP96]] +; AVX2-NEXT: [[TMP3:%.*]] = extractelement <32 x i32> [[RDX_MINMAX_SELECT12]], i32 0 +; AVX2-NEXT: ret i32 [[TMP3]] ; ; SKX-LABEL: @maxi32( ; SKX-NEXT: [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16 -; SKX-NEXT: [[TMP3:%.*]] = extractelement <32 x i32> [[TMP2]], i32 0 -; SKX-NEXT: [[TMP4:%.*]] = extractelement <32 x i32> [[TMP2]], i32 1 -; SKX-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]] -; SKX-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 undef, i32 undef -; SKX-NEXT: [[TMP7:%.*]] = extractelement <32 x i32> [[TMP2]], i32 2 -; SKX-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]] -; SKX-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 undef -; SKX-NEXT: [[TMP10:%.*]] = extractelement <32 x i32> [[TMP2]], i32 3 -; SKX-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -; SKX-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef -; SKX-NEXT: [[TMP13:%.*]] = extractelement <32 x i32> [[TMP2]], i32 4 -; SKX-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -; SKX-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef -; SKX-NEXT: [[TMP16:%.*]] = extractelement <32 x i32> [[TMP2]], i32 5 -; SKX-NEXT: [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]] -; SKX-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef -; SKX-NEXT: [[TMP19:%.*]] = extractelement <32 x i32> [[TMP2]], i32 6 -; SKX-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]] -; SKX-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 undef -; SKX-NEXT: [[TMP22:%.*]] = extractelement <32 x i32> [[TMP2]], i32 7 -; SKX-NEXT: [[TMP23:%.*]] = icmp sgt i32 [[TMP21]], [[TMP22]] -; SKX-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP21]], i32 undef -; SKX-NEXT: [[TMP25:%.*]] = extractelement <32 x i32> [[TMP2]], i32 8 -; SKX-NEXT: [[TMP26:%.*]] = icmp sgt i32 [[TMP24]], [[TMP25]] -; SKX-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP24]], i32 undef -; SKX-NEXT: [[TMP28:%.*]] = extractelement <32 x i32> [[TMP2]], i32 9 -; SKX-NEXT: [[TMP29:%.*]] = icmp sgt i32 [[TMP27]], [[TMP28]] -; SKX-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP27]], i32 undef -; SKX-NEXT: [[TMP31:%.*]] = extractelement <32 x i32> [[TMP2]], i32 10 -; SKX-NEXT: [[TMP32:%.*]] = icmp sgt i32 [[TMP30]], [[TMP31]] -; SKX-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP30]], i32 undef -; SKX-NEXT: [[TMP34:%.*]] = extractelement <32 x i32> [[TMP2]], i32 11 -; SKX-NEXT: [[TMP35:%.*]] = icmp sgt i32 [[TMP33]], [[TMP34]] -; SKX-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[TMP33]], i32 undef -; SKX-NEXT: [[TMP37:%.*]] = extractelement <32 x i32> [[TMP2]], i32 12 -; SKX-NEXT: [[TMP38:%.*]] = icmp sgt i32 [[TMP36]], [[TMP37]] -; SKX-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[TMP36]], i32 undef -; SKX-NEXT: [[TMP40:%.*]] = extractelement <32 x i32> [[TMP2]], i32 13 -; SKX-NEXT: [[TMP41:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]] -; SKX-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i32 [[TMP39]], i32 undef -; SKX-NEXT: [[TMP43:%.*]] = extractelement 
<32 x i32> [[TMP2]], i32 14 -; SKX-NEXT: [[TMP44:%.*]] = icmp sgt i32 [[TMP42]], [[TMP43]] -; SKX-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP42]], i32 undef -; SKX-NEXT: [[TMP46:%.*]] = extractelement <32 x i32> [[TMP2]], i32 15 -; SKX-NEXT: [[TMP47:%.*]] = icmp sgt i32 [[TMP45]], [[TMP46]] -; SKX-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], i32 [[TMP45]], i32 undef -; SKX-NEXT: [[TMP49:%.*]] = extractelement <32 x i32> [[TMP2]], i32 16 -; SKX-NEXT: [[TMP50:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]] -; SKX-NEXT: [[TMP51:%.*]] = select i1 [[TMP50]], i32 [[TMP48]], i32 undef -; SKX-NEXT: [[TMP52:%.*]] = extractelement <32 x i32> [[TMP2]], i32 17 -; SKX-NEXT: [[TMP53:%.*]] = icmp sgt i32 [[TMP51]], [[TMP52]] -; SKX-NEXT: [[TMP54:%.*]] = select i1 [[TMP53]], i32 [[TMP51]], i32 undef -; SKX-NEXT: [[TMP55:%.*]] = extractelement <32 x i32> [[TMP2]], i32 18 -; SKX-NEXT: [[TMP56:%.*]] = icmp sgt i32 [[TMP54]], [[TMP55]] -; SKX-NEXT: [[TMP57:%.*]] = select i1 [[TMP56]], i32 [[TMP54]], i32 undef -; SKX-NEXT: [[TMP58:%.*]] = extractelement <32 x i32> [[TMP2]], i32 19 -; SKX-NEXT: [[TMP59:%.*]] = icmp sgt i32 [[TMP57]], [[TMP58]] -; SKX-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 [[TMP57]], i32 undef -; SKX-NEXT: [[TMP61:%.*]] = extractelement <32 x i32> [[TMP2]], i32 20 -; SKX-NEXT: [[TMP62:%.*]] = icmp sgt i32 [[TMP60]], [[TMP61]] -; SKX-NEXT: [[TMP63:%.*]] = select i1 [[TMP62]], i32 [[TMP60]], i32 undef -; SKX-NEXT: [[TMP64:%.*]] = extractelement <32 x i32> [[TMP2]], i32 21 -; SKX-NEXT: [[TMP65:%.*]] = icmp sgt i32 [[TMP63]], [[TMP64]] -; SKX-NEXT: [[TMP66:%.*]] = select i1 [[TMP65]], i32 [[TMP63]], i32 undef -; SKX-NEXT: [[TMP67:%.*]] = extractelement <32 x i32> [[TMP2]], i32 22 -; SKX-NEXT: [[TMP68:%.*]] = icmp sgt i32 [[TMP66]], [[TMP67]] -; SKX-NEXT: [[TMP69:%.*]] = select i1 [[TMP68]], i32 [[TMP66]], i32 undef -; SKX-NEXT: [[TMP70:%.*]] = extractelement <32 x i32> [[TMP2]], i32 23 -; SKX-NEXT: [[TMP71:%.*]] = icmp sgt i32 [[TMP69]], [[TMP70]] -; SKX-NEXT: [[TMP72:%.*]] = select i1 [[TMP71]], i32 [[TMP69]], i32 undef -; SKX-NEXT: [[TMP73:%.*]] = extractelement <32 x i32> [[TMP2]], i32 24 -; SKX-NEXT: [[TMP74:%.*]] = icmp sgt i32 [[TMP72]], [[TMP73]] -; SKX-NEXT: [[TMP75:%.*]] = select i1 [[TMP74]], i32 [[TMP72]], i32 undef -; SKX-NEXT: [[TMP76:%.*]] = extractelement <32 x i32> [[TMP2]], i32 25 -; SKX-NEXT: [[TMP77:%.*]] = icmp sgt i32 [[TMP75]], [[TMP76]] -; SKX-NEXT: [[TMP78:%.*]] = select i1 [[TMP77]], i32 [[TMP75]], i32 undef -; SKX-NEXT: [[TMP79:%.*]] = extractelement <32 x i32> [[TMP2]], i32 26 -; SKX-NEXT: [[TMP80:%.*]] = icmp sgt i32 [[TMP78]], [[TMP79]] -; SKX-NEXT: [[TMP81:%.*]] = select i1 [[TMP80]], i32 [[TMP78]], i32 undef -; SKX-NEXT: [[TMP82:%.*]] = extractelement <32 x i32> [[TMP2]], i32 27 -; SKX-NEXT: [[TMP83:%.*]] = icmp sgt i32 [[TMP81]], [[TMP82]] -; SKX-NEXT: [[TMP84:%.*]] = select i1 [[TMP83]], i32 [[TMP81]], i32 undef -; SKX-NEXT: [[TMP85:%.*]] = extractelement <32 x i32> [[TMP2]], i32 28 -; SKX-NEXT: [[TMP86:%.*]] = icmp sgt i32 [[TMP84]], [[TMP85]] -; SKX-NEXT: [[TMP87:%.*]] = select i1 [[TMP86]], i32 [[TMP84]], i32 undef -; SKX-NEXT: [[TMP88:%.*]] = extractelement <32 x i32> [[TMP2]], i32 29 -; SKX-NEXT: [[TMP89:%.*]] = icmp sgt i32 [[TMP87]], [[TMP88]] -; SKX-NEXT: [[TMP90:%.*]] = select i1 [[TMP89]], i32 [[TMP87]], i32 undef -; SKX-NEXT: [[TMP91:%.*]] = extractelement <32 x i32> [[TMP2]], i32 30 -; SKX-NEXT: [[TMP92:%.*]] = icmp sgt i32 [[TMP90]], [[TMP91]] -; SKX-NEXT: [[TMP93:%.*]] = select i1 [[TMP92]], i32 [[TMP90]], i32 undef -; SKX-NEXT: [[TMP94:%.*]] = 
extractelement <32 x i32> [[TMP2]], i32 31 -; SKX-NEXT: [[TMP95:%.*]] = icmp sgt i32 [[TMP93]], [[TMP94]] ; SKX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32> ; SKX-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]] ; SKX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]] @@ -908,9 +297,8 @@ ; SKX-NEXT: [[RDX_SHUF10:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT9]], <32 x i32> undef, <32 x i32> ; SKX-NEXT: [[RDX_MINMAX_CMP11:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT9]], [[RDX_SHUF10]] ; SKX-NEXT: [[RDX_MINMAX_SELECT12:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP11]], <32 x i32> [[RDX_MINMAX_SELECT9]], <32 x i32> [[RDX_SHUF10]] -; SKX-NEXT: [[TMP96:%.*]] = extractelement <32 x i32> [[RDX_MINMAX_SELECT12]], i32 0 -; SKX-NEXT: [[TMP97:%.*]] = select i1 [[TMP95]], i32 [[TMP93]], i32 undef -; SKX-NEXT: ret i32 [[TMP96]] +; SKX-NEXT: [[TMP3:%.*]] = extractelement <32 x i32> [[RDX_MINMAX_SELECT12]], i32 0 +; SKX-NEXT: ret i32 [[TMP3]] ; %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 @@ -1037,27 +425,6 @@ ; ; AVX-LABEL: @maxf8( ; AVX-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr1 to <8 x float>*), align 16 -; AVX-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP2]], i32 0 -; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP2]], i32 1 -; AVX-NEXT: [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]] -; AVX-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float undef, float undef -; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP2]], i32 2 -; AVX-NEXT: [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]] -; AVX-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float undef -; AVX-NEXT: [[TMP10:%.*]] = extractelement <8 x float> [[TMP2]], i32 3 -; AVX-NEXT: [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]] -; AVX-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP9]], float undef -; AVX-NEXT: [[TMP13:%.*]] = extractelement <8 x float> [[TMP2]], i32 4 -; AVX-NEXT: [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]] -; AVX-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float undef -; AVX-NEXT: [[TMP16:%.*]] = extractelement <8 x float> [[TMP2]], i32 5 -; AVX-NEXT: [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]] -; AVX-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float undef -; AVX-NEXT: [[TMP19:%.*]] = extractelement <8 x float> [[TMP2]], i32 6 -; AVX-NEXT: [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]] -; AVX-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float undef -; AVX-NEXT: [[TMP22:%.*]] = extractelement <8 x float> [[TMP2]], i32 7 -; AVX-NEXT: [[TMP23:%.*]] = fcmp fast ogt float [[TMP21]], [[TMP22]] ; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> undef, <8 x i32> ; AVX-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <8 x float> [[TMP2]], [[RDX_SHUF]] ; AVX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x float> [[TMP2]], <8 x float> [[RDX_SHUF]] @@ -1067,33 +434,11 @@ ; AVX-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x float> [[RDX_MINMAX_SELECT3]], <8 x float> undef, <8 x i32> ; AVX-NEXT: [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <8 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]] ; AVX-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> 
[[RDX_MINMAX_CMP5]], <8 x float> [[RDX_MINMAX_SELECT3]], <8 x float> [[RDX_SHUF4]]
-; AVX-NEXT: [[TMP24:%.*]] = extractelement <8 x float> [[RDX_MINMAX_SELECT6]], i32 0
-; AVX-NEXT: [[TMP25:%.*]] = select i1 [[TMP23]], float [[TMP21]], float undef
-; AVX-NEXT: ret float [[TMP24]]
+; AVX-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[RDX_MINMAX_SELECT6]], i32 0
+; AVX-NEXT: ret float [[TMP3]]
;
; AVX2-LABEL: @maxf8(
; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr1 to <8 x float>*), align 16
-; AVX2-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP2]], i32 0
-; AVX2-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP2]], i32 1
-; AVX2-NEXT: [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
-; AVX2-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float undef, float undef
-; AVX2-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP2]], i32 2
-; AVX2-NEXT: [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]]
-; AVX2-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float undef
-; AVX2-NEXT: [[TMP10:%.*]] = extractelement <8 x float> [[TMP2]], i32 3
-; AVX2-NEXT: [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]]
-; AVX2-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP9]], float undef
-; AVX2-NEXT: [[TMP13:%.*]] = extractelement <8 x float> [[TMP2]], i32 4
-; AVX2-NEXT: [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]]
-; AVX2-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float undef
-; AVX2-NEXT: [[TMP16:%.*]] = extractelement <8 x float> [[TMP2]], i32 5
-; AVX2-NEXT: [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]]
-; AVX2-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float undef
-; AVX2-NEXT: [[TMP19:%.*]] = extractelement <8 x float> [[TMP2]], i32 6
-; AVX2-NEXT: [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]]
-; AVX2-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float undef
-; AVX2-NEXT: [[TMP22:%.*]] = extractelement <8 x float> [[TMP2]], i32 7
-; AVX2-NEXT: [[TMP23:%.*]] = fcmp fast ogt float [[TMP21]], [[TMP22]]
; AVX2-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> undef, <8 x i32>
; AVX2-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <8 x float> [[TMP2]], [[RDX_SHUF]]
; AVX2-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x float> [[TMP2]], <8 x float> [[RDX_SHUF]]
@@ -1103,33 +448,11 @@
; AVX2-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x float> [[RDX_MINMAX_SELECT3]], <8 x float> undef, <8 x i32>
; AVX2-NEXT: [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <8 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
; AVX2-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x float> [[RDX_MINMAX_SELECT3]], <8 x float> [[RDX_SHUF4]]
-; AVX2-NEXT: [[TMP24:%.*]] = extractelement <8 x float> [[RDX_MINMAX_SELECT6]], i32 0
-; AVX2-NEXT: [[TMP25:%.*]] = select i1 [[TMP23]], float [[TMP21]], float undef
-; AVX2-NEXT: ret float [[TMP24]]
+; AVX2-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[RDX_MINMAX_SELECT6]], i32 0
+; AVX2-NEXT: ret float [[TMP3]]
;
; SKX-LABEL: @maxf8(
; SKX-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr1 to <8 x float>*), align 16
-; SKX-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP2]], i32 0
-; SKX-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP2]], i32 1
-; SKX-NEXT: [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
-; SKX-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float undef, float undef
-; SKX-NEXT: [[TMP7:%.*]] = extractelement <8 x
float> [[TMP2]], i32 2 -; SKX-NEXT: [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]] -; SKX-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float undef -; SKX-NEXT: [[TMP10:%.*]] = extractelement <8 x float> [[TMP2]], i32 3 -; SKX-NEXT: [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]] -; SKX-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP9]], float undef -; SKX-NEXT: [[TMP13:%.*]] = extractelement <8 x float> [[TMP2]], i32 4 -; SKX-NEXT: [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]] -; SKX-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float undef -; SKX-NEXT: [[TMP16:%.*]] = extractelement <8 x float> [[TMP2]], i32 5 -; SKX-NEXT: [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]] -; SKX-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float undef -; SKX-NEXT: [[TMP19:%.*]] = extractelement <8 x float> [[TMP2]], i32 6 -; SKX-NEXT: [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]] -; SKX-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float undef -; SKX-NEXT: [[TMP22:%.*]] = extractelement <8 x float> [[TMP2]], i32 7 -; SKX-NEXT: [[TMP23:%.*]] = fcmp fast ogt float [[TMP21]], [[TMP22]] ; SKX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> undef, <8 x i32> ; SKX-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <8 x float> [[TMP2]], [[RDX_SHUF]] ; SKX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x float> [[TMP2]], <8 x float> [[RDX_SHUF]] @@ -1139,9 +462,8 @@ ; SKX-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x float> [[RDX_MINMAX_SELECT3]], <8 x float> undef, <8 x i32> ; SKX-NEXT: [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <8 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]] ; SKX-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x float> [[RDX_MINMAX_SELECT3]], <8 x float> [[RDX_SHUF4]] -; SKX-NEXT: [[TMP24:%.*]] = extractelement <8 x float> [[RDX_MINMAX_SELECT6]], i32 0 -; SKX-NEXT: [[TMP25:%.*]] = select i1 [[TMP23]], float [[TMP21]], float undef -; SKX-NEXT: ret float [[TMP24]] +; SKX-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[RDX_MINMAX_SELECT6]], i32 0 +; SKX-NEXT: ret float [[TMP3]] ; %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 @@ -1220,51 +542,6 @@ ; ; AVX-LABEL: @maxf16( ; AVX-NEXT: [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr1 to <16 x float>*), align 16 -; AVX-NEXT: [[TMP3:%.*]] = extractelement <16 x float> [[TMP2]], i32 0 -; AVX-NEXT: [[TMP4:%.*]] = extractelement <16 x float> [[TMP2]], i32 1 -; AVX-NEXT: [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]] -; AVX-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float undef, float undef -; AVX-NEXT: [[TMP7:%.*]] = extractelement <16 x float> [[TMP2]], i32 2 -; AVX-NEXT: [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]] -; AVX-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float undef -; AVX-NEXT: [[TMP10:%.*]] = extractelement <16 x float> [[TMP2]], i32 3 -; AVX-NEXT: [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]] -; AVX-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP9]], float undef -; AVX-NEXT: [[TMP13:%.*]] = extractelement <16 x float> [[TMP2]], i32 4 -; AVX-NEXT: [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]] -; AVX-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float undef -; AVX-NEXT: [[TMP16:%.*]] = extractelement <16 x 
float> [[TMP2]], i32 5 -; AVX-NEXT: [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]] -; AVX-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float undef -; AVX-NEXT: [[TMP19:%.*]] = extractelement <16 x float> [[TMP2]], i32 6 -; AVX-NEXT: [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]] -; AVX-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float undef -; AVX-NEXT: [[TMP22:%.*]] = extractelement <16 x float> [[TMP2]], i32 7 -; AVX-NEXT: [[TMP23:%.*]] = fcmp fast ogt float [[TMP21]], [[TMP22]] -; AVX-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP21]], float undef -; AVX-NEXT: [[TMP25:%.*]] = extractelement <16 x float> [[TMP2]], i32 8 -; AVX-NEXT: [[TMP26:%.*]] = fcmp fast ogt float [[TMP24]], [[TMP25]] -; AVX-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], float [[TMP24]], float undef -; AVX-NEXT: [[TMP28:%.*]] = extractelement <16 x float> [[TMP2]], i32 9 -; AVX-NEXT: [[TMP29:%.*]] = fcmp fast ogt float [[TMP27]], [[TMP28]] -; AVX-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP27]], float undef -; AVX-NEXT: [[TMP31:%.*]] = extractelement <16 x float> [[TMP2]], i32 10 -; AVX-NEXT: [[TMP32:%.*]] = fcmp fast ogt float [[TMP30]], [[TMP31]] -; AVX-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], float [[TMP30]], float undef -; AVX-NEXT: [[TMP34:%.*]] = extractelement <16 x float> [[TMP2]], i32 11 -; AVX-NEXT: [[TMP35:%.*]] = fcmp fast ogt float [[TMP33]], [[TMP34]] -; AVX-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], float [[TMP33]], float undef -; AVX-NEXT: [[TMP37:%.*]] = extractelement <16 x float> [[TMP2]], i32 12 -; AVX-NEXT: [[TMP38:%.*]] = fcmp fast ogt float [[TMP36]], [[TMP37]] -; AVX-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], float [[TMP36]], float undef -; AVX-NEXT: [[TMP40:%.*]] = extractelement <16 x float> [[TMP2]], i32 13 -; AVX-NEXT: [[TMP41:%.*]] = fcmp fast ogt float [[TMP39]], [[TMP40]] -; AVX-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], float [[TMP39]], float undef -; AVX-NEXT: [[TMP43:%.*]] = extractelement <16 x float> [[TMP2]], i32 14 -; AVX-NEXT: [[TMP44:%.*]] = fcmp fast ogt float [[TMP42]], [[TMP43]] -; AVX-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], float [[TMP42]], float undef -; AVX-NEXT: [[TMP46:%.*]] = extractelement <16 x float> [[TMP2]], i32 15 -; AVX-NEXT: [[TMP47:%.*]] = fcmp fast ogt float [[TMP45]], [[TMP46]] ; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> undef, <16 x i32> ; AVX-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <16 x float> [[TMP2]], [[RDX_SHUF]] ; AVX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x float> [[TMP2]], <16 x float> [[RDX_SHUF]] @@ -1277,57 +554,11 @@ ; AVX-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT6]], <16 x float> undef, <16 x i32> ; AVX-NEXT: [[RDX_MINMAX_CMP8:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]] ; AVX-NEXT: [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x float> [[RDX_MINMAX_SELECT6]], <16 x float> [[RDX_SHUF7]] -; AVX-NEXT: [[TMP48:%.*]] = extractelement <16 x float> [[RDX_MINMAX_SELECT9]], i32 0 -; AVX-NEXT: [[TMP49:%.*]] = select i1 [[TMP47]], float [[TMP45]], float undef -; AVX-NEXT: ret float [[TMP48]] +; AVX-NEXT: [[TMP3:%.*]] = extractelement <16 x float> [[RDX_MINMAX_SELECT9]], i32 0 +; AVX-NEXT: ret float [[TMP3]] ; ; AVX2-LABEL: @maxf16( ; AVX2-NEXT: [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr1 to <16 x float>*), align 16 -; AVX2-NEXT: [[TMP3:%.*]] = extractelement <16 x float> [[TMP2]], i32 0 -; 
AVX2-NEXT: [[TMP4:%.*]] = extractelement <16 x float> [[TMP2]], i32 1 -; AVX2-NEXT: [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]] -; AVX2-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float undef, float undef -; AVX2-NEXT: [[TMP7:%.*]] = extractelement <16 x float> [[TMP2]], i32 2 -; AVX2-NEXT: [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]] -; AVX2-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float undef -; AVX2-NEXT: [[TMP10:%.*]] = extractelement <16 x float> [[TMP2]], i32 3 -; AVX2-NEXT: [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]] -; AVX2-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP9]], float undef -; AVX2-NEXT: [[TMP13:%.*]] = extractelement <16 x float> [[TMP2]], i32 4 -; AVX2-NEXT: [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]] -; AVX2-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float undef -; AVX2-NEXT: [[TMP16:%.*]] = extractelement <16 x float> [[TMP2]], i32 5 -; AVX2-NEXT: [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]] -; AVX2-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float undef -; AVX2-NEXT: [[TMP19:%.*]] = extractelement <16 x float> [[TMP2]], i32 6 -; AVX2-NEXT: [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]] -; AVX2-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float undef -; AVX2-NEXT: [[TMP22:%.*]] = extractelement <16 x float> [[TMP2]], i32 7 -; AVX2-NEXT: [[TMP23:%.*]] = fcmp fast ogt float [[TMP21]], [[TMP22]] -; AVX2-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP21]], float undef -; AVX2-NEXT: [[TMP25:%.*]] = extractelement <16 x float> [[TMP2]], i32 8 -; AVX2-NEXT: [[TMP26:%.*]] = fcmp fast ogt float [[TMP24]], [[TMP25]] -; AVX2-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], float [[TMP24]], float undef -; AVX2-NEXT: [[TMP28:%.*]] = extractelement <16 x float> [[TMP2]], i32 9 -; AVX2-NEXT: [[TMP29:%.*]] = fcmp fast ogt float [[TMP27]], [[TMP28]] -; AVX2-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP27]], float undef -; AVX2-NEXT: [[TMP31:%.*]] = extractelement <16 x float> [[TMP2]], i32 10 -; AVX2-NEXT: [[TMP32:%.*]] = fcmp fast ogt float [[TMP30]], [[TMP31]] -; AVX2-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], float [[TMP30]], float undef -; AVX2-NEXT: [[TMP34:%.*]] = extractelement <16 x float> [[TMP2]], i32 11 -; AVX2-NEXT: [[TMP35:%.*]] = fcmp fast ogt float [[TMP33]], [[TMP34]] -; AVX2-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], float [[TMP33]], float undef -; AVX2-NEXT: [[TMP37:%.*]] = extractelement <16 x float> [[TMP2]], i32 12 -; AVX2-NEXT: [[TMP38:%.*]] = fcmp fast ogt float [[TMP36]], [[TMP37]] -; AVX2-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], float [[TMP36]], float undef -; AVX2-NEXT: [[TMP40:%.*]] = extractelement <16 x float> [[TMP2]], i32 13 -; AVX2-NEXT: [[TMP41:%.*]] = fcmp fast ogt float [[TMP39]], [[TMP40]] -; AVX2-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], float [[TMP39]], float undef -; AVX2-NEXT: [[TMP43:%.*]] = extractelement <16 x float> [[TMP2]], i32 14 -; AVX2-NEXT: [[TMP44:%.*]] = fcmp fast ogt float [[TMP42]], [[TMP43]] -; AVX2-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], float [[TMP42]], float undef -; AVX2-NEXT: [[TMP46:%.*]] = extractelement <16 x float> [[TMP2]], i32 15 -; AVX2-NEXT: [[TMP47:%.*]] = fcmp fast ogt float [[TMP45]], [[TMP46]] ; AVX2-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> undef, <16 x i32> ; AVX2-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <16 x float> [[TMP2]], [[RDX_SHUF]] ; AVX2-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x 
float> [[TMP2]], <16 x float> [[RDX_SHUF]] @@ -1340,57 +571,11 @@ ; AVX2-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT6]], <16 x float> undef, <16 x i32> ; AVX2-NEXT: [[RDX_MINMAX_CMP8:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]] ; AVX2-NEXT: [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x float> [[RDX_MINMAX_SELECT6]], <16 x float> [[RDX_SHUF7]] -; AVX2-NEXT: [[TMP48:%.*]] = extractelement <16 x float> [[RDX_MINMAX_SELECT9]], i32 0 -; AVX2-NEXT: [[TMP49:%.*]] = select i1 [[TMP47]], float [[TMP45]], float undef -; AVX2-NEXT: ret float [[TMP48]] +; AVX2-NEXT: [[TMP3:%.*]] = extractelement <16 x float> [[RDX_MINMAX_SELECT9]], i32 0 +; AVX2-NEXT: ret float [[TMP3]] ; ; SKX-LABEL: @maxf16( ; SKX-NEXT: [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr1 to <16 x float>*), align 16 -; SKX-NEXT: [[TMP3:%.*]] = extractelement <16 x float> [[TMP2]], i32 0 -; SKX-NEXT: [[TMP4:%.*]] = extractelement <16 x float> [[TMP2]], i32 1 -; SKX-NEXT: [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]] -; SKX-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float undef, float undef -; SKX-NEXT: [[TMP7:%.*]] = extractelement <16 x float> [[TMP2]], i32 2 -; SKX-NEXT: [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]] -; SKX-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float undef -; SKX-NEXT: [[TMP10:%.*]] = extractelement <16 x float> [[TMP2]], i32 3 -; SKX-NEXT: [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]] -; SKX-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP9]], float undef -; SKX-NEXT: [[TMP13:%.*]] = extractelement <16 x float> [[TMP2]], i32 4 -; SKX-NEXT: [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]] -; SKX-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float undef -; SKX-NEXT: [[TMP16:%.*]] = extractelement <16 x float> [[TMP2]], i32 5 -; SKX-NEXT: [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]] -; SKX-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float undef -; SKX-NEXT: [[TMP19:%.*]] = extractelement <16 x float> [[TMP2]], i32 6 -; SKX-NEXT: [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]] -; SKX-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float undef -; SKX-NEXT: [[TMP22:%.*]] = extractelement <16 x float> [[TMP2]], i32 7 -; SKX-NEXT: [[TMP23:%.*]] = fcmp fast ogt float [[TMP21]], [[TMP22]] -; SKX-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP21]], float undef -; SKX-NEXT: [[TMP25:%.*]] = extractelement <16 x float> [[TMP2]], i32 8 -; SKX-NEXT: [[TMP26:%.*]] = fcmp fast ogt float [[TMP24]], [[TMP25]] -; SKX-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], float [[TMP24]], float undef -; SKX-NEXT: [[TMP28:%.*]] = extractelement <16 x float> [[TMP2]], i32 9 -; SKX-NEXT: [[TMP29:%.*]] = fcmp fast ogt float [[TMP27]], [[TMP28]] -; SKX-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP27]], float undef -; SKX-NEXT: [[TMP31:%.*]] = extractelement <16 x float> [[TMP2]], i32 10 -; SKX-NEXT: [[TMP32:%.*]] = fcmp fast ogt float [[TMP30]], [[TMP31]] -; SKX-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], float [[TMP30]], float undef -; SKX-NEXT: [[TMP34:%.*]] = extractelement <16 x float> [[TMP2]], i32 11 -; SKX-NEXT: [[TMP35:%.*]] = fcmp fast ogt float [[TMP33]], [[TMP34]] -; SKX-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], float [[TMP33]], float undef -; SKX-NEXT: [[TMP37:%.*]] = extractelement <16 x float> [[TMP2]], i32 12 -; SKX-NEXT: [[TMP38:%.*]] = fcmp fast ogt float [[TMP36]], [[TMP37]] -; SKX-NEXT: 
[[TMP39:%.*]] = select i1 [[TMP38]], float [[TMP36]], float undef
-; SKX-NEXT: [[TMP40:%.*]] = extractelement <16 x float> [[TMP2]], i32 13
-; SKX-NEXT: [[TMP41:%.*]] = fcmp fast ogt float [[TMP39]], [[TMP40]]
-; SKX-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], float [[TMP39]], float undef
-; SKX-NEXT: [[TMP43:%.*]] = extractelement <16 x float> [[TMP2]], i32 14
-; SKX-NEXT: [[TMP44:%.*]] = fcmp fast ogt float [[TMP42]], [[TMP43]]
-; SKX-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], float [[TMP42]], float undef
-; SKX-NEXT: [[TMP46:%.*]] = extractelement <16 x float> [[TMP2]], i32 15
-; SKX-NEXT: [[TMP47:%.*]] = fcmp fast ogt float [[TMP45]], [[TMP46]]
; SKX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> undef, <16 x i32>
; SKX-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <16 x float> [[TMP2]], [[RDX_SHUF]]
; SKX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x float> [[TMP2]], <16 x float> [[RDX_SHUF]]
@@ -1403,9 +588,8 @@
; SKX-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT6]], <16 x float> undef, <16 x i32>
; SKX-NEXT: [[RDX_MINMAX_CMP8:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
; SKX-NEXT: [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x float> [[RDX_MINMAX_SELECT6]], <16 x float> [[RDX_SHUF7]]
-; SKX-NEXT: [[TMP48:%.*]] = extractelement <16 x float> [[RDX_MINMAX_SELECT9]], i32 0
-; SKX-NEXT: [[TMP49:%.*]] = select i1 [[TMP47]], float [[TMP45]], float undef
-; SKX-NEXT: ret float [[TMP48]]
+; SKX-NEXT: [[TMP3:%.*]] = extractelement <16 x float> [[RDX_MINMAX_SELECT9]], i32 0
+; SKX-NEXT: ret float [[TMP3]]
;
%2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
%3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
@@ -1556,99 +740,6 @@
;
; AVX-LABEL: @maxf32(
; AVX-NEXT: [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16
-; AVX-NEXT: [[TMP3:%.*]] = extractelement <32 x float> [[TMP2]], i32 0
-; AVX-NEXT: [[TMP4:%.*]] = extractelement <32 x float> [[TMP2]], i32 1
-; AVX-NEXT: [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
-; AVX-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float undef, float undef
-; AVX-NEXT: [[TMP7:%.*]] = extractelement <32 x float> [[TMP2]], i32 2
-; AVX-NEXT: [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]]
-; AVX-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float undef
-; AVX-NEXT: [[TMP10:%.*]] = extractelement <32 x float> [[TMP2]], i32 3
-; AVX-NEXT: [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]]
-; AVX-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP9]], float undef
-; AVX-NEXT: [[TMP13:%.*]] = extractelement <32 x float> [[TMP2]], i32 4
-; AVX-NEXT: [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]]
-; AVX-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float undef
-; AVX-NEXT: [[TMP16:%.*]] = extractelement <32 x float> [[TMP2]], i32 5
-; AVX-NEXT: [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]]
-; AVX-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float undef
-; AVX-NEXT: [[TMP19:%.*]] = extractelement <32 x float> [[TMP2]], i32 6
-; AVX-NEXT: [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]]
-; AVX-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float undef
-; AVX-NEXT: [[TMP22:%.*]] = extractelement <32 x float> [[TMP2]], i32 7
-; AVX-NEXT: [[TMP23:%.*]] = fcmp fast ogt float
[[TMP21]], [[TMP22]] -; AVX-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP21]], float undef -; AVX-NEXT: [[TMP25:%.*]] = extractelement <32 x float> [[TMP2]], i32 8 -; AVX-NEXT: [[TMP26:%.*]] = fcmp fast ogt float [[TMP24]], [[TMP25]] -; AVX-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], float [[TMP24]], float undef -; AVX-NEXT: [[TMP28:%.*]] = extractelement <32 x float> [[TMP2]], i32 9 -; AVX-NEXT: [[TMP29:%.*]] = fcmp fast ogt float [[TMP27]], [[TMP28]] -; AVX-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP27]], float undef -; AVX-NEXT: [[TMP31:%.*]] = extractelement <32 x float> [[TMP2]], i32 10 -; AVX-NEXT: [[TMP32:%.*]] = fcmp fast ogt float [[TMP30]], [[TMP31]] -; AVX-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], float [[TMP30]], float undef -; AVX-NEXT: [[TMP34:%.*]] = extractelement <32 x float> [[TMP2]], i32 11 -; AVX-NEXT: [[TMP35:%.*]] = fcmp fast ogt float [[TMP33]], [[TMP34]] -; AVX-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], float [[TMP33]], float undef -; AVX-NEXT: [[TMP37:%.*]] = extractelement <32 x float> [[TMP2]], i32 12 -; AVX-NEXT: [[TMP38:%.*]] = fcmp fast ogt float [[TMP36]], [[TMP37]] -; AVX-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], float [[TMP36]], float undef -; AVX-NEXT: [[TMP40:%.*]] = extractelement <32 x float> [[TMP2]], i32 13 -; AVX-NEXT: [[TMP41:%.*]] = fcmp fast ogt float [[TMP39]], [[TMP40]] -; AVX-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], float [[TMP39]], float undef -; AVX-NEXT: [[TMP43:%.*]] = extractelement <32 x float> [[TMP2]], i32 14 -; AVX-NEXT: [[TMP44:%.*]] = fcmp fast ogt float [[TMP42]], [[TMP43]] -; AVX-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], float [[TMP42]], float undef -; AVX-NEXT: [[TMP46:%.*]] = extractelement <32 x float> [[TMP2]], i32 15 -; AVX-NEXT: [[TMP47:%.*]] = fcmp fast ogt float [[TMP45]], [[TMP46]] -; AVX-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], float [[TMP45]], float undef -; AVX-NEXT: [[TMP49:%.*]] = extractelement <32 x float> [[TMP2]], i32 16 -; AVX-NEXT: [[TMP50:%.*]] = fcmp fast ogt float [[TMP48]], [[TMP49]] -; AVX-NEXT: [[TMP51:%.*]] = select i1 [[TMP50]], float [[TMP48]], float undef -; AVX-NEXT: [[TMP52:%.*]] = extractelement <32 x float> [[TMP2]], i32 17 -; AVX-NEXT: [[TMP53:%.*]] = fcmp fast ogt float [[TMP51]], [[TMP52]] -; AVX-NEXT: [[TMP54:%.*]] = select i1 [[TMP53]], float [[TMP51]], float undef -; AVX-NEXT: [[TMP55:%.*]] = extractelement <32 x float> [[TMP2]], i32 18 -; AVX-NEXT: [[TMP56:%.*]] = fcmp fast ogt float [[TMP54]], [[TMP55]] -; AVX-NEXT: [[TMP57:%.*]] = select i1 [[TMP56]], float [[TMP54]], float undef -; AVX-NEXT: [[TMP58:%.*]] = extractelement <32 x float> [[TMP2]], i32 19 -; AVX-NEXT: [[TMP59:%.*]] = fcmp fast ogt float [[TMP57]], [[TMP58]] -; AVX-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], float [[TMP57]], float undef -; AVX-NEXT: [[TMP61:%.*]] = extractelement <32 x float> [[TMP2]], i32 20 -; AVX-NEXT: [[TMP62:%.*]] = fcmp fast ogt float [[TMP60]], [[TMP61]] -; AVX-NEXT: [[TMP63:%.*]] = select i1 [[TMP62]], float [[TMP60]], float undef -; AVX-NEXT: [[TMP64:%.*]] = extractelement <32 x float> [[TMP2]], i32 21 -; AVX-NEXT: [[TMP65:%.*]] = fcmp fast ogt float [[TMP63]], [[TMP64]] -; AVX-NEXT: [[TMP66:%.*]] = select i1 [[TMP65]], float [[TMP63]], float undef -; AVX-NEXT: [[TMP67:%.*]] = extractelement <32 x float> [[TMP2]], i32 22 -; AVX-NEXT: [[TMP68:%.*]] = fcmp fast ogt float [[TMP66]], [[TMP67]] -; AVX-NEXT: [[TMP69:%.*]] = select i1 [[TMP68]], float [[TMP66]], float undef -; AVX-NEXT: [[TMP70:%.*]] = extractelement <32 x float> [[TMP2]], i32 23 -; AVX-NEXT: [[TMP71:%.*]] = fcmp 
fast ogt float [[TMP69]], [[TMP70]] -; AVX-NEXT: [[TMP72:%.*]] = select i1 [[TMP71]], float [[TMP69]], float undef -; AVX-NEXT: [[TMP73:%.*]] = extractelement <32 x float> [[TMP2]], i32 24 -; AVX-NEXT: [[TMP74:%.*]] = fcmp fast ogt float [[TMP72]], [[TMP73]] -; AVX-NEXT: [[TMP75:%.*]] = select i1 [[TMP74]], float [[TMP72]], float undef -; AVX-NEXT: [[TMP76:%.*]] = extractelement <32 x float> [[TMP2]], i32 25 -; AVX-NEXT: [[TMP77:%.*]] = fcmp fast ogt float [[TMP75]], [[TMP76]] -; AVX-NEXT: [[TMP78:%.*]] = select i1 [[TMP77]], float [[TMP75]], float undef -; AVX-NEXT: [[TMP79:%.*]] = extractelement <32 x float> [[TMP2]], i32 26 -; AVX-NEXT: [[TMP80:%.*]] = fcmp fast ogt float [[TMP78]], [[TMP79]] -; AVX-NEXT: [[TMP81:%.*]] = select i1 [[TMP80]], float [[TMP78]], float undef -; AVX-NEXT: [[TMP82:%.*]] = extractelement <32 x float> [[TMP2]], i32 27 -; AVX-NEXT: [[TMP83:%.*]] = fcmp fast ogt float [[TMP81]], [[TMP82]] -; AVX-NEXT: [[TMP84:%.*]] = select i1 [[TMP83]], float [[TMP81]], float undef -; AVX-NEXT: [[TMP85:%.*]] = extractelement <32 x float> [[TMP2]], i32 28 -; AVX-NEXT: [[TMP86:%.*]] = fcmp fast ogt float [[TMP84]], [[TMP85]] -; AVX-NEXT: [[TMP87:%.*]] = select i1 [[TMP86]], float [[TMP84]], float undef -; AVX-NEXT: [[TMP88:%.*]] = extractelement <32 x float> [[TMP2]], i32 29 -; AVX-NEXT: [[TMP89:%.*]] = fcmp fast ogt float [[TMP87]], [[TMP88]] -; AVX-NEXT: [[TMP90:%.*]] = select i1 [[TMP89]], float [[TMP87]], float undef -; AVX-NEXT: [[TMP91:%.*]] = extractelement <32 x float> [[TMP2]], i32 30 -; AVX-NEXT: [[TMP92:%.*]] = fcmp fast ogt float [[TMP90]], [[TMP91]] -; AVX-NEXT: [[TMP93:%.*]] = select i1 [[TMP92]], float [[TMP90]], float undef -; AVX-NEXT: [[TMP94:%.*]] = extractelement <32 x float> [[TMP2]], i32 31 -; AVX-NEXT: [[TMP95:%.*]] = fcmp fast ogt float [[TMP93]], [[TMP94]] ; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP2]], <32 x float> undef, <32 x i32> ; AVX-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <32 x float> [[TMP2]], [[RDX_SHUF]] ; AVX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP]], <32 x float> [[TMP2]], <32 x float> [[RDX_SHUF]] @@ -1664,105 +755,11 @@ ; AVX-NEXT: [[RDX_SHUF10:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT9]], <32 x float> undef, <32 x i32> ; AVX-NEXT: [[RDX_MINMAX_CMP11:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT9]], [[RDX_SHUF10]] ; AVX-NEXT: [[RDX_MINMAX_SELECT12:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP11]], <32 x float> [[RDX_MINMAX_SELECT9]], <32 x float> [[RDX_SHUF10]] -; AVX-NEXT: [[TMP96:%.*]] = extractelement <32 x float> [[RDX_MINMAX_SELECT12]], i32 0 -; AVX-NEXT: [[TMP97:%.*]] = select i1 [[TMP95]], float [[TMP93]], float undef -; AVX-NEXT: ret float [[TMP96]] +; AVX-NEXT: [[TMP3:%.*]] = extractelement <32 x float> [[RDX_MINMAX_SELECT12]], i32 0 +; AVX-NEXT: ret float [[TMP3]] ; ; AVX2-LABEL: @maxf32( ; AVX2-NEXT: [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16 -; AVX2-NEXT: [[TMP3:%.*]] = extractelement <32 x float> [[TMP2]], i32 0 -; AVX2-NEXT: [[TMP4:%.*]] = extractelement <32 x float> [[TMP2]], i32 1 -; AVX2-NEXT: [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]] -; AVX2-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float undef, float undef -; AVX2-NEXT: [[TMP7:%.*]] = extractelement <32 x float> [[TMP2]], i32 2 -; AVX2-NEXT: [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]] -; AVX2-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float undef -; AVX2-NEXT: [[TMP10:%.*]] = extractelement <32 x float> 
[[TMP2]], i32 3 -; AVX2-NEXT: [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]] -; AVX2-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP9]], float undef -; AVX2-NEXT: [[TMP13:%.*]] = extractelement <32 x float> [[TMP2]], i32 4 -; AVX2-NEXT: [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]] -; AVX2-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float undef -; AVX2-NEXT: [[TMP16:%.*]] = extractelement <32 x float> [[TMP2]], i32 5 -; AVX2-NEXT: [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]] -; AVX2-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float undef -; AVX2-NEXT: [[TMP19:%.*]] = extractelement <32 x float> [[TMP2]], i32 6 -; AVX2-NEXT: [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]] -; AVX2-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float undef -; AVX2-NEXT: [[TMP22:%.*]] = extractelement <32 x float> [[TMP2]], i32 7 -; AVX2-NEXT: [[TMP23:%.*]] = fcmp fast ogt float [[TMP21]], [[TMP22]] -; AVX2-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP21]], float undef -; AVX2-NEXT: [[TMP25:%.*]] = extractelement <32 x float> [[TMP2]], i32 8 -; AVX2-NEXT: [[TMP26:%.*]] = fcmp fast ogt float [[TMP24]], [[TMP25]] -; AVX2-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], float [[TMP24]], float undef -; AVX2-NEXT: [[TMP28:%.*]] = extractelement <32 x float> [[TMP2]], i32 9 -; AVX2-NEXT: [[TMP29:%.*]] = fcmp fast ogt float [[TMP27]], [[TMP28]] -; AVX2-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP27]], float undef -; AVX2-NEXT: [[TMP31:%.*]] = extractelement <32 x float> [[TMP2]], i32 10 -; AVX2-NEXT: [[TMP32:%.*]] = fcmp fast ogt float [[TMP30]], [[TMP31]] -; AVX2-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], float [[TMP30]], float undef -; AVX2-NEXT: [[TMP34:%.*]] = extractelement <32 x float> [[TMP2]], i32 11 -; AVX2-NEXT: [[TMP35:%.*]] = fcmp fast ogt float [[TMP33]], [[TMP34]] -; AVX2-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], float [[TMP33]], float undef -; AVX2-NEXT: [[TMP37:%.*]] = extractelement <32 x float> [[TMP2]], i32 12 -; AVX2-NEXT: [[TMP38:%.*]] = fcmp fast ogt float [[TMP36]], [[TMP37]] -; AVX2-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], float [[TMP36]], float undef -; AVX2-NEXT: [[TMP40:%.*]] = extractelement <32 x float> [[TMP2]], i32 13 -; AVX2-NEXT: [[TMP41:%.*]] = fcmp fast ogt float [[TMP39]], [[TMP40]] -; AVX2-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], float [[TMP39]], float undef -; AVX2-NEXT: [[TMP43:%.*]] = extractelement <32 x float> [[TMP2]], i32 14 -; AVX2-NEXT: [[TMP44:%.*]] = fcmp fast ogt float [[TMP42]], [[TMP43]] -; AVX2-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], float [[TMP42]], float undef -; AVX2-NEXT: [[TMP46:%.*]] = extractelement <32 x float> [[TMP2]], i32 15 -; AVX2-NEXT: [[TMP47:%.*]] = fcmp fast ogt float [[TMP45]], [[TMP46]] -; AVX2-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], float [[TMP45]], float undef -; AVX2-NEXT: [[TMP49:%.*]] = extractelement <32 x float> [[TMP2]], i32 16 -; AVX2-NEXT: [[TMP50:%.*]] = fcmp fast ogt float [[TMP48]], [[TMP49]] -; AVX2-NEXT: [[TMP51:%.*]] = select i1 [[TMP50]], float [[TMP48]], float undef -; AVX2-NEXT: [[TMP52:%.*]] = extractelement <32 x float> [[TMP2]], i32 17 -; AVX2-NEXT: [[TMP53:%.*]] = fcmp fast ogt float [[TMP51]], [[TMP52]] -; AVX2-NEXT: [[TMP54:%.*]] = select i1 [[TMP53]], float [[TMP51]], float undef -; AVX2-NEXT: [[TMP55:%.*]] = extractelement <32 x float> [[TMP2]], i32 18 -; AVX2-NEXT: [[TMP56:%.*]] = fcmp fast ogt float [[TMP54]], [[TMP55]] -; AVX2-NEXT: [[TMP57:%.*]] = select i1 [[TMP56]], float [[TMP54]], float undef -; 
AVX2-NEXT: [[TMP58:%.*]] = extractelement <32 x float> [[TMP2]], i32 19 -; AVX2-NEXT: [[TMP59:%.*]] = fcmp fast ogt float [[TMP57]], [[TMP58]] -; AVX2-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], float [[TMP57]], float undef -; AVX2-NEXT: [[TMP61:%.*]] = extractelement <32 x float> [[TMP2]], i32 20 -; AVX2-NEXT: [[TMP62:%.*]] = fcmp fast ogt float [[TMP60]], [[TMP61]] -; AVX2-NEXT: [[TMP63:%.*]] = select i1 [[TMP62]], float [[TMP60]], float undef -; AVX2-NEXT: [[TMP64:%.*]] = extractelement <32 x float> [[TMP2]], i32 21 -; AVX2-NEXT: [[TMP65:%.*]] = fcmp fast ogt float [[TMP63]], [[TMP64]] -; AVX2-NEXT: [[TMP66:%.*]] = select i1 [[TMP65]], float [[TMP63]], float undef -; AVX2-NEXT: [[TMP67:%.*]] = extractelement <32 x float> [[TMP2]], i32 22 -; AVX2-NEXT: [[TMP68:%.*]] = fcmp fast ogt float [[TMP66]], [[TMP67]] -; AVX2-NEXT: [[TMP69:%.*]] = select i1 [[TMP68]], float [[TMP66]], float undef -; AVX2-NEXT: [[TMP70:%.*]] = extractelement <32 x float> [[TMP2]], i32 23 -; AVX2-NEXT: [[TMP71:%.*]] = fcmp fast ogt float [[TMP69]], [[TMP70]] -; AVX2-NEXT: [[TMP72:%.*]] = select i1 [[TMP71]], float [[TMP69]], float undef -; AVX2-NEXT: [[TMP73:%.*]] = extractelement <32 x float> [[TMP2]], i32 24 -; AVX2-NEXT: [[TMP74:%.*]] = fcmp fast ogt float [[TMP72]], [[TMP73]] -; AVX2-NEXT: [[TMP75:%.*]] = select i1 [[TMP74]], float [[TMP72]], float undef -; AVX2-NEXT: [[TMP76:%.*]] = extractelement <32 x float> [[TMP2]], i32 25 -; AVX2-NEXT: [[TMP77:%.*]] = fcmp fast ogt float [[TMP75]], [[TMP76]] -; AVX2-NEXT: [[TMP78:%.*]] = select i1 [[TMP77]], float [[TMP75]], float undef -; AVX2-NEXT: [[TMP79:%.*]] = extractelement <32 x float> [[TMP2]], i32 26 -; AVX2-NEXT: [[TMP80:%.*]] = fcmp fast ogt float [[TMP78]], [[TMP79]] -; AVX2-NEXT: [[TMP81:%.*]] = select i1 [[TMP80]], float [[TMP78]], float undef -; AVX2-NEXT: [[TMP82:%.*]] = extractelement <32 x float> [[TMP2]], i32 27 -; AVX2-NEXT: [[TMP83:%.*]] = fcmp fast ogt float [[TMP81]], [[TMP82]] -; AVX2-NEXT: [[TMP84:%.*]] = select i1 [[TMP83]], float [[TMP81]], float undef -; AVX2-NEXT: [[TMP85:%.*]] = extractelement <32 x float> [[TMP2]], i32 28 -; AVX2-NEXT: [[TMP86:%.*]] = fcmp fast ogt float [[TMP84]], [[TMP85]] -; AVX2-NEXT: [[TMP87:%.*]] = select i1 [[TMP86]], float [[TMP84]], float undef -; AVX2-NEXT: [[TMP88:%.*]] = extractelement <32 x float> [[TMP2]], i32 29 -; AVX2-NEXT: [[TMP89:%.*]] = fcmp fast ogt float [[TMP87]], [[TMP88]] -; AVX2-NEXT: [[TMP90:%.*]] = select i1 [[TMP89]], float [[TMP87]], float undef -; AVX2-NEXT: [[TMP91:%.*]] = extractelement <32 x float> [[TMP2]], i32 30 -; AVX2-NEXT: [[TMP92:%.*]] = fcmp fast ogt float [[TMP90]], [[TMP91]] -; AVX2-NEXT: [[TMP93:%.*]] = select i1 [[TMP92]], float [[TMP90]], float undef -; AVX2-NEXT: [[TMP94:%.*]] = extractelement <32 x float> [[TMP2]], i32 31 -; AVX2-NEXT: [[TMP95:%.*]] = fcmp fast ogt float [[TMP93]], [[TMP94]] ; AVX2-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP2]], <32 x float> undef, <32 x i32> ; AVX2-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <32 x float> [[TMP2]], [[RDX_SHUF]] ; AVX2-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP]], <32 x float> [[TMP2]], <32 x float> [[RDX_SHUF]] @@ -1778,105 +775,11 @@ ; AVX2-NEXT: [[RDX_SHUF10:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT9]], <32 x float> undef, <32 x i32> ; AVX2-NEXT: [[RDX_MINMAX_CMP11:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT9]], [[RDX_SHUF10]] ; AVX2-NEXT: [[RDX_MINMAX_SELECT12:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP11]], <32 x float> [[RDX_MINMAX_SELECT9]], <32 x float> 
[[RDX_SHUF10]] -; AVX2-NEXT: [[TMP96:%.*]] = extractelement <32 x float> [[RDX_MINMAX_SELECT12]], i32 0 -; AVX2-NEXT: [[TMP97:%.*]] = select i1 [[TMP95]], float [[TMP93]], float undef -; AVX2-NEXT: ret float [[TMP96]] +; AVX2-NEXT: [[TMP3:%.*]] = extractelement <32 x float> [[RDX_MINMAX_SELECT12]], i32 0 +; AVX2-NEXT: ret float [[TMP3]] ; ; SKX-LABEL: @maxf32( ; SKX-NEXT: [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16 -; SKX-NEXT: [[TMP3:%.*]] = extractelement <32 x float> [[TMP2]], i32 0 -; SKX-NEXT: [[TMP4:%.*]] = extractelement <32 x float> [[TMP2]], i32 1 -; SKX-NEXT: [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]] -; SKX-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float undef, float undef -; SKX-NEXT: [[TMP7:%.*]] = extractelement <32 x float> [[TMP2]], i32 2 -; SKX-NEXT: [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]] -; SKX-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float undef -; SKX-NEXT: [[TMP10:%.*]] = extractelement <32 x float> [[TMP2]], i32 3 -; SKX-NEXT: [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]] -; SKX-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP9]], float undef -; SKX-NEXT: [[TMP13:%.*]] = extractelement <32 x float> [[TMP2]], i32 4 -; SKX-NEXT: [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]] -; SKX-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float undef -; SKX-NEXT: [[TMP16:%.*]] = extractelement <32 x float> [[TMP2]], i32 5 -; SKX-NEXT: [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]] -; SKX-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float undef -; SKX-NEXT: [[TMP19:%.*]] = extractelement <32 x float> [[TMP2]], i32 6 -; SKX-NEXT: [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]] -; SKX-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float undef -; SKX-NEXT: [[TMP22:%.*]] = extractelement <32 x float> [[TMP2]], i32 7 -; SKX-NEXT: [[TMP23:%.*]] = fcmp fast ogt float [[TMP21]], [[TMP22]] -; SKX-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP21]], float undef -; SKX-NEXT: [[TMP25:%.*]] = extractelement <32 x float> [[TMP2]], i32 8 -; SKX-NEXT: [[TMP26:%.*]] = fcmp fast ogt float [[TMP24]], [[TMP25]] -; SKX-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], float [[TMP24]], float undef -; SKX-NEXT: [[TMP28:%.*]] = extractelement <32 x float> [[TMP2]], i32 9 -; SKX-NEXT: [[TMP29:%.*]] = fcmp fast ogt float [[TMP27]], [[TMP28]] -; SKX-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP27]], float undef -; SKX-NEXT: [[TMP31:%.*]] = extractelement <32 x float> [[TMP2]], i32 10 -; SKX-NEXT: [[TMP32:%.*]] = fcmp fast ogt float [[TMP30]], [[TMP31]] -; SKX-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], float [[TMP30]], float undef -; SKX-NEXT: [[TMP34:%.*]] = extractelement <32 x float> [[TMP2]], i32 11 -; SKX-NEXT: [[TMP35:%.*]] = fcmp fast ogt float [[TMP33]], [[TMP34]] -; SKX-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], float [[TMP33]], float undef -; SKX-NEXT: [[TMP37:%.*]] = extractelement <32 x float> [[TMP2]], i32 12 -; SKX-NEXT: [[TMP38:%.*]] = fcmp fast ogt float [[TMP36]], [[TMP37]] -; SKX-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], float [[TMP36]], float undef -; SKX-NEXT: [[TMP40:%.*]] = extractelement <32 x float> [[TMP2]], i32 13 -; SKX-NEXT: [[TMP41:%.*]] = fcmp fast ogt float [[TMP39]], [[TMP40]] -; SKX-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], float [[TMP39]], float undef -; SKX-NEXT: [[TMP43:%.*]] = extractelement <32 x float> [[TMP2]], i32 14 -; SKX-NEXT: [[TMP44:%.*]] = fcmp fast ogt float 
[[TMP42]], [[TMP43]] -; SKX-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], float [[TMP42]], float undef -; SKX-NEXT: [[TMP46:%.*]] = extractelement <32 x float> [[TMP2]], i32 15 -; SKX-NEXT: [[TMP47:%.*]] = fcmp fast ogt float [[TMP45]], [[TMP46]] -; SKX-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], float [[TMP45]], float undef -; SKX-NEXT: [[TMP49:%.*]] = extractelement <32 x float> [[TMP2]], i32 16 -; SKX-NEXT: [[TMP50:%.*]] = fcmp fast ogt float [[TMP48]], [[TMP49]] -; SKX-NEXT: [[TMP51:%.*]] = select i1 [[TMP50]], float [[TMP48]], float undef -; SKX-NEXT: [[TMP52:%.*]] = extractelement <32 x float> [[TMP2]], i32 17 -; SKX-NEXT: [[TMP53:%.*]] = fcmp fast ogt float [[TMP51]], [[TMP52]] -; SKX-NEXT: [[TMP54:%.*]] = select i1 [[TMP53]], float [[TMP51]], float undef -; SKX-NEXT: [[TMP55:%.*]] = extractelement <32 x float> [[TMP2]], i32 18 -; SKX-NEXT: [[TMP56:%.*]] = fcmp fast ogt float [[TMP54]], [[TMP55]] -; SKX-NEXT: [[TMP57:%.*]] = select i1 [[TMP56]], float [[TMP54]], float undef -; SKX-NEXT: [[TMP58:%.*]] = extractelement <32 x float> [[TMP2]], i32 19 -; SKX-NEXT: [[TMP59:%.*]] = fcmp fast ogt float [[TMP57]], [[TMP58]] -; SKX-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], float [[TMP57]], float undef -; SKX-NEXT: [[TMP61:%.*]] = extractelement <32 x float> [[TMP2]], i32 20 -; SKX-NEXT: [[TMP62:%.*]] = fcmp fast ogt float [[TMP60]], [[TMP61]] -; SKX-NEXT: [[TMP63:%.*]] = select i1 [[TMP62]], float [[TMP60]], float undef -; SKX-NEXT: [[TMP64:%.*]] = extractelement <32 x float> [[TMP2]], i32 21 -; SKX-NEXT: [[TMP65:%.*]] = fcmp fast ogt float [[TMP63]], [[TMP64]] -; SKX-NEXT: [[TMP66:%.*]] = select i1 [[TMP65]], float [[TMP63]], float undef -; SKX-NEXT: [[TMP67:%.*]] = extractelement <32 x float> [[TMP2]], i32 22 -; SKX-NEXT: [[TMP68:%.*]] = fcmp fast ogt float [[TMP66]], [[TMP67]] -; SKX-NEXT: [[TMP69:%.*]] = select i1 [[TMP68]], float [[TMP66]], float undef -; SKX-NEXT: [[TMP70:%.*]] = extractelement <32 x float> [[TMP2]], i32 23 -; SKX-NEXT: [[TMP71:%.*]] = fcmp fast ogt float [[TMP69]], [[TMP70]] -; SKX-NEXT: [[TMP72:%.*]] = select i1 [[TMP71]], float [[TMP69]], float undef -; SKX-NEXT: [[TMP73:%.*]] = extractelement <32 x float> [[TMP2]], i32 24 -; SKX-NEXT: [[TMP74:%.*]] = fcmp fast ogt float [[TMP72]], [[TMP73]] -; SKX-NEXT: [[TMP75:%.*]] = select i1 [[TMP74]], float [[TMP72]], float undef -; SKX-NEXT: [[TMP76:%.*]] = extractelement <32 x float> [[TMP2]], i32 25 -; SKX-NEXT: [[TMP77:%.*]] = fcmp fast ogt float [[TMP75]], [[TMP76]] -; SKX-NEXT: [[TMP78:%.*]] = select i1 [[TMP77]], float [[TMP75]], float undef -; SKX-NEXT: [[TMP79:%.*]] = extractelement <32 x float> [[TMP2]], i32 26 -; SKX-NEXT: [[TMP80:%.*]] = fcmp fast ogt float [[TMP78]], [[TMP79]] -; SKX-NEXT: [[TMP81:%.*]] = select i1 [[TMP80]], float [[TMP78]], float undef -; SKX-NEXT: [[TMP82:%.*]] = extractelement <32 x float> [[TMP2]], i32 27 -; SKX-NEXT: [[TMP83:%.*]] = fcmp fast ogt float [[TMP81]], [[TMP82]] -; SKX-NEXT: [[TMP84:%.*]] = select i1 [[TMP83]], float [[TMP81]], float undef -; SKX-NEXT: [[TMP85:%.*]] = extractelement <32 x float> [[TMP2]], i32 28 -; SKX-NEXT: [[TMP86:%.*]] = fcmp fast ogt float [[TMP84]], [[TMP85]] -; SKX-NEXT: [[TMP87:%.*]] = select i1 [[TMP86]], float [[TMP84]], float undef -; SKX-NEXT: [[TMP88:%.*]] = extractelement <32 x float> [[TMP2]], i32 29 -; SKX-NEXT: [[TMP89:%.*]] = fcmp fast ogt float [[TMP87]], [[TMP88]] -; SKX-NEXT: [[TMP90:%.*]] = select i1 [[TMP89]], float [[TMP87]], float undef -; SKX-NEXT: [[TMP91:%.*]] = extractelement <32 x float> [[TMP2]], i32 30 -; SKX-NEXT: [[TMP92:%.*]] = fcmp 
fast ogt float [[TMP90]], [[TMP91]] -; SKX-NEXT: [[TMP93:%.*]] = select i1 [[TMP92]], float [[TMP90]], float undef -; SKX-NEXT: [[TMP94:%.*]] = extractelement <32 x float> [[TMP2]], i32 31 -; SKX-NEXT: [[TMP95:%.*]] = fcmp fast ogt float [[TMP93]], [[TMP94]] ; SKX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP2]], <32 x float> undef, <32 x i32> ; SKX-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <32 x float> [[TMP2]], [[RDX_SHUF]] ; SKX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP]], <32 x float> [[TMP2]], <32 x float> [[RDX_SHUF]] @@ -1892,9 +795,8 @@ ; SKX-NEXT: [[RDX_SHUF10:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT9]], <32 x float> undef, <32 x i32> ; SKX-NEXT: [[RDX_MINMAX_CMP11:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT9]], [[RDX_SHUF10]] ; SKX-NEXT: [[RDX_MINMAX_SELECT12:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP11]], <32 x float> [[RDX_MINMAX_SELECT9]], <32 x float> [[RDX_SHUF10]] -; SKX-NEXT: [[TMP96:%.*]] = extractelement <32 x float> [[RDX_MINMAX_SELECT12]], i32 0 -; SKX-NEXT: [[TMP97:%.*]] = select i1 [[TMP95]], float [[TMP93]], float undef -; SKX-NEXT: ret float [[TMP96]] +; SKX-NEXT: [[TMP3:%.*]] = extractelement <32 x float> [[RDX_MINMAX_SELECT12]], i32 0 +; SKX-NEXT: ret float [[TMP3]] ; %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 @@ -2026,123 +928,84 @@ ; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 ; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 ; AVX-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]] -; AVX-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 undef, i32 undef -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 -; AVX-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]] -; AVX-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 undef -; AVX-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 -; AVX-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -; AVX-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef -; AVX-NEXT: [[TMP13:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 -; AVX-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -; AVX-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 [[TMP13]] -; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 -; AVX-NEXT: [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]] -; AVX-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 [[TMP16]] -; AVX-NEXT: [[TMP19:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]] +; AVX-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 +; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 +; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 ; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> ; AVX-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP2]], [[RDX_SHUF]] ; AVX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP2]], <4 x i32> [[RDX_SHUF]] ; 
AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> ; AVX-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] ; AVX-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]] -; AVX-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 -; AVX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP21]], [[TMP13]] -; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP21]], i32 [[TMP13]] -; AVX-NEXT: [[TMP24:%.*]] = icmp sgt i32 [[TMP23]], [[TMP16]] -; AVX-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[TMP23]], i32 [[TMP16]] -; AVX-NEXT: [[TMP26:%.*]] = icmp sgt i32 [[TMP25]], [[TMP19]] -; AVX-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP25]], i32 [[TMP19]] -; AVX-NEXT: [[TMP28:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 [[TMP19]] -; AVX-NEXT: [[TMP29:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; AVX-NEXT: [[TMP30:%.*]] = icmp sgt i32 [[TMP27]], [[TMP29]] -; AVX-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], i32 [[TMP27]], i32 [[TMP29]] -; AVX-NEXT: [[TMP32:%.*]] = select i1 [[TMP5]], i32 3, i32 4 -; AVX-NEXT: store i32 [[TMP32]], i32* @var, align 8 -; AVX-NEXT: ret i32 [[TMP31]] +; AVX-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 +; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP6]] +; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP6]] +; AVX-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP7]] +; AVX-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP7]] +; AVX-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP13]], [[TMP8]] +; AVX-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP13]], i32 [[TMP8]] +; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 +; AVX-NEXT: [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]] +; AVX-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 [[TMP16]] +; AVX-NEXT: [[TMP19:%.*]] = select i1 [[TMP5]], i32 3, i32 4 +; AVX-NEXT: store i32 [[TMP19]], i32* @var, align 8 +; AVX-NEXT: ret i32 [[TMP18]] ; ; AVX2-LABEL: @maxi8_mutiple_uses( ; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr to <4 x i32>*), align 16 ; AVX2-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 ; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 ; AVX2-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]] -; AVX2-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 undef, i32 undef -; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 -; AVX2-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]] -; AVX2-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 undef -; AVX2-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 -; AVX2-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -; AVX2-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef -; AVX2-NEXT: [[TMP13:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 -; AVX2-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -; AVX2-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 [[TMP13]] -; AVX2-NEXT: [[TMP16:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 -; AVX2-NEXT: [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]] -; AVX2-NEXT: [[TMP18:%.*]] = select 
i1 [[TMP17]], i32 [[TMP15]], i32 [[TMP16]] -; AVX2-NEXT: [[TMP19:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX2-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]] +; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 +; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 +; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 ; AVX2-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> ; AVX2-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP2]], [[RDX_SHUF]] ; AVX2-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP2]], <4 x i32> [[RDX_SHUF]] ; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> ; AVX2-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] ; AVX2-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]] -; AVX2-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 -; AVX2-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP21]], [[TMP13]] -; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP21]], i32 [[TMP13]] -; AVX2-NEXT: [[TMP24:%.*]] = icmp sgt i32 [[TMP23]], [[TMP16]] -; AVX2-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[TMP23]], i32 [[TMP16]] -; AVX2-NEXT: [[TMP26:%.*]] = icmp sgt i32 [[TMP25]], [[TMP19]] -; AVX2-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP25]], i32 [[TMP19]] -; AVX2-NEXT: [[TMP28:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 [[TMP19]] -; AVX2-NEXT: [[TMP29:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; AVX2-NEXT: [[TMP30:%.*]] = icmp sgt i32 [[TMP27]], [[TMP29]] -; AVX2-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], i32 [[TMP27]], i32 [[TMP29]] -; AVX2-NEXT: [[TMP32:%.*]] = select i1 [[TMP5]], i32 3, i32 4 -; AVX2-NEXT: store i32 [[TMP32]], i32* @var, align 8 -; AVX2-NEXT: ret i32 [[TMP31]] +; AVX2-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 +; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP6]] +; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP6]] +; AVX2-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP7]] +; AVX2-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP7]] +; AVX2-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP13]], [[TMP8]] +; AVX2-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP13]], i32 [[TMP8]] +; AVX2-NEXT: [[TMP16:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 +; AVX2-NEXT: [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]] +; AVX2-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 [[TMP16]] +; AVX2-NEXT: [[TMP19:%.*]] = select i1 [[TMP5]], i32 3, i32 4 +; AVX2-NEXT: store i32 [[TMP19]], i32* @var, align 8 +; AVX2-NEXT: ret i32 [[TMP18]] ; ; SKX-LABEL: @maxi8_mutiple_uses( ; SKX-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr to <4 x i32>*), align 16 ; SKX-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 ; SKX-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 ; SKX-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]] -; SKX-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 undef, 
i32 undef -; SKX-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 -; SKX-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]] -; SKX-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 undef -; SKX-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 -; SKX-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -; SKX-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef -; SKX-NEXT: [[TMP13:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 -; SKX-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -; SKX-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 [[TMP13]] -; SKX-NEXT: [[TMP16:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 -; SKX-NEXT: [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]] -; SKX-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 [[TMP16]] -; SKX-NEXT: [[TMP19:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; SKX-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]] +; SKX-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 +; SKX-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 +; SKX-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 ; SKX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> ; SKX-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP2]], [[RDX_SHUF]] ; SKX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP2]], <4 x i32> [[RDX_SHUF]] ; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> ; SKX-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] ; SKX-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]] -; SKX-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 -; SKX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP21]], [[TMP13]] -; SKX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP21]], i32 [[TMP13]] -; SKX-NEXT: [[TMP24:%.*]] = icmp sgt i32 [[TMP23]], [[TMP16]] -; SKX-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[TMP23]], i32 [[TMP16]] -; SKX-NEXT: [[TMP26:%.*]] = icmp sgt i32 [[TMP25]], [[TMP19]] -; SKX-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP25]], i32 [[TMP19]] -; SKX-NEXT: [[TMP28:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 [[TMP19]] -; SKX-NEXT: [[TMP29:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; SKX-NEXT: [[TMP30:%.*]] = icmp sgt i32 [[TMP27]], [[TMP29]] -; SKX-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], i32 [[TMP27]], i32 [[TMP29]] -; SKX-NEXT: [[TMP32:%.*]] = select i1 [[TMP5]], i32 3, i32 4 -; SKX-NEXT: store i32 [[TMP32]], i32* @var, align 8 -; SKX-NEXT: ret i32 [[TMP31]] +; SKX-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 +; SKX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP6]] +; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP6]] +; SKX-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP7]] +; SKX-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP7]] +; SKX-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP13]], [[TMP8]] +; SKX-NEXT: 
[[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP13]], i32 [[TMP8]] +; SKX-NEXT: [[TMP16:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 +; SKX-NEXT: [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]] +; SKX-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 [[TMP16]] +; SKX-NEXT: [[TMP19:%.*]] = select i1 [[TMP5]], i32 3, i32 4 +; SKX-NEXT: store i32 [[TMP19]], i32* @var, align 8 +; SKX-NEXT: ret i32 [[TMP18]] ; %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 @@ -2207,37 +1070,21 @@ ; AVX: pp: ; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] ; AVX-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 -; AVX-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP5]], [[TMP7]] -; AVX-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP5]], i32 undef -; AVX-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP6]], i32 1 -; AVX-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -; AVX-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef -; AVX-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP6]], i32 2 -; AVX-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -; AVX-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef -; AVX-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP6]], i32 3 -; AVX-NEXT: [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]] -; AVX-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef -; AVX-NEXT: [[TMP19:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]] -; AVX-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 [[TMP19]] -; AVX-NEXT: [[TMP22:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; AVX-NEXT: [[TMP23:%.*]] = icmp sgt i32 [[TMP21]], [[TMP22]] +; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 +; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 ; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> ; AVX-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP6]], [[RDX_SHUF]] ; AVX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP6]], <4 x i32> [[RDX_SHUF]] ; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> ; AVX-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] ; AVX-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]] -; AVX-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 -; AVX-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP24]], [[TMP19]] -; AVX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP24]], i32 [[TMP19]] -; AVX-NEXT: [[TMP27:%.*]] = icmp sgt i32 [[TMP26]], [[TMP22]] -; AVX-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP26]], i32 [[TMP22]] -; AVX-NEXT: [[TMP29:%.*]] = icmp sgt i32 [[TMP28]], [[TMP5]] -; AVX-NEXT: [[OP_EXTRA:%.*]] = select i1 
[[TMP29]], i32 [[TMP28]], i32 [[TMP5]] -; AVX-NEXT: [[TMP30:%.*]] = select i1 [[TMP23]], i32 [[TMP21]], i32 [[TMP22]] +; AVX-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 +; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]] +; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]] +; AVX-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]] +; AVX-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP8]] +; AVX-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP13]], [[TMP5]] +; AVX-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP14]], i32 [[TMP13]], i32 [[TMP5]] ; AVX-NEXT: ret i32 [[OP_EXTRA]] ; ; AVX2-LABEL: @maxi8_wrong_parent( @@ -2248,37 +1095,21 @@ ; AVX2: pp: ; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] ; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 -; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 -; AVX2-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP5]], [[TMP7]] -; AVX2-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP5]], i32 undef -; AVX2-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP6]], i32 1 -; AVX2-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -; AVX2-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef -; AVX2-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP6]], i32 2 -; AVX2-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -; AVX2-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef -; AVX2-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP6]], i32 3 -; AVX2-NEXT: [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]] -; AVX2-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef -; AVX2-NEXT: [[TMP19:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX2-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]] -; AVX2-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 [[TMP19]] -; AVX2-NEXT: [[TMP22:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; AVX2-NEXT: [[TMP23:%.*]] = icmp sgt i32 [[TMP21]], [[TMP22]] +; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 +; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 ; AVX2-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> ; AVX2-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP6]], [[RDX_SHUF]] ; AVX2-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP6]], <4 x i32> [[RDX_SHUF]] ; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> ; AVX2-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] ; AVX2-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]] -; AVX2-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 -; AVX2-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP24]], [[TMP19]] -; AVX2-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP24]], i32 [[TMP19]] -; AVX2-NEXT: [[TMP27:%.*]] = icmp sgt i32 [[TMP26]], [[TMP22]] -; AVX2-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP26]], i32 [[TMP22]] -; AVX2-NEXT: [[TMP29:%.*]] = icmp sgt i32 [[TMP28]], 
[[TMP5]] -; AVX2-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP29]], i32 [[TMP28]], i32 [[TMP5]] -; AVX2-NEXT: [[TMP30:%.*]] = select i1 [[TMP23]], i32 [[TMP21]], i32 [[TMP22]] +; AVX2-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 +; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]] +; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]] +; AVX2-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]] +; AVX2-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP8]] +; AVX2-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP13]], [[TMP5]] +; AVX2-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP14]], i32 [[TMP13]], i32 [[TMP5]] ; AVX2-NEXT: ret i32 [[OP_EXTRA]] ; ; SKX-LABEL: @maxi8_wrong_parent( @@ -2289,45 +1120,29 @@ ; SKX-NEXT: br label [[PP:%.*]] ; SKX: pp: ; SKX-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 -; SKX-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 -; SKX-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; SKX-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 +; SKX-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 +; SKX-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 ; SKX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> ; SKX-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP6]], [[RDX_SHUF]] ; SKX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP6]], <4 x i32> [[RDX_SHUF]] ; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> ; SKX-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] ; SKX-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]] -; SKX-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 -; SKX-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], [[TMP8]] -; SKX-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 [[TMP8]] -; SKX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP12]], [[TMP9]] -; SKX-NEXT: [[TMP14:%.*]] = insertelement <2 x i1> undef, i1 [[TMP13]], i32 0 -; SKX-NEXT: [[TMP15:%.*]] = insertelement <2 x i1> [[TMP14]], i1 [[TMP5]], i32 1 -; SKX-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> undef, i32 [[TMP12]], i32 0 -; SKX-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP16]], i32 [[TMP3]], i32 1 -; SKX-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> undef, i32 [[TMP9]], i32 0 -; SKX-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP18]], i32 [[TMP4]], i32 1 -; SKX-NEXT: [[TMP20:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[TMP17]], <2 x i32> [[TMP19]] -; SKX-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP20]], i32 1 -; SKX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP21]], [[TMP7]] -; SKX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP21]], i32 undef -; SKX-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP6]], i32 1 -; SKX-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]] -; SKX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 undef -; SKX-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP6]], i32 2 -; SKX-NEXT: [[TMP28:%.*]] = icmp sgt i32 
[[TMP26]], [[TMP27]]
-; SKX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 undef
-; SKX-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP6]], i32 3
-; SKX-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]]
-; SKX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 undef
-; SKX-NEXT: [[TMP33:%.*]] = icmp sgt i32 [[TMP32]], [[TMP8]]
-; SKX-NEXT: [[TMP34:%.*]] = select i1 [[TMP33]], i32 [[TMP32]], i32 [[TMP8]]
-; SKX-NEXT: [[TMP35:%.*]] = icmp sgt i32 [[TMP34]], [[TMP9]]
-; SKX-NEXT: [[TMP36:%.*]] = extractelement <2 x i32> [[TMP20]], i32 0
-; SKX-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP36]], [[TMP21]]
-; SKX-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP37]], i32 [[TMP36]], i32 [[TMP21]]
-; SKX-NEXT: [[TMP38:%.*]] = select i1 [[TMP35]], i32 [[TMP34]], i32 [[TMP9]]
+; SKX-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
+; SKX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
+; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]]
+; SKX-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]]
+; SKX-NEXT: [[TMP13:%.*]] = insertelement <2 x i1> undef, i1 [[TMP12]], i32 0
+; SKX-NEXT: [[TMP14:%.*]] = insertelement <2 x i1> [[TMP13]], i1 [[TMP5]], i32 1
+; SKX-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> undef, i32 [[TMP11]], i32 0
+; SKX-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP3]], i32 1
+; SKX-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> undef, i32 [[TMP8]], i32 0
+; SKX-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP4]], i32 1
+; SKX-NEXT: [[TMP19:%.*]] = select <2 x i1> [[TMP14]], <2 x i32> [[TMP16]], <2 x i32> [[TMP18]]
+; SKX-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[TMP19]], i32 1
+; SKX-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP19]], i32 0
+; SKX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP21]], [[TMP20]]
+; SKX-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP22]], i32 [[TMP21]], i32 [[TMP20]]
 ; SKX-NEXT: ret i32 [[OP_EXTRA]]
 ;
 %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
Index: test/Transforms/SLPVectorizer/X86/reduction_loads.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/reduction_loads.ll
+++ test/Transforms/SLPVectorizer/X86/reduction_loads.ll
@@ -18,13 +18,6 @@
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i32> , [[TMP1]]
-; CHECK-NEXT: [[ADD:%.*]] = add i32 undef, [[SUM]]
-; CHECK-NEXT: [[ADD_1:%.*]] = add i32 undef, [[ADD]]
-; CHECK-NEXT: [[ADD_2:%.*]] = add i32 undef, [[ADD_1]]
-; CHECK-NEXT: [[ADD_3:%.*]] = add i32 undef, [[ADD_2]]
-; CHECK-NEXT: [[ADD_4:%.*]] = add i32 undef, [[ADD_3]]
-; CHECK-NEXT: [[ADD_5:%.*]] = add i32 undef, [[ADD_4]]
-; CHECK-NEXT: [[ADD_6:%.*]] = add i32 undef, [[ADD_5]]
 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32>
 ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP2]], [[RDX_SHUF]]
 ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32>
@@ -33,7 +26,6 @@
 ; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
 ; CHECK-NEXT: [[BIN_EXTRA]] = add i32 [[TMP3]], [[SUM]]
-; CHECK-NEXT: [[ADD_7:%.*]] = add i32 undef, [[ADD_6]]
 ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK: for.end:
 ; CHECK-NEXT: ret i32 [[BIN_EXTRA]]