Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -713,6 +713,7 @@ /// ((v0+v2), (v1+v3), undef, undef) int getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) const; + int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm) const; /// \returns The cost of Intrinsic instructions. Analyses the real arguments. /// Three cases are handled: 1. scalar instruction 2. vector instruction @@ -978,6 +979,8 @@ unsigned AddressSpace) = 0; virtual int getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) = 0; + virtual int getMinMaxReductionCost(Type *Ty, Type *CondTy, + bool IsPairwiseForm) = 0; virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys, FastMathFlags FMF, unsigned ScalarizationCostPassed) = 0; @@ -1286,6 +1289,10 @@ bool IsPairwiseForm) override { return Impl.getArithmeticReductionCost(Opcode, Ty, IsPairwiseForm); } + int getMinMaxReductionCost(Type *Ty, Type *CondTy, + bool IsPairwiseForm) override { + return Impl.getMinMaxReductionCost(Ty, CondTy, IsPairwiseForm); + } int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys, FastMathFlags FMF, unsigned ScalarizationCostPassed) override { return Impl.getIntrinsicInstrCost(ID, RetTy, Tys, FMF, Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -430,6 +430,8 @@ unsigned getArithmeticReductionCost(unsigned, Type *, bool) { return 1; } + unsigned getMinMaxReductionCost(Type *, Type *, bool) { return 1; } + unsigned getCostOfKeepingLiveOverCall(ArrayRef Tys) { return 0; } bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) { Index: include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- include/llvm/CodeGen/BasicTTIImpl.h +++ include/llvm/CodeGen/BasicTTIImpl.h @@ -1170,6 +1170,133 @@ return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true); } + /// Try to calculate arithmetic and shuffle op costs for reduction operations. + /// Try to calculate arithmetic and shuffle op costs for reduction operations. + /// We're assuming that reduction operation are performing the following way: + /// 1. Non-pairwise reduction + /// %val1 = shufflevector %val, %undef, + /// + /// \----------------v-------------/ \----------v------------/ + /// n/2 elements n/2 elements + /// %red1 = op %val, val1 + /// After this operation we have a vector %red1 where only the first n/2 + /// elements are meaningful, the second n/2 elements are undefined and can be + /// dropped. All other operations are actually working with the vector of + /// length n/2, not n, though the real vector length is still n. + /// %val2 = shufflevector %red1, %undef, + /// + /// \----------------v-------------/ \----------v------------/ + /// n/4 elements 3*n/4 elements + /// %red2 = op %red1, val2 - working with the vector of + /// length n/2, the resulting vector has length n/4 etc. + /// 2. Pairwise reduction: + /// Everything is the same except for an additional shuffle operation which + /// is used to produce operands for pairwise kind of reductions. 
+  /// %val1 = shufflevector %val, %undef, + /// + /// \-------------v----------/ \----------v------------/ + /// n/2 elements n/2 elements + /// %val2 = shufflevector %val, %undef, + /// + /// \-------------v----------/ \----------v------------/ + /// n/2 elements n/2 elements + /// %red1 = op %val1, val2 + /// Again, the operation is performed on vector, but the resulting + /// vector %red1 is vector. + /// + /// The cost model should take into account that the actual length of the + /// vector is reduced on each iteration. + unsigned getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwise) { + assert(Ty->isVectorTy() && "Expect a vector type"); + Type *ScalarTy = Ty->getVectorElementType(); + Type *ScalarCondTy = CondTy->getVectorElementType(); + unsigned NumVecElts = Ty->getVectorNumElements(); + unsigned NumReduxLevels = Log2_32(NumVecElts); + unsigned CmpOpcode; + if (Ty->getVectorElementType()->isFloatingPointTy()) + CmpOpcode = Instruction::FCmp; + else { + assert(Ty->isIntOrIntVectorTy() && + "expecting floating point or integer type for min/max reduction"); + CmpOpcode = Instruction::ICmp; + } + unsigned MinMaxCost = 0; + unsigned ShuffleCost = 0; + auto *ConcreteTTI = static_cast(this); + std::pair LT = + ConcreteTTI->getTLI()->getTypeLegalizationCost(DL, Ty); + unsigned LongVectorCount = 0; + unsigned MVTLen = + LT.second.isVector() ? LT.second.getVectorNumElements() : 1; + while (NumVecElts > MVTLen) { + NumVecElts /= 2; + // Assume the pairwise shuffles add a cost.
+ ShuffleCost += (IsPairwise + 1) * + ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty, + NumVecElts, Ty); + MinMaxCost += + ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) + + ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy, + nullptr); + Ty = VectorType::get(ScalarTy, NumVecElts); + CondTy = VectorType::get(ScalarCondTy, NumVecElts); + ++LongVectorCount; + } + // The minimal length of the vector is limited by the real length of vector + // operations performed on the current platform. That's why several final + // reduction opertions are perfomed on the vectors with the same + // architecture-dependent length. + ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) * + ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty, + NumVecElts, Ty); + MinMaxCost += + (NumReduxLevels - LongVectorCount) * + (ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) + + ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy, + nullptr)); + // Need 3 extractelement instructions for scalarization + an additional + // scalar select instruction. + return ShuffleCost + MinMaxCost + + 3 * getScalarizationOverhead(Ty, /*Insert=*/false, + /*Extract=*/true) + + static_cast(this)->getCmpSelInstrCost( + Instruction::Select, ScalarTy, ScalarCondTy, nullptr); + } + unsigned getVectorSplitCost() { return 1; } /// @} Index: include/llvm/Transforms/Vectorize/SLPVectorizer.h =================================================================== --- include/llvm/Transforms/Vectorize/SLPVectorizer.h +++ include/llvm/Transforms/Vectorize/SLPVectorizer.h @@ -100,6 +100,19 @@ slpvectorizer::BoUpSLP &R, TargetTransformInfo *TTI); + /// Try to vectorize trees that start at insertvalue instructions. + bool vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB, + slpvectorizer::BoUpSLP &R); + /// Try to vectorize trees that start at insertelement instructions. + bool vectorizeInsertElementInst(InsertElementInst *IEI, BasicBlock *BB, + slpvectorizer::BoUpSLP &R); + /// Try to vectorize trees that start at compare instructions. + bool vectorizeCmpInst(CmpInst *CI, BasicBlock *BB, slpvectorizer::BoUpSLP &R); + /// Tries to vectorize constructs started from CmpInst, InsertValueInst or + /// InsertElementInst instructions. + bool vectorizeSimpleInstructions(SmallVectorImpl &Instructions, + BasicBlock *BB, slpvectorizer::BoUpSLP &R); + /// \brief Scan the basic block and look for patterns that are likely to start /// a vectorization chain. bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R); Index: lib/Analysis/CostModel.cpp =================================================================== --- lib/Analysis/CostModel.cpp +++ lib/Analysis/CostModel.cpp @@ -186,26 +186,58 @@ } namespace { +/// Kind of the reduction data. +enum class ReductionKind { + NotReduction, /// Not a reduction. + ArithmeticReduction, /// Binary reduction data. + MinMaxReduction, /// Min/max reduction data. +}; /// Contains opcode + LHS/RHS parts of the reduction operations. 
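As an illustration only (not taken from the patch), the select-based pattern that getReductionData now classifies as ReductionKind::MinMaxReduction has the following shape; the function and value names are made up, and the recorded Opcode is the compare's opcode (ICmp here) rather than a binary operator:

define i32 @smax_example(i32 %a, i32 %b) {
  %cmp = icmp sgt i32 %a, %b
  %max = select i1 %cmp, i32 %a, i32 %b
  ret i32 %max
}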
struct ReductionData { - explicit ReductionData() = default; - ReductionData(unsigned Opcode, Value *LHS, Value *RHS) - : Opcode(Opcode), LHS(LHS), RHS(RHS) {} + ReductionData() = delete; + ReductionData(ReductionKind Kind, unsigned Opcode, Value *LHS, Value *RHS) + : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind) { + assert(Kind != ReductionKind::NotReduction && + "expected binary or min/max reduction only."); + } unsigned Opcode = 0; Value *LHS = nullptr; Value *RHS = nullptr; + ReductionKind Kind = ReductionKind::NotReduction; + bool isBinary() const { return Kind == ReductionKind::ArithmeticReduction; } + bool isMinMax() const { return Kind == ReductionKind::MinMaxReduction; } + bool hasSameData(ReductionData &RD) const { + return this == &RD || (Kind == RD.Kind && Opcode == RD.Opcode); + } }; } // namespace static Optional getReductionData(Instruction *I) { Value *L, *R; - if (m_BinOp(m_Value(L), m_Value(R)).match(I)) - return ReductionData(I->getOpcode(), L, R); + if (m_BinOp(m_Value(L), m_Value(R)).match(I)) { + return ReductionData(ReductionKind::ArithmeticReduction, I->getOpcode(), L, + R); + } + if (auto *SI = dyn_cast(I)) { + if (m_UMin(m_Value(L), m_Value(R)).match(SI) || + m_SMin(m_Value(L), m_Value(R)).match(SI) || + m_SMax(m_Value(L), m_Value(R)).match(SI) || + m_UMax(m_Value(L), m_Value(R)).match(SI) || + m_OrdFMin(m_Value(L), m_Value(R)).match(SI) || + m_OrdFMax(m_Value(L), m_Value(R)).match(SI) || + m_UnordFMin(m_Value(L), m_Value(R)).match(SI) || + m_UnordFMax(m_Value(L), m_Value(R)).match(SI)) { + auto *CI = cast(SI->getCondition()); + return ReductionData(ReductionKind::MinMaxReduction, CI->getOpcode(), L, + R); + } + } return llvm::None; } -static bool matchPairwiseReductionAtLevel(Instruction *I, unsigned Level, - unsigned NumLevels) { +static ReductionKind matchPairwiseReductionAtLevel(Instruction *I, + unsigned Level, + unsigned NumLevels) { // Match one level of pairwise operations. // %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, // <4 x i32> @@ -213,24 +245,24 @@ // <4 x i32> // %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 if (!I) - return false; + return ReductionKind::NotReduction; assert(I->getType()->isVectorTy() && "Expecting a vector type"); Optional RD = getReductionData(I); if (!RD) - return false; + return ReductionKind::NotReduction; ShuffleVectorInst *LS = dyn_cast(RD->LHS); if (!LS && Level) - return false; + return ReductionKind::NotReduction; ShuffleVectorInst *RS = dyn_cast(RD->RHS); if (!RS && Level) - return false; + return ReductionKind::NotReduction; // On level 0 we can omit one shufflevector instruction. if (!Level && !RS && !LS) - return false; + return ReductionKind::NotReduction; // Shuffle inputs must match. Value *NextLevelOpL = LS ? LS->getOperand(0) : nullptr; @@ -239,7 +271,7 @@ if (NextLevelOpR && NextLevelOpL) { // If we have two shuffles their operands must match. if (NextLevelOpL != NextLevelOpR) - return false; + return ReductionKind::NotReduction; NextLevelOp = NextLevelOpL; } else if (Level == 0 && (NextLevelOpR || NextLevelOpL)) { @@ -250,45 +282,45 @@ // %NextLevelOpL = shufflevector %R, <1, undef ...> // %BinOp = fadd %NextLevelOpL, %R if (NextLevelOpL && NextLevelOpL != RD->RHS) - return false; + return ReductionKind::NotReduction; else if (NextLevelOpR && NextLevelOpR != RD->LHS) - return false; + return ReductionKind::NotReduction; NextLevelOp = NextLevelOpL ? 
RD->RHS : RD->LHS; } else - return false; + return ReductionKind::NotReduction; // Check that the next levels binary operation exists and matches with the // current one. if (Level + 1 != NumLevels) { Optional NextLevelRD = getReductionData(cast(NextLevelOp)); - if (!NextLevelRD || RD->Opcode != NextLevelRD->Opcode) - return false; + if (!NextLevelRD || !RD->hasSameData(*NextLevelRD)) + return ReductionKind::NotReduction; } // Shuffle mask for pairwise operation must match. if (matchPairwiseShuffleMask(LS, /*IsLeft=*/true, Level)) { if (!matchPairwiseShuffleMask(RS, /*IsLeft=*/false, Level)) - return false; + return ReductionKind::NotReduction; } else if (matchPairwiseShuffleMask(RS, /*IsLeft=*/true, Level)) { if (!matchPairwiseShuffleMask(LS, /*IsLeft=*/false, Level)) - return false; + return ReductionKind::NotReduction; } else - return false; + return ReductionKind::NotReduction; if (++Level == NumLevels) - return true; + return RD->Kind; // Match next level. return matchPairwiseReductionAtLevel(cast(NextLevelOp), Level, NumLevels); } -static bool matchPairwiseReduction(const ExtractElementInst *ReduxRoot, - unsigned &Opcode, Type *&Ty) { +static ReductionKind matchPairwiseReduction(const ExtractElementInst *ReduxRoot, + unsigned &Opcode, Type *&Ty) { if (!EnableReduxCost) - return false; + return ReductionKind::NotReduction; // Need to extract the first element. ConstantInt *CI = dyn_cast(ReduxRoot->getOperand(1)); @@ -296,19 +328,19 @@ if (CI) Idx = CI->getZExtValue(); if (Idx != 0) - return false; + return ReductionKind::NotReduction; auto *RdxStart = dyn_cast(ReduxRoot->getOperand(0)); if (!RdxStart) - return false; + return ReductionKind::NotReduction; Optional RD = getReductionData(RdxStart); if (!RD) - return false; + return ReductionKind::NotReduction; Type *VecTy = RdxStart->getType(); unsigned NumVecElems = VecTy->getVectorNumElements(); if (!isPowerOf2_32(NumVecElems)) - return false; + return ReductionKind::NotReduction; // We look for a sequence of shuffle,shuffle,add triples like the following // that builds a pairwise reduction tree. @@ -328,13 +360,14 @@ // <4 x i32> // %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 // %r = extractelement <4 x float> %bin.rdx8, i32 0 - if (!matchPairwiseReductionAtLevel(RdxStart, 0, Log2_32(NumVecElems))) - return false; + if (matchPairwiseReductionAtLevel(RdxStart, 0, Log2_32(NumVecElems)) == + ReductionKind::NotReduction) + return ReductionKind::NotReduction; Opcode = RD->Opcode; Ty = VecTy; - return true; + return RD->Kind; } static std::pair @@ -348,10 +381,11 @@ return std::make_pair(L, S); } -static bool matchVectorSplittingReduction(const ExtractElementInst *ReduxRoot, - unsigned &Opcode, Type *&Ty) { +static ReductionKind +matchVectorSplittingReduction(const ExtractElementInst *ReduxRoot, + unsigned &Opcode, Type *&Ty) { if (!EnableReduxCost) - return false; + return ReductionKind::NotReduction; // Need to extract the first element. 
ConstantInt *CI = dyn_cast(ReduxRoot->getOperand(1)); @@ -359,19 +393,19 @@ if (CI) Idx = CI->getZExtValue(); if (Idx != 0) - return false; + return ReductionKind::NotReduction; auto *RdxStart = dyn_cast(ReduxRoot->getOperand(0)); if (!RdxStart) - return false; + return ReductionKind::NotReduction; Optional RD = getReductionData(RdxStart); if (!RD) - return false; + return ReductionKind::NotReduction; Type *VecTy = ReduxRoot->getOperand(0)->getType(); unsigned NumVecElems = VecTy->getVectorNumElements(); if (!isPowerOf2_32(NumVecElems)) - return false; + return ReductionKind::NotReduction; // We look for a sequence of shuffles and adds like the following matching one // fadd, shuffle vector pair at a time. @@ -391,10 +425,10 @@ while (NumVecElemsRemain - 1) { // Check for the right reduction operation. if (!RdxOp) - return false; + return ReductionKind::NotReduction; Optional RDLevel = getReductionData(RdxOp); - if (!RDLevel || RDLevel->Opcode != RD->Opcode) - return false; + if (!RDLevel || !RDLevel->hasSameData(*RD)) + return ReductionKind::NotReduction; Value *NextRdxOp; ShuffleVectorInst *Shuffle; @@ -403,9 +437,9 @@ // Check the current reduction operation and the shuffle use the same value. if (Shuffle == nullptr) - return false; + return ReductionKind::NotReduction; if (Shuffle->getOperand(0) != NextRdxOp) - return false; + return ReductionKind::NotReduction; // Check that shuffle masks matches. for (unsigned j = 0; j != MaskStart; ++j) @@ -415,7 +449,7 @@ SmallVector Mask = Shuffle->getShuffleMask(); if (ShuffleMask != Mask) - return false; + return ReductionKind::NotReduction; RdxOp = dyn_cast(NextRdxOp); NumVecElemsRemain /= 2; @@ -424,7 +458,7 @@ Opcode = RD->Opcode; Ty = VecTy; - return true; + return RD->Kind; } unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const { @@ -519,13 +553,28 @@ unsigned ReduxOpCode; Type *ReduxType; - if (matchVectorSplittingReduction(EEI, ReduxOpCode, ReduxType)) { + switch (matchVectorSplittingReduction(EEI, ReduxOpCode, ReduxType)) { + case ReductionKind::ArithmeticReduction: return TTI->getArithmeticReductionCost(ReduxOpCode, ReduxType, /*IsPairwiseForm=*/false); + case ReductionKind::MinMaxReduction: + return TTI->getMinMaxReductionCost(ReduxType, + CmpInst::makeCmpResultType(ReduxType), + /*IsPairwiseForm=*/false); + case ReductionKind::NotReduction: + break; } - if (matchPairwiseReduction(EEI, ReduxOpCode, ReduxType)) { + + switch (matchPairwiseReduction(EEI, ReduxOpCode, ReduxType)) { + case ReductionKind::ArithmeticReduction: return TTI->getArithmeticReductionCost(ReduxOpCode, ReduxType, /*IsPairwiseForm=*/true); + case ReductionKind::MinMaxReduction: + return TTI->getMinMaxReductionCost(ReduxType, + CmpInst::makeCmpResultType(ReduxType), + /*IsPairwiseForm=*/true); + case ReductionKind::NotReduction: + break; } return TTI->getVectorInstrCost(I->getOpcode(), Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -479,6 +479,13 @@ return Cost; } +int TargetTransformInfo::getMinMaxReductionCost(Type *Ty, Type *CondTy, + bool IsPairwiseForm) const { + int Cost = TTIImpl->getMinMaxReductionCost(Ty, CondTy, IsPairwiseForm); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; +} + unsigned TargetTransformInfo::getCostOfKeepingLiveOverCall(ArrayRef Tys) const { return TTIImpl->getCostOfKeepingLiveOverCall(Tys); Index: lib/Target/X86/X86TargetTransformInfo.h 
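For context, a minimal sketch (not one of the patch's tests) of a splitting-form min/max reduction tree that matchVectorSplittingReduction now reports as a MinMaxReduction; the 4-wide type, the smin choice, and the names are illustrative:

define i32 @smin_reduction_example(<4 x i32> %v) {
  %rdx.shuf = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %rdx.cmp = icmp slt <4 x i32> %v, %rdx.shuf
  %rdx.sel = select <4 x i1> %rdx.cmp, <4 x i32> %v, <4 x i32> %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x i32> %rdx.sel, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %rdx.cmp1 = icmp slt <4 x i32> %rdx.sel, %rdx.shuf1
  %rdx.sel1 = select <4 x i1> %rdx.cmp1, <4 x i32> %rdx.sel, <4 x i32> %rdx.shuf1
  %r = extractelement <4 x i32> %rdx.sel1, i32 0
  ret i32 %r
}

With this patch, the extractelement at the root of such a tree is priced through the new getMinMaxReductionCost hook instead of as a plain vector extract.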
=================================================================== --- lib/Target/X86/X86TargetTransformInfo.h +++ lib/Target/X86/X86TargetTransformInfo.h @@ -88,6 +88,8 @@ int getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm); + int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm); + int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace); Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -1877,6 +1877,111 @@ return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise); } +int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy, + bool IsPairwise) { + std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); + + MVT MTy = LT.second; + + int ISD = ValTy->isIntOrIntVectorTy() ? ISD::SMIN : ISD::FMINNUM; + + // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput + // and make it as the cost. + + static const CostTblEntry SSE42CostTblPairWise[] = { + {ISD::FMINNUM, MVT::v2f64, 3}, {ISD::FMINNUM, MVT::v4f32, 2}, + {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8" + {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5" + {ISD::SMIN, MVT::v8i16, 2}, + }; + + static const CostTblEntry AVX1CostTblPairWise[] = { + {ISD::FMINNUM, MVT::v4f32, 1}, {ISD::FMINNUM, MVT::v4f64, 1}, + {ISD::FMINNUM, MVT::v8f32, 2}, {ISD::SMIN, MVT::v2i64, 3}, + {ISD::SMIN, MVT::v4i32, 1}, {ISD::SMIN, MVT::v8i16, 1}, + {ISD::SMIN, MVT::v8i32, 3}, + }; + + static const CostTblEntry AVX2CostTblPairWise[] = { + {ISD::SMIN, MVT::v4i64, 2}, + {ISD::SMIN, MVT::v8i32, 1}, + {ISD::SMIN, MVT::v16i16, 1}, + {ISD::SMIN, MVT::v32i8, 2}, + }; + + static const CostTblEntry AVX512CostTblPairWise[] = { + {ISD::FMINNUM, MVT::v8f64, 1}, + {ISD::FMINNUM, MVT::v16f32, 2}, + {ISD::SMIN, MVT::v8i64, 2}, + {ISD::SMIN, MVT::v16i32, 1}, + }; + + static const CostTblEntry SSE42CostTblNoPairWise[] = { + {ISD::FMINNUM, MVT::v2f64, 3}, {ISD::FMINNUM, MVT::v4f32, 3}, + {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8" + {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5" + {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5" + }; + + static const CostTblEntry AVX1CostTblNoPairWise[] = { + {ISD::FMINNUM, MVT::v4f32, 1}, {ISD::FMINNUM, MVT::v4f64, 1}, + {ISD::FMINNUM, MVT::v8f32, 1}, {ISD::SMIN, MVT::v2i64, 3}, + {ISD::SMIN, MVT::v4i32, 1}, {ISD::SMIN, MVT::v8i16, 1}, + {ISD::SMIN, MVT::v8i32, 2}, + }; + + static const CostTblEntry AVX2CostTblNoPairWise[] = { + {ISD::SMIN, MVT::v4i64, 1}, + {ISD::SMIN, MVT::v8i32, 1}, + {ISD::SMIN, MVT::v16i16, 1}, + {ISD::SMIN, MVT::v32i8, 1}, + }; + + static const CostTblEntry AVX512CostTblNoPairWise[] = { + {ISD::FMINNUM, MVT::v8f64, 1}, + {ISD::FMINNUM, MVT::v16f32, 2}, + {ISD::SMIN, MVT::v8i64, 1}, + {ISD::SMIN, MVT::v16i32, 1}, + }; + + if (IsPairwise) { + if (ST->hasAVX512()) + if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasSSE42()) + if (const auto *Entry = 
CostTableLookup(SSE42CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + } else { + if (ST->hasAVX512()) + if (const auto *Entry = + CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + } + + return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise); +} + /// \brief Calculate the cost of materializing a 64-bit value. This helper /// method might only calculate a fraction of a larger immediate. Therefore it /// is valid to return a cost of ZERO. Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4356,7 +4356,7 @@ if (!I) return false; - if (!isa(I)) + if (!isa(I) && !isa(I)) return false; Value *P = I->getParent(); @@ -4459,59 +4459,126 @@ // Use map vector to make stable output. MapVector ExtraArgs; + /// Kind of the reduction data. + enum class ReductionKind { + NotReduction, /// Not a reduction. + ArithmeticReduction, /// Binary reduction data. + MinReduction, /// Minimum reduction data. + UMinReduction, /// Unsigned minimum reduction data. + MaxReduction, /// Maximum reduction data. + UMaxReduction, /// Unsigned maximum reduction data. + }; /// Contains info about operation, like its opcode, left and right operands. - struct OperationData { - /// true if the operation is a reduced value, false if reduction operation. - bool IsReducedValue = false; + class OperationData { /// Opcode of the instruction. unsigned Opcode = 0; /// Left operand of the reduction operation. Value *LHS = nullptr; /// Right operand of the reduction operation. Value *RHS = nullptr; + /// Kind of the reduction operation. + ReductionKind Kind = ReductionKind::NotReduction; + /// True if float point min/max reduction has no NaNs. + bool NoNaN = false; /// Checks if the reduction operation can be vectorized. bool isVectorizable() const { return LHS && RHS && - // We currently only support adds. - (Opcode == Instruction::Add || Opcode == Instruction::FAdd); + // We currently only support adds && min/max reductions. + ((Kind == ReductionKind::ArithmeticReduction && + (Opcode == Instruction::Add || Opcode == Instruction::FAdd)) || + ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && + (Kind == ReductionKind::MinReduction || + Kind == ReductionKind::MaxReduction)) || + (Opcode == Instruction::ICmp && + (Kind == ReductionKind::UMinReduction || + Kind == ReductionKind::UMaxReduction))); } public: explicit OperationData() = default; /// Construction for reduced values. They are identified by opcode only and /// don't have associated LHS/RHS values. - explicit OperationData(Value *V) : IsReducedValue(true) { + explicit OperationData(Value *V) : Kind(ReductionKind::NotReduction) { if (auto *I = dyn_cast(V)) Opcode = I->getOpcode(); } - /// Constructor for binary reduction operations with opcode and its left and + /// Constructor for reduction operations with opcode and its left and /// right operands. 
- OperationData(unsigned Opcode, Value *LHS, Value *RHS) - : IsReducedValue(false), Opcode(Opcode), LHS(LHS), RHS(RHS) {} + OperationData(unsigned Opcode, Value *LHS, Value *RHS, ReductionKind Kind, + bool NoNaN = false) + : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind), NoNaN(NoNaN) { + assert(Kind != ReductionKind::NotReduction && + "One of the reduction operations is expected."); + } explicit operator bool() const { return Opcode; } /// Get the index of the first operand. unsigned getFirstOperandIndex() const { assert(!!*this && "The opcode is not set."); + switch (Kind) { + case ReductionKind::MinReduction: + case ReductionKind::UMinReduction: + case ReductionKind::MaxReduction: + case ReductionKind::UMaxReduction: + return 1; + case ReductionKind::ArithmeticReduction: + case ReductionKind::NotReduction: + break; + } return 0; } /// Total number of operands in the reduction operation. unsigned getNumberOfOperands() const { - assert(!IsReducedValue && !!*this && LHS && RHS && + assert(Kind != ReductionKind::NotReduction && !!*this && LHS && RHS && "Expected reduction operation."); - return 2; + switch (Kind) { + case ReductionKind::ArithmeticReduction: + return 2; + case ReductionKind::MinReduction: + case ReductionKind::UMinReduction: + case ReductionKind::MaxReduction: + case ReductionKind::UMaxReduction: + return 3; + case ReductionKind::NotReduction: + llvm_unreachable("Reduction kind is not set"); + } } /// Expected number of uses for reduction operations/reduced values. unsigned getRequiredNumberOfUses() const { - assert(!IsReducedValue && !!*this && LHS && RHS && + assert(Kind != ReductionKind::NotReduction && !!*this && LHS && RHS && "Expected reduction operation."); - return 1; + switch (Kind) { + case ReductionKind::ArithmeticReduction: + return 1; + case ReductionKind::MinReduction: + case ReductionKind::UMinReduction: + case ReductionKind::MaxReduction: + case ReductionKind::UMaxReduction: + return 2; + case ReductionKind::NotReduction: + llvm_unreachable("Reduction kind is not set"); + } } /// Checks if instruction is associative and can be vectorized. bool isAssociative(Instruction *I) const { - assert(!IsReducedValue && *this && LHS && RHS && + assert(Kind != ReductionKind::NotReduction && *this && LHS && RHS && "Expected reduction operation."); - return I->isAssociative(); + switch (Kind) { + case ReductionKind::ArithmeticReduction: + return I->isAssociative(); + case ReductionKind::MinReduction: + case ReductionKind::MaxReduction: + return Opcode == Instruction::ICmp || + cast(I->getOperand(0))->hasUnsafeAlgebra(); + case ReductionKind::UMinReduction: + case ReductionKind::UMaxReduction: + assert(Opcode == Instruction::ICmp && + "Only integer compare operation is expected."); + return true; + case ReductionKind::NotReduction: + break; + } + llvm_unreachable("Reduction kind is not set"); } /// Checks if the reduction operation can be vectorized. bool isVectorizable(Instruction *I) const { @@ -4521,33 +4588,97 @@ /// Checks if two operation data are both a reduction op or both a reduced /// value. 
bool operator==(const OperationData &OD) { - assert(((IsReducedValue != OD.IsReducedValue) || - ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) && + assert(((Kind != OD.Kind) || ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) && "One of the comparing operations is incorrect."); - return this == &OD || - (IsReducedValue == OD.IsReducedValue && Opcode == OD.Opcode); + return this == &OD || (Kind == OD.Kind && Opcode == OD.Opcode); } bool operator!=(const OperationData &OD) { return !(*this == OD); } void clear() { - IsReducedValue = false; Opcode = 0; LHS = nullptr; RHS = nullptr; + Kind = ReductionKind::NotReduction; + NoNaN = false; } /// Get the opcode of the reduction operation. unsigned getOpcode() const { assert(isVectorizable() && "Expected vectorizable operation."); return Opcode; } + /// Get kind of reduction data. + ReductionKind getKind() const { return Kind; } Value *getLHS() const { return LHS; } Value *getRHS() const { return RHS; } + Type *getConditionType() const { + switch (Kind) { + case ReductionKind::ArithmeticReduction: + return nullptr; + case ReductionKind::MinReduction: + case ReductionKind::MaxReduction: + case ReductionKind::UMinReduction: + case ReductionKind::UMaxReduction: + return CmpInst::makeCmpResultType(LHS->getType()); + case ReductionKind::NotReduction: + break; + } + llvm_unreachable("Reduction kind is not set"); + } /// Creates reduction operation with the current opcode. Value *createOp(IRBuilder<> &Builder, const Twine &Name = "") const { - assert(!IsReducedValue && - (Opcode == Instruction::FAdd || Opcode == Instruction::Add) && - "Expected add|fadd reduction operation."); - return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS, - Name); + assert(isVectorizable() && + "Expected add|fadd or min/max reduction operation."); + Value *Cmp; + switch (Kind) { + case ReductionKind::ArithmeticReduction: + return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS, + Name); + case ReductionKind::MinReduction: + Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS) + : Builder.CreateFCmpOLT(LHS, RHS); + break; + case ReductionKind::MaxReduction: + Cmp = Opcode == Instruction::ICmp ? 
Builder.CreateICmpSGT(LHS, RHS) + : Builder.CreateFCmpOGT(LHS, RHS); + break; + case ReductionKind::UMinReduction: + assert(Opcode == Instruction::ICmp && "Expected integer types."); + Cmp = Builder.CreateICmpULT(LHS, RHS); + break; + case ReductionKind::UMaxReduction: + assert(Opcode == Instruction::ICmp && "Expected integer types."); + Cmp = Builder.CreateICmpUGT(LHS, RHS); + break; + case ReductionKind::NotReduction: + llvm_unreachable("Unknown reduction operation."); + } + return Builder.CreateSelect(Cmp, LHS, RHS, Name); + } + TargetTransformInfo::ReductionFlags getFlags() const { + TargetTransformInfo::ReductionFlags Flags; + Flags.NoNaN = NoNaN; + switch (Kind) { + case ReductionKind::ArithmeticReduction: + break; + case ReductionKind::MinReduction: + Flags.IsSigned = Opcode == Instruction::ICmp; + Flags.IsMaxOp = false; + break; + case ReductionKind::MaxReduction: + Flags.IsSigned = Opcode == Instruction::ICmp; + Flags.IsMaxOp = true; + break; + case ReductionKind::UMinReduction: + Flags.IsSigned = false; + Flags.IsMaxOp = false; + break; + case ReductionKind::UMaxReduction: + Flags.IsSigned = false; + Flags.IsMaxOp = true; + break; + case ReductionKind::NotReduction: + llvm_unreachable("Reduction kind is not set"); + } + return Flags; } }; @@ -4587,8 +4718,36 @@ Value *LHS; Value *RHS; - if (m_BinOp(m_Value(LHS), m_Value(RHS)).match(V)) - return OperationData(cast(V)->getOpcode(), LHS, RHS); + if (m_BinOp(m_Value(LHS), m_Value(RHS)).match(V)) { + return OperationData(cast(V)->getOpcode(), LHS, RHS, + ReductionKind::ArithmeticReduction); + } + if (auto *Select = dyn_cast(V)) { + // Look for a min/max pattern. + if (m_UMin(m_Value(LHS), m_Value(RHS)).match(Select)) { + return OperationData(Instruction::ICmp, LHS, RHS, + ReductionKind::UMinReduction); + } else if (m_SMin(m_Value(LHS), m_Value(RHS)).match(Select)) { + return OperationData(Instruction::ICmp, LHS, RHS, + ReductionKind::MinReduction); + } else if (m_OrdFMin(m_Value(LHS), m_Value(RHS)).match(Select) || + m_UnordFMin(m_Value(LHS), m_Value(RHS)).match(Select)) { + return OperationData( + Instruction::FCmp, LHS, RHS, ReductionKind::MinReduction, + cast(Select->getCondition())->hasNoNaNs()); + } else if (m_UMax(m_Value(LHS), m_Value(RHS)).match(Select)) { + return OperationData(Instruction::ICmp, LHS, RHS, + ReductionKind::UMaxReduction); + } else if (m_SMax(m_Value(LHS), m_Value(RHS)).match(Select)) { + return OperationData(Instruction::ICmp, LHS, RHS, + ReductionKind::MaxReduction); + } else if (m_OrdFMax(m_Value(LHS), m_Value(RHS)).match(Select) || + m_UnordFMax(m_Value(LHS), m_Value(RHS)).match(Select)) { + return OperationData( + Instruction::FCmp, LHS, RHS, ReductionKind::MaxReduction, + cast(Select->getCondition())->hasNoNaNs()); + } + } return OperationData(V); } @@ -4781,8 +4940,9 @@ if (VectorizedTree) { Builder.SetCurrentDebugLocation(Loc); OperationData VectReductionData(ReductionData.getOpcode(), - VectorizedTree, ReducedSubTree); - VectorizedTree = VectReductionData.createOp(Builder, "bin.rdx"); + VectorizedTree, ReducedSubTree, + ReductionData.getKind()); + VectorizedTree = VectReductionData.createOp(Builder, "op.rdx"); propagateIRFlags(VectorizedTree, ReductionOps); } else VectorizedTree = ReducedSubTree; @@ -4796,7 +4956,8 @@ auto *I = cast(ReducedVals[i]); Builder.SetCurrentDebugLocation(I->getDebugLoc()); OperationData VectReductionData(ReductionData.getOpcode(), - VectorizedTree, I); + VectorizedTree, I, + ReductionData.getKind()); VectorizedTree = VectReductionData.createOp(Builder); 
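For illustration (an assumed input, not from the patch's tests), a scalar signed-max chain of the shape the horizontal-reduction matcher can now walk when the root is a select; createOp later re-emits the same compare-plus-select form under the new "op.rdx"/"op.extra" names:

define i32 @smax_chain(i32 %a, i32 %b, i32 %c) {
  %cmp1 = icmp sgt i32 %a, %b
  %max1 = select i1 %cmp1, i32 %a, i32 %b
  %cmp2 = icmp sgt i32 %max1, %c
  %max2 = select i1 %cmp2, i32 %max1, i32 %c
  ret i32 %max2
}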
propagateIRFlags(VectorizedTree, ReductionOps); } @@ -4807,8 +4968,9 @@ for (auto *I : Pair.second) { Builder.SetCurrentDebugLocation(I->getDebugLoc()); OperationData VectReductionData(ReductionData.getOpcode(), - VectorizedTree, Pair.first); - VectorizedTree = VectReductionData.createOp(Builder, "bin.extra"); + VectorizedTree, Pair.first, + ReductionData.getKind()); + VectorizedTree = VectReductionData.createOp(Builder, "op.extra"); propagateIRFlags(VectorizedTree, I); } } @@ -4829,19 +4991,54 @@ Type *ScalarTy = FirstReducedVal->getType(); Type *VecTy = VectorType::get(ScalarTy, ReduxWidth); - int PairwiseRdxCost = - TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy, - /*IsPairwiseForm=*/true); - int SplittingRdxCost = - TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy, - /*IsPairwiseForm=*/false); + int PairwiseRdxCost; + int SplittingRdxCost; + switch (ReductionData.getKind()) { + case ReductionKind::ArithmeticReduction: + PairwiseRdxCost = + TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy, + /*IsPairwiseForm=*/true); + SplittingRdxCost = + TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy, + /*IsPairwiseForm=*/false); + break; + case ReductionKind::MinReduction: + case ReductionKind::MaxReduction: + case ReductionKind::UMinReduction: + case ReductionKind::UMaxReduction: { + Type *VecCondTy = CmpInst::makeCmpResultType(VecTy); + PairwiseRdxCost = TTI->getMinMaxReductionCost(VecTy, VecCondTy, + /*IsPairwiseForm=*/true); + SplittingRdxCost = TTI->getMinMaxReductionCost(VecTy, VecCondTy, + /*IsPairwiseForm=*/false); + break; + } + case ReductionKind::NotReduction: + llvm_unreachable("Expected arithmetic or min/max reduction operation"); + } IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost; int VecReduxCost = IsPairwiseReduction ? 
PairwiseRdxCost : SplittingRdxCost; - int ScalarReduxCost = - (ReduxWidth - 1) * - TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy); + int ScalarReduxCost; + switch (ReductionData.getKind()) { + case ReductionKind::ArithmeticReduction: + ScalarReduxCost = + TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy); + break; + case ReductionKind::MinReduction: + case ReductionKind::MaxReduction: + case ReductionKind::UMinReduction: + case ReductionKind::UMaxReduction: + ScalarReduxCost = + TTI->getCmpSelInstrCost(ReductionData.getOpcode(), ScalarTy) + + TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, + CmpInst::makeCmpResultType(ScalarTy)); + break; + case ReductionKind::NotReduction: + llvm_unreachable("Expected arithmetic or min/max reduction operation"); + } + ScalarReduxCost *= (ReduxWidth - 1); DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost << " for reduction that starts with " << *FirstReducedVal @@ -4863,7 +5060,7 @@ if (!IsPairwiseReduction) return createSimpleTargetReduction( Builder, TTI, ReductionData.getOpcode(), VectorizedValue, - TargetTransformInfo::ReductionFlags(), RedOps); + ReductionData.getFlags(), RedOps); Value *TmpVec = VectorizedValue; for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) { @@ -4878,8 +5075,8 @@ TmpVec, UndefValue::get(TmpVec->getType()), (RightMask), "rdx.shuf.r"); OperationData VectReductionData(ReductionData.getOpcode(), LeftShuf, - RightShuf); - TmpVec = VectReductionData.createOp(Builder, "bin.rdx"); + RightShuf, ReductionData.getKind()); + TmpVec = VectReductionData.createOp(Builder, "op.rdx"); propagateIRFlags(TmpVec, RedOps); } @@ -4894,39 +5091,30 @@ /// %rb = insertelement <4 x float> %ra, float %s1, i32 1 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3 +/// starting from the last insertelement instruction. /// /// Returns true if it matches /// -static bool findBuildVector(InsertElementInst *FirstInsertElem, +static bool findBuildVector(InsertElementInst *LastInsertElem, SmallVectorImpl &BuildVector, SmallVectorImpl &BuildVectorOpds) { - if (!isa(FirstInsertElem->getOperand(0))) - return false; - - InsertElementInst *IE = FirstInsertElem; - while (true) { - BuildVector.push_back(IE); - BuildVectorOpds.push_back(IE->getOperand(1)); - - if (IE->use_empty()) - return false; - - InsertElementInst *NextUse = dyn_cast(IE->user_back()); - if (!NextUse) - return true; - - // If this isn't the final use, make sure the next insertelement is the only - // use. It's OK if the final constructed vector is used multiple times - if (!IE->hasOneUse()) + Value *V = nullptr; + do { + BuildVector.push_back(LastInsertElem); + BuildVectorOpds.push_back(LastInsertElem->getOperand(1)); + V = LastInsertElem->getOperand(0); + if (isa(V)) + break; + LastInsertElem = dyn_cast(V); + if (!LastInsertElem || !LastInsertElem->hasOneUse()) return false; - - IE = NextUse; - } - - return false; + } while (true); + std::reverse(BuildVector.begin(), BuildVector.end()); + std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); + return true; } -/// \brief Like findBuildVector, but looks backwards for construction of aggregate. +/// \brief Like findBuildVector, but looks for construction of aggregate. /// /// \return true if it matches. 
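As an illustrative example (not part of the patch), the insertvalue chain below is the kind of aggregate construction findBuildAggregate matches; with vectorizeInsertValueInst such a chain no longer has to feed a store to be considered. The struct type and names are made up:

%pair = type { float, float }

define %pair @build_aggregate(float %x, float %y) {
  %a = insertvalue %pair undef, float %x, 0
  %b = insertvalue %pair %a, float %y, 1
  ret %pair %b
}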
static bool findBuildAggregate(InsertValueInst *IV, @@ -5048,9 +5236,11 @@ auto *Inst = dyn_cast(V); if (!Inst) continue; - if (auto *BI = dyn_cast(Inst)) { + auto *BI = dyn_cast(Inst); + auto *SI = dyn_cast(Inst); + if (BI || SI) { HorizontalReduction HorRdx; - if (HorRdx.matchAssociativeReduction(P, BI)) { + if (HorRdx.matchAssociativeReduction(P, Inst)) { if (HorRdx.tryToReduce(R, TTI)) { Res = true; // Set P to nullptr to avoid re-analysis of phi node in @@ -5059,7 +5249,7 @@ continue; } } - if (P) { + if (P && BI) { Inst = dyn_cast(BI->getOperand(0)); if (Inst == P) Inst = dyn_cast(BI->getOperand(1)); @@ -5111,6 +5301,64 @@ ExtraVectorization); } +bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, + BasicBlock *BB, BoUpSLP &R) { + const DataLayout &DL = BB->getModule()->getDataLayout(); + if (!R.canMapToVector(IVI->getType(), DL)) + return false; + + SmallVector BuildVector; + SmallVector BuildVectorOpds; + if (!findBuildAggregate(IVI, BuildVector, BuildVectorOpds)) + return false; + + DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); + return tryToVectorizeList(BuildVectorOpds, R, BuildVector, false); +} + +bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, + BasicBlock *BB, BoUpSLP &R) { + SmallVector BuildVector; + SmallVector BuildVectorOpds; + if (!findBuildVector(IEI, BuildVector, BuildVectorOpds)) + return false; + + // Vectorize starting with the build vector operands ignoring the BuildVector + // instructions for the purpose of scheduling and user extraction. + return tryToVectorizeList(BuildVectorOpds, R, BuildVector); +} + +bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB, + BoUpSLP &R) { + if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) + return true; + + bool OpsChanged = false; + for (int Idx = 0; Idx < 2; ++Idx) { + OpsChanged |= + vectorizeRootInstruction(nullptr, CI->getOperand(Idx), BB, R, TTI); + } + return OpsChanged; +} + +bool SLPVectorizerPass::vectorizeSimpleInstructions( + SmallVectorImpl &Instructions, BasicBlock *BB, BoUpSLP &R) { + bool OpsChanged = false; + for (auto &VH : reverse(Instructions)) { + auto *I = dyn_cast_or_null(VH); + if (!I) + continue; + if (auto *LastInsertValue = dyn_cast(I)) + OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); + else if (auto *LastInsertElem = dyn_cast(I)) + OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R); + else if (auto *CI = dyn_cast(I)) + OpsChanged |= vectorizeCmpInst(CI, BB, R); + } + Instructions.clear(); + return OpsChanged; +} + bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { bool Changed = false; SmallVector Incoming; @@ -5170,10 +5418,21 @@ VisitedInstrs.clear(); + SmallVector PostProcessInstructions; + SmallDenseSet KeyNodes; for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) { // We may go through BB multiple times so skip the one we have checked. - if (!VisitedInstrs.insert(&*it).second) + if (!VisitedInstrs.insert(&*it).second) { + if (it->use_empty() && KeyNodes.count(&*it) > 0 && + vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) { + // We would like to start over since some instructions are deleted + // and the iterator may become invalid value. + Changed = true; + it = BB->begin(); + e = BB->end(); + } continue; + } if (isa(it)) continue; @@ -5195,97 +5454,39 @@ continue; } - if (ShouldStartVectorizeHorAtStore) { - if (StoreInst *SI = dyn_cast(it)) { - // Try to match and vectorize a horizontal reduction. 
- if (vectorizeRootInstruction(nullptr, SI->getValueOperand(), BB, R, - TTI)) { - Changed = true; - it = BB->begin(); - e = BB->end(); - continue; + // Ran into an instruction without users, like terminator, or function call + // with ignored return value, store. Ignore unused instructions (basing on + // instruction type, except for CallInst and InvokeInst). + if (it->use_empty() && (it->getType()->isVoidTy() || isa(it) || + isa(it))) { + KeyNodes.insert(&*it); + bool OpsChanged = false; + if (ShouldStartVectorizeHorAtStore || !isa(it)) { + for (auto *V : it->operand_values()) { + // Try to match and vectorize a horizontal reduction. + OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI); } } - } - - // Try to vectorize horizontal reductions feeding into a return. - if (ReturnInst *RI = dyn_cast(it)) { - if (RI->getNumOperands() != 0) { - // Try to match and vectorize a horizontal reduction. - if (vectorizeRootInstruction(nullptr, RI->getOperand(0), BB, R, TTI)) { - Changed = true; - it = BB->begin(); - e = BB->end(); - continue; - } - } - } - - // Try to vectorize trees that start at compare instructions. - if (CmpInst *CI = dyn_cast(it)) { - if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) { - Changed = true; + // Start vectorization of post-process list of instructions from the + // top-tree instructions to try to vectorize as many instructions as + // possible. + OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R); + if (OpsChanged) { // We would like to start over since some instructions are deleted // and the iterator may become invalid value. - it = BB->begin(); - e = BB->end(); - continue; - } - - for (int I = 0; I < 2; ++I) { - if (vectorizeRootInstruction(nullptr, CI->getOperand(I), BB, R, TTI)) { - Changed = true; - // We would like to start over since some instructions are deleted - // and the iterator may become invalid value. - it = BB->begin(); - e = BB->end(); - break; - } - } - continue; - } - - // Try to vectorize trees that start at insertelement instructions. - if (InsertElementInst *FirstInsertElem = dyn_cast(it)) { - SmallVector BuildVector; - SmallVector BuildVectorOpds; - if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds)) - continue; - - // Vectorize starting with the build vector operands ignoring the - // BuildVector instructions for the purpose of scheduling and user - // extraction. - if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) { Changed = true; it = BB->begin(); e = BB->end(); + continue; } - - continue; } - // Try to vectorize trees that start at insertvalue instructions feeding into - // a store. 
- if (StoreInst *SI = dyn_cast(it)) { - if (InsertValueInst *LastInsertValue = dyn_cast(SI->getValueOperand())) { - const DataLayout &DL = BB->getModule()->getDataLayout(); - if (R.canMapToVector(SI->getValueOperand()->getType(), DL)) { - SmallVector BuildVector; - SmallVector BuildVectorOpds; - if (!findBuildAggregate(LastInsertValue, BuildVector, BuildVectorOpds)) - continue; + if (isa(it) || isa(it) || + isa(it)) + PostProcessInstructions.push_back(&*it); - DEBUG(dbgs() << "SLP: store of array mappable to vector: " << *SI << "\n"); - if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false)) { - Changed = true; - it = BB->begin(); - e = BB->end(); - } - continue; - } - } - } } + assert(PostProcessInstructions.empty()); return Changed; } Index: test/Transforms/SLPVectorizer/AArch64/gather-root.ll =================================================================== --- test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -31,10 +31,8 @@ ; ; GATHER-LABEL: @PR28330( ; GATHER-NEXT: entry: -; GATHER-NEXT: [[TMP0:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1 -; GATHER-NEXT: [[TMP1:%.*]] = icmp eq i8 [[TMP0]], 0 -; GATHER-NEXT: [[TMP2:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2 -; GATHER-NEXT: [[TMP3:%.*]] = icmp eq i8 [[TMP2]], 0 +; GATHER-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1 +; GATHER-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer ; GATHER-NEXT: [[TMP4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 ; GATHER-NEXT: [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0 ; GATHER-NEXT: [[TMP6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 @@ -50,10 +48,11 @@ ; GATHER-NEXT: br label [[FOR_BODY:%.*]] ; GATHER: for.body: ; GATHER-NEXT: [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; GATHER-NEXT: [[TMP19:%.*]] = select i1 [[TMP1]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP20:%.*]] = add i32 [[TMP17]], [[TMP19]] -; GATHER-NEXT: [[TMP21:%.*]] = select i1 [[TMP3]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], [[TMP21]] +; GATHER-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> , <2 x i32> +; GATHER-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; GATHER-NEXT: [[TMP20:%.*]] = add i32 [[TMP17]], [[TMP3]] +; GATHER-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; GATHER-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], [[TMP4]] ; GATHER-NEXT: [[TMP23:%.*]] = select i1 [[TMP5]], i32 -720, i32 -80 ; GATHER-NEXT: [[TMP24:%.*]] = add i32 [[TMP22]], [[TMP23]] ; GATHER-NEXT: [[TMP25:%.*]] = select i1 [[TMP7]], i32 -720, i32 -80 @@ -65,16 +64,16 @@ ; GATHER-NEXT: [[TMP31:%.*]] = select i1 [[TMP13]], i32 -720, i32 -80 ; GATHER-NEXT: [[TMP32:%.*]] = add i32 [[TMP30]], [[TMP31]] ; GATHER-NEXT: [[TMP33:%.*]] = select i1 [[TMP15]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP19]], i32 0 -; GATHER-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 [[TMP21]], i32 1 -; GATHER-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP23]], i32 2 -; GATHER-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP25]], i32 3 -; GATHER-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP27]], i32 4 -; 
GATHER-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP29]], i32 5 -; GATHER-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP31]], i32 6 -; GATHER-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP33]], i32 7 -; GATHER-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP7]]) -; GATHER-NEXT: [[BIN_EXTRA]] = add i32 [[TMP8]], [[TMP17]] +; GATHER-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0 +; GATHER-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP4]], i32 1 +; GATHER-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP23]], i32 2 +; GATHER-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP25]], i32 3 +; GATHER-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[TMP27]], i32 4 +; GATHER-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP29]], i32 5 +; GATHER-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[TMP31]], i32 6 +; GATHER-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP33]], i32 7 +; GATHER-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP12]]) +; GATHER-NEXT: [[BIN_EXTRA]] = add i32 [[TMP13]], [[TMP17]] ; GATHER-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], [[TMP33]] ; GATHER-NEXT: br label [[FOR_BODY]] ; @@ -180,10 +179,8 @@ ; ; GATHER-LABEL: @PR32038( ; GATHER-NEXT: entry: -; GATHER-NEXT: [[TMP0:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1 -; GATHER-NEXT: [[TMP1:%.*]] = icmp eq i8 [[TMP0]], 0 -; GATHER-NEXT: [[TMP2:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2 -; GATHER-NEXT: [[TMP3:%.*]] = icmp eq i8 [[TMP2]], 0 +; GATHER-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1 +; GATHER-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer ; GATHER-NEXT: [[TMP4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 ; GATHER-NEXT: [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0 ; GATHER-NEXT: [[TMP6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 @@ -199,10 +196,11 @@ ; GATHER-NEXT: br label [[FOR_BODY:%.*]] ; GATHER: for.body: ; GATHER-NEXT: [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; GATHER-NEXT: [[TMP19:%.*]] = select i1 [[TMP1]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP20:%.*]] = add i32 -5, [[TMP19]] -; GATHER-NEXT: [[TMP21:%.*]] = select i1 [[TMP3]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], [[TMP21]] +; GATHER-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> , <2 x i32> +; GATHER-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; GATHER-NEXT: [[TMP20:%.*]] = add i32 -5, [[TMP3]] +; GATHER-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; GATHER-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], [[TMP4]] ; GATHER-NEXT: [[TMP23:%.*]] = select i1 [[TMP5]], i32 -720, i32 -80 ; GATHER-NEXT: [[TMP24:%.*]] = add i32 [[TMP22]], [[TMP23]] ; GATHER-NEXT: [[TMP25:%.*]] = select i1 [[TMP7]], i32 -720, i32 -80 @@ -214,29 +212,27 @@ ; GATHER-NEXT: [[TMP31:%.*]] = select i1 [[TMP13]], i32 -720, i32 -80 ; GATHER-NEXT: [[TMP32:%.*]] = add i32 [[TMP30]], [[TMP31]] ; GATHER-NEXT: [[TMP33:%.*]] = select i1 [[TMP15]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> 
undef, i32 [[TMP19]], i32 0 -; GATHER-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 [[TMP21]], i32 1 -; GATHER-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP23]], i32 2 -; GATHER-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP25]], i32 3 -; GATHER-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP27]], i32 4 -; GATHER-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP29]], i32 5 -; GATHER-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP31]], i32 6 -; GATHER-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP33]], i32 7 -; GATHER-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP7]]) -; GATHER-NEXT: [[BIN_EXTRA]] = add i32 [[TMP8]], -5 +; GATHER-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0 +; GATHER-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP4]], i32 1 +; GATHER-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP23]], i32 2 +; GATHER-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP25]], i32 3 +; GATHER-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[TMP27]], i32 4 +; GATHER-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP29]], i32 5 +; GATHER-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[TMP31]], i32 6 +; GATHER-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP33]], i32 7 +; GATHER-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP12]]) +; GATHER-NEXT: [[BIN_EXTRA]] = add i32 [[TMP13]], -5 ; GATHER-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], [[TMP33]] ; GATHER-NEXT: br label [[FOR_BODY]] ; ; MAX-COST-LABEL: @PR32038( ; MAX-COST-NEXT: entry: -; MAX-COST-NEXT: [[TMP0:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1 -; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq i8 [[TMP0]], 0 -; MAX-COST-NEXT: [[TMP2:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2 -; MAX-COST-NEXT: [[TMP3:%.*]] = icmp eq i8 [[TMP2]], 0 +; MAX-COST-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1 +; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer ; MAX-COST-NEXT: [[TMP4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 -; MAX-COST-NEXT: [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0 +; MAX-COST-NEXT: [[TMPP5:%.*]] = icmp eq i8 [[TMP4]], 0 ; MAX-COST-NEXT: [[TMP6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 -; MAX-COST-NEXT: [[TMP7:%.*]] = icmp eq i8 [[TMP6]], 0 +; MAX-COST-NEXT: [[TMPP7:%.*]] = icmp eq i8 [[TMP6]], 0 ; MAX-COST-NEXT: [[TMP8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 ; MAX-COST-NEXT: [[TMP9:%.*]] = icmp eq i8 [[TMP8]], 0 ; MAX-COST-NEXT: [[TMP10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 @@ -245,14 +241,16 @@ ; MAX-COST-NEXT: [[TMP13:%.*]] = icmp eq i8 [[TMP12]], 0 ; MAX-COST-NEXT: [[TMP14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8 ; MAX-COST-NEXT: [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0 -; MAX-COST-NEXT: [[TMP0:%.*]] = insertelement <4 x i1> undef, i1 [[TMP1]], i32 0 -; MAX-COST-NEXT: [[TMP1:%.*]] = insertelement <4 x i1> [[TMP0]], i1 [[TMP3]], i32 1 -; 
MAX-COST-NEXT: [[TMP2:%.*]] = insertelement <4 x i1> [[TMP1]], i1 [[TMP5]], i32 2 -; MAX-COST-NEXT: [[TMP3:%.*]] = insertelement <4 x i1> [[TMP2]], i1 [[TMP7]], i32 3 ; MAX-COST-NEXT: br label [[FOR_BODY:%.*]] ; MAX-COST: for.body: ; MAX-COST-NEXT: [[TMP17:%.*]] = phi i32 [ [[TMP34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; MAX-COST-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> , <4 x i32> +; MAX-COST-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 +; MAX-COST-NEXT: [[TMP3:%.*]] = insertelement <4 x i1> undef, i1 [[TMP2]], i32 0 +; MAX-COST-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 +; MAX-COST-NEXT: [[TMP5:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[TMP4]], i32 1 +; MAX-COST-NEXT: [[TMP6:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[TMPP5]], i32 2 +; MAX-COST-NEXT: [[TMP7:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[TMPP7]], i32 3 +; MAX-COST-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> , <4 x i32> ; MAX-COST-NEXT: [[TMP20:%.*]] = add i32 -5, undef ; MAX-COST-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], undef ; MAX-COST-NEXT: [[TMP24:%.*]] = add i32 [[TMP22]], undef @@ -260,10 +258,10 @@ ; MAX-COST-NEXT: [[TMP27:%.*]] = select i1 [[TMP9]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[TMP28:%.*]] = add i32 [[TMP26]], [[TMP27]] ; MAX-COST-NEXT: [[TMP29:%.*]] = select i1 [[TMP11]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP4]]) -; MAX-COST-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], [[TMP27]] -; MAX-COST-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], [[TMP29]] -; MAX-COST-NEXT: [[BIN_EXTRA:%.*]] = add i32 [[TMP7]], -5 +; MAX-COST-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP8]]) +; MAX-COST-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP27]] +; MAX-COST-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP29]] +; MAX-COST-NEXT: [[BIN_EXTRA:%.*]] = add i32 [[TMP11]], -5 ; MAX-COST-NEXT: [[TMP30:%.*]] = add i32 [[TMP28]], [[TMP29]] ; MAX-COST-NEXT: [[TMP31:%.*]] = select i1 [[TMP13]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[TMP32:%.*]] = add i32 [[BIN_EXTRA]], [[TMP31]] Index: test/Transforms/SLPVectorizer/X86/horizontal-list.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -117,11 +117,11 @@ ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]] -; CHECK-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV6]] +; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]] +; CHECK-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]] ; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float undef, [[ADD19_2]] -; CHECK-NEXT: store float [[BIN_EXTRA5]], float* @res, align 4 -; CHECK-NEXT: ret float [[BIN_EXTRA5]] +; CHECK-NEXT: store float [[OP_EXTRA5]], float* @res, align 4 +; CHECK-NEXT: ret float [[OP_EXTRA5]] ; ; THRESHOLD-LABEL: @bazz( ; THRESHOLD-NEXT: entry: @@ -148,11 +148,11 @@ ; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> ; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] ; THRESHOLD-NEXT: [[TMP4:%.*]] = 
extractelement <8 x float> [[BIN_RDX4]], i32 0 -; THRESHOLD-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]] -; THRESHOLD-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV6]] +; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]] +; THRESHOLD-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]] ; THRESHOLD-NEXT: [[ADD19_3:%.*]] = fadd fast float undef, [[ADD19_2]] -; THRESHOLD-NEXT: store float [[BIN_EXTRA5]], float* @res, align 4 -; THRESHOLD-NEXT: ret float [[BIN_EXTRA5]] +; THRESHOLD-NEXT: store float [[OP_EXTRA5]], float* @res, align 4 +; THRESHOLD-NEXT: ret float [[OP_EXTRA5]] ; entry: %0 = load i32, i32* @n, align 4 @@ -327,47 +327,53 @@ define float @bar() { ; CHECK-LABEL: @bar( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16 -; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 ; CHECK-NEXT: [[CMP4:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float [[TMP3]], float [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8 -; CHECK-NEXT: [[MUL3_1:%.*]] = fmul fast float [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], [[MUL3_1]] -; CHECK-NEXT: [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float [[MUL3_1]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4 -; CHECK-NEXT: [[MUL3_2:%.*]] = fmul fast float [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], [[MUL3_2]] -; CHECK-NEXT: [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float [[MUL3_2]] -; CHECK-NEXT: store float [[MAX_0_MUL3_2]], float* @res, align 4 -; CHECK-NEXT: ret float [[MAX_0_MUL3_2]] +; CHECK-NEXT: [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float undef, float undef +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; CHECK-NEXT: [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], [[TMP5]] +; CHECK-NEXT: [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float undef +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; CHECK-NEXT: [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], [[TMP6]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: 
[[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x float> [[TMP2]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP2]], <4 x float> [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0 +; CHECK-NEXT: [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float undef +; CHECK-NEXT: store float [[TMP7]], float* @res, align 4 +; CHECK-NEXT: ret float [[TMP7]] ; ; THRESHOLD-LABEL: @bar( ; THRESHOLD-NEXT: entry: -; THRESHOLD-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16 -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16 -; THRESHOLD-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]] -; THRESHOLD-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 -; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; THRESHOLD-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 +; THRESHOLD-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]] +; THRESHOLD-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 ; THRESHOLD-NEXT: [[CMP4:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]] -; THRESHOLD-NEXT: [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float [[TMP3]], float [[TMP4]] -; THRESHOLD-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8 -; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8 -; THRESHOLD-NEXT: [[MUL3_1:%.*]] = fmul fast float [[TMP6]], [[TMP5]] -; THRESHOLD-NEXT: [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], [[MUL3_1]] -; THRESHOLD-NEXT: [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float [[MUL3_1]] -; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4 -; THRESHOLD-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4 -; THRESHOLD-NEXT: [[MUL3_2:%.*]] = fmul fast float [[TMP8]], [[TMP7]] -; THRESHOLD-NEXT: [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], [[MUL3_2]] -; THRESHOLD-NEXT: [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float [[MUL3_2]] -; THRESHOLD-NEXT: store float [[MAX_0_MUL3_2]], float* @res, align 4 -; THRESHOLD-NEXT: ret float [[MAX_0_MUL3_2]] +; THRESHOLD-NEXT: [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float undef, float undef +; THRESHOLD-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; THRESHOLD-NEXT: [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], [[TMP5]] +; THRESHOLD-NEXT: [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float undef +; THRESHOLD-NEXT: [[TMP6:%.*]] = extractelement <4 
x float> [[TMP2]], i32 3 +; THRESHOLD-NEXT: [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], [[TMP6]] +; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> +; THRESHOLD-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x float> [[TMP2]], [[RDX_SHUF]] +; THRESHOLD-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP2]], <4 x float> [[RDX_SHUF]] +; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> undef, <4 x i32> +; THRESHOLD-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] +; THRESHOLD-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]] +; THRESHOLD-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0 +; THRESHOLD-NEXT: [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float undef +; THRESHOLD-NEXT: store float [[TMP7]], float* @res, align 4 +; THRESHOLD-NEXT: ret float [[TMP7]] ; entry: %0 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16 @@ -512,9 +518,9 @@ ; CHECK-NEXT: [[RDX_SHUF15:%.*]] = shufflevector <16 x float> [[BIN_RDX14]], <16 x float> undef, <16 x i32> ; CHECK-NEXT: [[BIN_RDX16:%.*]] = fadd fast <16 x float> [[BIN_RDX14]], [[RDX_SHUF15]] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x float> [[BIN_RDX16]], i32 0 -; CHECK-NEXT: [[BIN_RDX17:%.*]] = fadd fast float [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[ADD_47:%.*]] = fadd fast float undef, [[ADD_46]] -; CHECK-NEXT: ret float [[BIN_RDX17]] +; CHECK-NEXT: ret float [[OP_RDX]] ; ; THRESHOLD-LABEL: @f( ; THRESHOLD-NEXT: entry: @@ -635,9 +641,9 @@ ; THRESHOLD-NEXT: [[RDX_SHUF15:%.*]] = shufflevector <16 x float> [[BIN_RDX14]], <16 x float> undef, <16 x i32> ; THRESHOLD-NEXT: [[BIN_RDX16:%.*]] = fadd fast <16 x float> [[BIN_RDX14]], [[RDX_SHUF15]] ; THRESHOLD-NEXT: [[TMP5:%.*]] = extractelement <16 x float> [[BIN_RDX16]], i32 0 -; THRESHOLD-NEXT: [[BIN_RDX17:%.*]] = fadd fast float [[TMP4]], [[TMP5]] +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]] ; THRESHOLD-NEXT: [[ADD_47:%.*]] = fadd fast float undef, [[ADD_46]] -; THRESHOLD-NEXT: ret float [[BIN_RDX17]] +; THRESHOLD-NEXT: ret float [[OP_RDX]] ; entry: %0 = load float, float* %x, align 4 @@ -865,9 +871,9 @@ ; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> ; CHECK-NEXT: [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]] ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0 -; CHECK-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]] +; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]] ; CHECK-NEXT: [[ADD_31:%.*]] = fadd fast float undef, [[ADD_30]] -; CHECK-NEXT: ret float [[BIN_EXTRA]] +; CHECK-NEXT: ret float [[OP_EXTRA]] ; ; THRESHOLD-LABEL: @f1( ; THRESHOLD-NEXT: entry: @@ -948,9 +954,9 @@ ; THRESHOLD-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> ; THRESHOLD-NEXT: [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]] ; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0 -; THRESHOLD-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]] +; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]] ; 
THRESHOLD-NEXT: [[ADD_31:%.*]] = fadd fast float undef, [[ADD_30]] -; THRESHOLD-NEXT: ret float [[BIN_EXTRA]] +; THRESHOLD-NEXT: ret float [[OP_EXTRA]] ; entry: %rem = srem i32 %a, %b @@ -1138,14 +1144,14 @@ ; CHECK-NEXT: [[RDX_SHUF11:%.*]] = shufflevector <8 x float> [[BIN_RDX10]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: [[BIN_RDX12:%.*]] = fadd fast <8 x float> [[BIN_RDX10]], [[RDX_SHUF11]] ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[BIN_RDX12]], i32 0 -; CHECK-NEXT: [[BIN_RDX13:%.*]] = fadd fast float [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[RDX_SHUF14:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX15:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF14]] -; CHECK-NEXT: [[RDX_SHUF16:%.*]] = shufflevector <4 x float> [[BIN_RDX15]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX17:%.*]] = fadd fast <4 x float> [[BIN_RDX15]], [[RDX_SHUF16]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[BIN_RDX17]], i32 0 -; CHECK-NEXT: [[BIN_RDX18:%.*]] = fadd fast float [[BIN_RDX13]], [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[BIN_RDX18]], [[TMP1]] +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[RDX_SHUF13:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX14:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF13]] +; CHECK-NEXT: [[RDX_SHUF15:%.*]] = shufflevector <4 x float> [[BIN_RDX14]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX16:%.*]] = fadd fast <4 x float> [[BIN_RDX14]], [[RDX_SHUF15]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[BIN_RDX16]], i32 0 +; CHECK-NEXT: [[OP_RDX17:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[OP_RDX17]], [[TMP1]] ; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]] ; CHECK-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]] ; CHECK-NEXT: ret float [[TMP12]] @@ -1234,14 +1240,14 @@ ; THRESHOLD-NEXT: [[RDX_SHUF11:%.*]] = shufflevector <8 x float> [[BIN_RDX10]], <8 x float> undef, <8 x i32> ; THRESHOLD-NEXT: [[BIN_RDX12:%.*]] = fadd fast <8 x float> [[BIN_RDX10]], [[RDX_SHUF11]] ; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[BIN_RDX12]], i32 0 -; THRESHOLD-NEXT: [[BIN_RDX13:%.*]] = fadd fast float [[TMP8]], [[TMP9]] -; THRESHOLD-NEXT: [[RDX_SHUF14:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> -; THRESHOLD-NEXT: [[BIN_RDX15:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF14]] -; THRESHOLD-NEXT: [[RDX_SHUF16:%.*]] = shufflevector <4 x float> [[BIN_RDX15]], <4 x float> undef, <4 x i32> -; THRESHOLD-NEXT: [[BIN_RDX17:%.*]] = fadd fast <4 x float> [[BIN_RDX15]], [[RDX_SHUF16]] -; THRESHOLD-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[BIN_RDX17]], i32 0 -; THRESHOLD-NEXT: [[BIN_RDX18:%.*]] = fadd fast float [[BIN_RDX13]], [[TMP10]] -; THRESHOLD-NEXT: [[TMP11:%.*]] = fadd fast float [[BIN_RDX18]], [[TMP1]] +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]] +; THRESHOLD-NEXT: [[RDX_SHUF13:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> +; THRESHOLD-NEXT: [[BIN_RDX14:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF13]] +; THRESHOLD-NEXT: [[RDX_SHUF15:%.*]] = shufflevector <4 x float> [[BIN_RDX14]], <4 x float> undef, <4 x i32> +; THRESHOLD-NEXT: [[BIN_RDX16:%.*]] = fadd fast <4 x float> [[BIN_RDX14]], [[RDX_SHUF15]] +; THRESHOLD-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[BIN_RDX16]], i32 0 +; THRESHOLD-NEXT: 
[[OP_RDX17:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]] +; THRESHOLD-NEXT: [[TMP11:%.*]] = fadd fast float [[OP_RDX17]], [[TMP1]] ; THRESHOLD-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]] ; THRESHOLD-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]] ; THRESHOLD-NEXT: ret float [[TMP12]] @@ -1369,10 +1375,10 @@ ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] -; CHECK-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]] +; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] +; CHECK-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] ; CHECK-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]] -; CHECK-NEXT: ret float [[BIN_EXTRA5]] +; CHECK-NEXT: ret float [[OP_EXTRA5]] ; ; THRESHOLD-LABEL: @extra_args( ; THRESHOLD-NEXT: entry: @@ -1403,10 +1409,10 @@ ; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> ; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] ; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 -; THRESHOLD-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] -; THRESHOLD-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]] +; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] +; THRESHOLD-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] ; THRESHOLD-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]] -; THRESHOLD-NEXT: ret float [[BIN_EXTRA5]] +; THRESHOLD-NEXT: ret float [[OP_EXTRA5]] ; entry: %mul = mul nsw i32 %b, %a @@ -1471,12 +1477,12 @@ ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] -; CHECK-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], 5.000000e+00 -; CHECK-NEXT: [[BIN_EXTRA6:%.*]] = fadd fast float [[BIN_EXTRA5]], 5.000000e+00 -; CHECK-NEXT: [[BIN_EXTRA7:%.*]] = fadd fast float [[BIN_EXTRA6]], [[CONV]] +; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] +; CHECK-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], 5.000000e+00 +; CHECK-NEXT: [[OP_EXTRA6:%.*]] = fadd fast float [[OP_EXTRA5]], 5.000000e+00 +; CHECK-NEXT: [[OP_EXTRA7:%.*]] = fadd fast float [[OP_EXTRA6]], [[CONV]] ; CHECK-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]] -; CHECK-NEXT: ret float [[BIN_EXTRA7]] +; CHECK-NEXT: ret float [[OP_EXTRA7]] ; ; THRESHOLD-LABEL: @extra_args_same_several_times( ; THRESHOLD-NEXT: entry: @@ -1509,12 +1515,12 @@ ; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> ; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] ; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 -; THRESHOLD-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] -; THRESHOLD-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], 5.000000e+00 -; THRESHOLD-NEXT: [[BIN_EXTRA6:%.*]] = fadd fast float [[BIN_EXTRA5]], 5.000000e+00 -; THRESHOLD-NEXT: 
[[BIN_EXTRA7:%.*]] = fadd fast float [[BIN_EXTRA6]], [[CONV]] +; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] +; THRESHOLD-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], 5.000000e+00 +; THRESHOLD-NEXT: [[OP_EXTRA6:%.*]] = fadd fast float [[OP_EXTRA5]], 5.000000e+00 +; THRESHOLD-NEXT: [[OP_EXTRA7:%.*]] = fadd fast float [[OP_EXTRA6]], [[CONV]] ; THRESHOLD-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]] -; THRESHOLD-NEXT: ret float [[BIN_EXTRA7]] +; THRESHOLD-NEXT: ret float [[OP_EXTRA7]] ; entry: %mul = mul nsw i32 %b, %a @@ -1581,10 +1587,10 @@ ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] -; CHECK-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]] +; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] +; CHECK-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] ; CHECK-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]] -; CHECK-NEXT: ret float [[BIN_EXTRA5]] +; CHECK-NEXT: ret float [[OP_EXTRA5]] ; ; THRESHOLD-LABEL: @extra_args_no_replace( ; THRESHOLD-NEXT: entry: @@ -1617,10 +1623,10 @@ ; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> ; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] ; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 -; THRESHOLD-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] -; THRESHOLD-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]] +; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] +; THRESHOLD-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] ; THRESHOLD-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]] -; THRESHOLD-NEXT: ret float [[BIN_EXTRA5]] +; THRESHOLD-NEXT: ret float [[OP_EXTRA5]] ; entry: %mul = mul nsw i32 %b, %a @@ -1679,10 +1685,10 @@ ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 -; CHECK-NEXT: [[BIN_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]] -; CHECK-NEXT: [[BIN_EXTRA3:%.*]] = add nsw i32 [[BIN_EXTRA]], [[TMP9]] +; CHECK-NEXT: [[OP_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]] +; CHECK-NEXT: [[OP_EXTRA3:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP9]] ; CHECK-NEXT: [[R5:%.*]] = add nsw i32 [[R4]], undef -; CHECK-NEXT: ret i32 [[BIN_EXTRA3]] +; CHECK-NEXT: ret i32 [[OP_EXTRA3]] ; ; THRESHOLD-LABEL: @wobble( ; THRESHOLD-NEXT: bb: @@ -1707,10 +1713,10 @@ ; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> ; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] ; THRESHOLD-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 -; THRESHOLD-NEXT: [[BIN_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]] -; THRESHOLD-NEXT: [[BIN_EXTRA3:%.*]] = add nsw i32 [[BIN_EXTRA]], [[TMP9]] +; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]] +; THRESHOLD-NEXT: [[OP_EXTRA3:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP9]] ; THRESHOLD-NEXT: [[R5:%.*]] = add nsw i32 [[R4]], undef -; THRESHOLD-NEXT: ret i32 [[BIN_EXTRA3]] +; 
THRESHOLD-NEXT: ret i32 [[OP_EXTRA3]] ; bb: %x1 = xor i32 %arg, %bar Index: test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -34,79 +34,46 @@ ; CHECK-NEXT: ret i32 [[TMP23]] ; ; AVX-LABEL: @maxi8( -; AVX-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; AVX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8 -; AVX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]] -; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]] -; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4 -; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] -; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] -; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 -; AVX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] -; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 -; AVX-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -; AVX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]] -; AVX-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -; AVX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]] -; AVX-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; AVX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]] -; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]] -; AVX-NEXT: ret i32 [[TMP23]] +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16 +; AVX: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[TMP24:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[RDX_SHUF]] +; AVX-NEXT: [[BIN_RDX:%.*]] = select <8 x i1> [[TMP24]], <8 x i32> [[TMP2]], <8 x i32> [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[TMP25:%.*]] = icmp sgt <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[BIN_RDX2:%.*]] = select <8 x i1> [[TMP25]], <8 x i32> [[BIN_RDX]], <8 x i32> [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[TMP26:%.*]] = icmp sgt <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[BIN_RDX4:%.*]] = select <8 x i1> [[TMP26]], <8 x i32> [[BIN_RDX2]], <8 x i32> [[RDX_SHUF3]] +; AVX-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; AVX: ret i32 [[TMP27]] ; ; AVX2-LABEL: @maxi8( -; AVX2-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; AVX2-NEXT: 
[[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; AVX2-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8 -; AVX2-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]] -; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]] -; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4 -; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] -; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] -; AVX2-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 -; AVX2-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] -; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 -; AVX2-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -; AVX2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]] -; AVX2-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX2-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -; AVX2-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]] -; AVX2-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; AVX2-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]] -; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]] -; AVX2-NEXT: ret i32 [[TMP23]] +; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16 +; AVX2: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> +; AVX2-NEXT: [[TMP24:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[RDX_SHUF]] +; AVX2-NEXT: [[BIN_RDX:%.*]] = select <8 x i1> [[TMP24]], <8 x i32> [[TMP2]], <8 x i32> [[RDX_SHUF]] +; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; AVX2-NEXT: [[TMP25:%.*]] = icmp sgt <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX2-NEXT: [[BIN_RDX2:%.*]] = select <8 x i1> [[TMP25]], <8 x i32> [[BIN_RDX]], <8 x i32> [[RDX_SHUF1]] +; AVX2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; AVX2-NEXT: [[TMP26:%.*]] = icmp sgt <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX2-NEXT: [[BIN_RDX4:%.*]] = select <8 x i1> [[TMP26]], <8 x i32> [[BIN_RDX2]], <8 x i32> [[RDX_SHUF3]] +; AVX2-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; AVX2: ret i32 [[TMP27]] ; ; SKX-LABEL: @maxi8( -; SKX-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; SKX-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; SKX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; SKX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; SKX-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8 -; SKX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]] -; SKX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]] -; 
SKX-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4 -; SKX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] -; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] -; SKX-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 -; SKX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] -; SKX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; SKX-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 -; SKX-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -; SKX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]] -; SKX-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; SKX-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -; SKX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]] -; SKX-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; SKX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]] -; SKX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]] -; SKX-NEXT: ret i32 [[TMP23]] +; SKX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16 +; SKX: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> +; SKX-NEXT: [[TMP24:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[RDX_SHUF]] +; SKX-NEXT: [[BIN_RDX:%.*]] = select <8 x i1> [[TMP24]], <8 x i32> [[TMP2]], <8 x i32> [[RDX_SHUF]] +; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; SKX-NEXT: [[TMP25:%.*]] = icmp sgt <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; SKX-NEXT: [[BIN_RDX2:%.*]] = select <8 x i1> [[TMP25]], <8 x i32> [[BIN_RDX]], <8 x i32> [[RDX_SHUF1]] +; SKX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; SKX-NEXT: [[TMP26:%.*]] = icmp sgt <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; SKX-NEXT: [[BIN_RDX4:%.*]] = select <8 x i1> [[TMP26]], <8 x i32> [[BIN_RDX2]], <8 x i32> [[RDX_SHUF3]] +; SKX-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; SKX: ret i32 [[TMP27]] ; %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 @@ -184,151 +151,55 @@ ; CHECK-NEXT: ret i32 [[TMP47]] ; ; AVX-LABEL: @maxi16( -; AVX-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; AVX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8 -; AVX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]] -; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]] -; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4 -; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] -; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] -; AVX-NEXT: 
[[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 -; AVX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] -; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 -; AVX-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -; AVX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]] -; AVX-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -; AVX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]] -; AVX-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; AVX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]] -; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]] -; AVX-NEXT: [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16 -; AVX-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]] -; AVX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]] -; AVX-NEXT: [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4 -; AVX-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]] -; AVX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]] -; AVX-NEXT: [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8 -; AVX-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]] -; AVX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]] -; AVX-NEXT: [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4 -; AVX-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]] -; AVX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]] -; AVX-NEXT: [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16 -; AVX-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -; AVX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]] -; AVX-NEXT: [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4 -; AVX-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] -; AVX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]] -; AVX-NEXT: [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8 -; AVX-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] -; AVX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]] -; AVX-NEXT: [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4 -; AVX-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] -; AVX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]] -; AVX-NEXT: ret i32 [[TMP47]] +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr to <16 x i32>*), align 16 +; AVX: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> undef, <16 x i32> +; AVX-NEXT: [[TMP48:%.*]] = icmp sgt <16 x i32> [[TMP2]], [[RDX_SHUF]] +; AVX-NEXT: [[BIN_RDX:%.*]] = select <16 x i1> [[TMP48]], <16 x i32> [[TMP2]], <16 x i32> 
[[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[BIN_RDX]], <16 x i32> undef, <16 x i32> +; AVX-NEXT: [[TMP49:%.*]] = icmp sgt <16 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[BIN_RDX2:%.*]] = select <16 x i1> [[TMP49]], <16 x i32> [[BIN_RDX]], <16 x i32> [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x i32> [[BIN_RDX2]], <16 x i32> undef, <16 x i32> +; AVX-NEXT: [[TMP50:%.*]] = icmp sgt <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[BIN_RDX4:%.*]] = select <16 x i1> [[TMP50]], <16 x i32> [[BIN_RDX2]], <16 x i32> [[RDX_SHUF3]] +; AVX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x i32> [[BIN_RDX4]], <16 x i32> undef, <16 x i32> +; AVX-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]] +; AVX-NEXT: [[BIN_RDX6:%.*]] = select <16 x i1> [[TMP51]], <16 x i32> [[BIN_RDX4]], <16 x i32> [[RDX_SHUF5]] +; AVX-NEXT: [[TMP52:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0 +; AVX: ret i32 [[TMP52]] ; ; AVX2-LABEL: @maxi16( -; AVX2-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; AVX2-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8 -; AVX2-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]] -; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]] -; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4 -; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] -; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] -; AVX2-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 -; AVX2-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] -; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 -; AVX2-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -; AVX2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]] -; AVX2-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX2-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -; AVX2-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]] -; AVX2-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; AVX2-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]] -; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]] -; AVX2-NEXT: [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16 -; AVX2-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]] -; AVX2-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]] -; AVX2-NEXT: [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4 -; AVX2-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]] -; AVX2-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]] -; AVX2-NEXT: [[TMP30:%.*]] = load i32, i32* 
getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8 -; AVX2-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]] -; AVX2-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]] -; AVX2-NEXT: [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4 -; AVX2-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]] -; AVX2-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]] -; AVX2-NEXT: [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16 -; AVX2-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -; AVX2-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]] -; AVX2-NEXT: [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4 -; AVX2-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] -; AVX2-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]] -; AVX2-NEXT: [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8 -; AVX2-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] -; AVX2-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]] -; AVX2-NEXT: [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4 -; AVX2-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] -; AVX2-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]] -; AVX2-NEXT: ret i32 [[TMP47]] +; AVX2-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr to <16 x i32>*), align 16 +; AVX2: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> undef, <16 x i32> +; AVX2-NEXT: [[TMP48:%.*]] = icmp sgt <16 x i32> [[TMP2]], [[RDX_SHUF]] +; AVX2-NEXT: [[BIN_RDX:%.*]] = select <16 x i1> [[TMP48]], <16 x i32> [[TMP2]], <16 x i32> [[RDX_SHUF]] +; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[BIN_RDX]], <16 x i32> undef, <16 x i32> +; AVX2-NEXT: [[TMP49:%.*]] = icmp sgt <16 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX2-NEXT: [[BIN_RDX2:%.*]] = select <16 x i1> [[TMP49]], <16 x i32> [[BIN_RDX]], <16 x i32> [[RDX_SHUF1]] +; AVX2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x i32> [[BIN_RDX2]], <16 x i32> undef, <16 x i32> +; AVX2-NEXT: [[TMP50:%.*]] = icmp sgt <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX2-NEXT: [[BIN_RDX4:%.*]] = select <16 x i1> [[TMP50]], <16 x i32> [[BIN_RDX2]], <16 x i32> [[RDX_SHUF3]] +; AVX2-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x i32> [[BIN_RDX4]], <16 x i32> undef, <16 x i32> +; AVX2-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]] +; AVX2-NEXT: [[BIN_RDX6:%.*]] = select <16 x i1> [[TMP51]], <16 x i32> [[BIN_RDX4]], <16 x i32> [[RDX_SHUF5]] +; AVX2-NEXT: [[TMP52:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0 +; AVX2: ret i32 [[TMP52]] ; ; SKX-LABEL: @maxi16( -; SKX-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; SKX-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; SKX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; SKX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; SKX-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8 -; SKX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]] -; SKX-NEXT: [[TMP8:%.*]] = 
select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]] -; SKX-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4 -; SKX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] -; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] -; SKX-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 -; SKX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] -; SKX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; SKX-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 -; SKX-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -; SKX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]] -; SKX-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; SKX-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -; SKX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]] -; SKX-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; SKX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]] -; SKX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]] -; SKX-NEXT: [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16 -; SKX-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]] -; SKX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]] -; SKX-NEXT: [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4 -; SKX-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]] -; SKX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]] -; SKX-NEXT: [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8 -; SKX-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]] -; SKX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]] -; SKX-NEXT: [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4 -; SKX-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]] -; SKX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]] -; SKX-NEXT: [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16 -; SKX-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -; SKX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]] -; SKX-NEXT: [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4 -; SKX-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] -; SKX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]] -; SKX-NEXT: [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8 -; SKX-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] -; SKX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]] -; SKX-NEXT: [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4 -; SKX-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] -; SKX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]] -; SKX-NEXT: ret i32 [[TMP47]] +; SKX-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x 
i32>* bitcast ([32 x i32]* @arr to <16 x i32>*), align 16 +; SKX: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> undef, <16 x i32> +; SKX-NEXT: [[TMP48:%.*]] = icmp sgt <16 x i32> [[TMP2]], [[RDX_SHUF]] +; SKX-NEXT: [[BIN_RDX:%.*]] = select <16 x i1> [[TMP48]], <16 x i32> [[TMP2]], <16 x i32> [[RDX_SHUF]] +; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[BIN_RDX]], <16 x i32> undef, <16 x i32> +; SKX-NEXT: [[TMP49:%.*]] = icmp sgt <16 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; SKX-NEXT: [[BIN_RDX2:%.*]] = select <16 x i1> [[TMP49]], <16 x i32> [[BIN_RDX]], <16 x i32> [[RDX_SHUF1]] +; SKX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x i32> [[BIN_RDX2]], <16 x i32> undef, <16 x i32> +; SKX-NEXT: [[TMP50:%.*]] = icmp sgt <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; SKX-NEXT: [[BIN_RDX4:%.*]] = select <16 x i1> [[TMP50]], <16 x i32> [[BIN_RDX2]], <16 x i32> [[RDX_SHUF3]] +; SKX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x i32> [[BIN_RDX4]], <16 x i32> undef, <16 x i32> +; SKX-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]] +; SKX-NEXT: [[BIN_RDX6:%.*]] = select <16 x i1> [[TMP51]], <16 x i32> [[BIN_RDX4]], <16 x i32> [[RDX_SHUF5]] +; SKX-NEXT: [[TMP52:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0 +; SKX: ret i32 [[TMP52]] ; %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 @@ -381,392 +252,84 @@ define i32 @maxi32(i32) { ; CHECK-LABEL: @maxi32( -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8 -; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4 -; CHECK-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 -; CHECK-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 -; CHECK-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; CHECK-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]] -; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; CHECK-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]] -; 
CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16 -; CHECK-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]] -; CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4 -; CHECK-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]] -; CHECK-NEXT: [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8 -; CHECK-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]] -; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]] -; CHECK-NEXT: [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4 -; CHECK-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]] -; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]] -; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16 -; CHECK-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]] -; CHECK-NEXT: [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4 -; CHECK-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] -; CHECK-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]] -; CHECK-NEXT: [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8 -; CHECK-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] -; CHECK-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]] -; CHECK-NEXT: [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4 -; CHECK-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] -; CHECK-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]] -; CHECK-NEXT: [[TMP48:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 16), align 16 -; CHECK-NEXT: [[TMP49:%.*]] = icmp sgt i32 [[TMP47]], [[TMP48]] -; CHECK-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[TMP47]], i32 [[TMP48]] -; CHECK-NEXT: [[TMP51:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 17), align 4 -; CHECK-NEXT: [[TMP52:%.*]] = icmp sgt i32 [[TMP50]], [[TMP51]] -; CHECK-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], i32 [[TMP50]], i32 [[TMP51]] -; CHECK-NEXT: [[TMP54:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 18), align 8 -; CHECK-NEXT: [[TMP55:%.*]] = icmp sgt i32 [[TMP53]], [[TMP54]] -; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 [[TMP53]], i32 [[TMP54]] -; CHECK-NEXT: [[TMP57:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 19), align 4 -; CHECK-NEXT: [[TMP58:%.*]] = icmp sgt i32 [[TMP56]], [[TMP57]] -; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], i32 [[TMP56]], i32 [[TMP57]] -; CHECK-NEXT: [[TMP60:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 20), align 16 -; CHECK-NEXT: [[TMP61:%.*]] = icmp sgt i32 [[TMP59]], [[TMP60]] -; CHECK-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP59]], i32 [[TMP60]] -; CHECK-NEXT: [[TMP63:%.*]] = load i32, i32* 
getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 21), align 4 -; CHECK-NEXT: [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], [[TMP63]] -; CHECK-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], i32 [[TMP62]], i32 [[TMP63]] -; CHECK-NEXT: [[TMP66:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 22), align 8 -; CHECK-NEXT: [[TMP67:%.*]] = icmp sgt i32 [[TMP65]], [[TMP66]] -; CHECK-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], i32 [[TMP65]], i32 [[TMP66]] -; CHECK-NEXT: [[TMP69:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 23), align 4 -; CHECK-NEXT: [[TMP70:%.*]] = icmp sgt i32 [[TMP68]], [[TMP69]] -; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP68]], i32 [[TMP69]] -; CHECK-NEXT: [[TMP72:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 24), align 16 -; CHECK-NEXT: [[TMP73:%.*]] = icmp sgt i32 [[TMP71]], [[TMP72]] -; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], i32 [[TMP71]], i32 [[TMP72]] -; CHECK-NEXT: [[TMP75:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 25), align 4 -; CHECK-NEXT: [[TMP76:%.*]] = icmp sgt i32 [[TMP74]], [[TMP75]] -; CHECK-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], i32 [[TMP74]], i32 [[TMP75]] -; CHECK-NEXT: [[TMP78:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 26), align 8 -; CHECK-NEXT: [[TMP79:%.*]] = icmp sgt i32 [[TMP77]], [[TMP78]] -; CHECK-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], i32 [[TMP77]], i32 [[TMP78]] -; CHECK-NEXT: [[TMP81:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 27), align 4 -; CHECK-NEXT: [[TMP82:%.*]] = icmp sgt i32 [[TMP80]], [[TMP81]] -; CHECK-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], i32 [[TMP80]], i32 [[TMP81]] -; CHECK-NEXT: [[TMP84:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 28), align 16 -; CHECK-NEXT: [[TMP85:%.*]] = icmp sgt i32 [[TMP83]], [[TMP84]] -; CHECK-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], i32 [[TMP83]], i32 [[TMP84]] -; CHECK-NEXT: [[TMP87:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 29), align 4 -; CHECK-NEXT: [[TMP88:%.*]] = icmp sgt i32 [[TMP86]], [[TMP87]] -; CHECK-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], i32 [[TMP86]], i32 [[TMP87]] -; CHECK-NEXT: [[TMP90:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 30), align 8 -; CHECK-NEXT: [[TMP91:%.*]] = icmp sgt i32 [[TMP89]], [[TMP90]] -; CHECK-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], i32 [[TMP89]], i32 [[TMP90]] -; CHECK-NEXT: [[TMP93:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 31), align 4 -; CHECK-NEXT: [[TMP94:%.*]] = icmp sgt i32 [[TMP92]], [[TMP93]] -; CHECK-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], i32 [[TMP92]], i32 [[TMP93]] -; CHECK-NEXT: ret i32 [[TMP95]] +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16 +; CHECK: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32> +; CHECK-NEXT: [[TMP96:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]] +; CHECK-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[BIN_RDX]], <32 x i32> undef, <32 x i32> +; CHECK-NEXT: [[TMP97:%.*]] = icmp sgt <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; CHECK-NEXT: [[BIN_RDX2:%.*]] 
= select <32 x i1> [[TMP97]], <32 x i32> [[BIN_RDX]], <32 x i32> [[RDX_SHUF1]] +; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x i32> [[BIN_RDX2]], <32 x i32> undef, <32 x i32> +; CHECK-NEXT: [[TMP98:%.*]] = icmp sgt <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; CHECK-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x i32> [[BIN_RDX2]], <32 x i32> [[RDX_SHUF3]] +; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x i32> [[BIN_RDX4]], <32 x i32> undef, <32 x i32> +; CHECK-NEXT: [[TMP99:%.*]] = icmp sgt <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]] +; CHECK-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x i32> [[BIN_RDX4]], <32 x i32> [[RDX_SHUF5]] +; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[BIN_RDX6]], <32 x i32> undef, <32 x i32> +; CHECK-NEXT: [[TMP100:%.*]] = icmp sgt <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]] +; CHECK-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x i32> [[BIN_RDX6]], <32 x i32> [[RDX_SHUF7]] +; CHECK-NEXT: [[TMP101:%.*]] = extractelement <32 x i32> [[BIN_RDX8]], i32 0 +; CHECK: ret i32 [[TMP101]] ; ; AVX-LABEL: @maxi32( -; AVX-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; AVX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8 -; AVX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]] -; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]] -; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4 -; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] -; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] -; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 -; AVX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] -; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 -; AVX-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -; AVX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]] -; AVX-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -; AVX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]] -; AVX-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; AVX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]] -; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]] -; AVX-NEXT: [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16 -; AVX-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]] -; AVX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]] -; AVX-NEXT: [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4 -; AVX-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]] -; AVX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 
[[TMP27]] -; AVX-NEXT: [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8 -; AVX-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]] -; AVX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]] -; AVX-NEXT: [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4 -; AVX-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]] -; AVX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]] -; AVX-NEXT: [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16 -; AVX-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -; AVX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]] -; AVX-NEXT: [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4 -; AVX-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] -; AVX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]] -; AVX-NEXT: [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8 -; AVX-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] -; AVX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]] -; AVX-NEXT: [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4 -; AVX-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] -; AVX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]] -; AVX-NEXT: [[TMP48:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 16), align 16 -; AVX-NEXT: [[TMP49:%.*]] = icmp sgt i32 [[TMP47]], [[TMP48]] -; AVX-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[TMP47]], i32 [[TMP48]] -; AVX-NEXT: [[TMP51:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 17), align 4 -; AVX-NEXT: [[TMP52:%.*]] = icmp sgt i32 [[TMP50]], [[TMP51]] -; AVX-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], i32 [[TMP50]], i32 [[TMP51]] -; AVX-NEXT: [[TMP54:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 18), align 8 -; AVX-NEXT: [[TMP55:%.*]] = icmp sgt i32 [[TMP53]], [[TMP54]] -; AVX-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 [[TMP53]], i32 [[TMP54]] -; AVX-NEXT: [[TMP57:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 19), align 4 -; AVX-NEXT: [[TMP58:%.*]] = icmp sgt i32 [[TMP56]], [[TMP57]] -; AVX-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], i32 [[TMP56]], i32 [[TMP57]] -; AVX-NEXT: [[TMP60:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 20), align 16 -; AVX-NEXT: [[TMP61:%.*]] = icmp sgt i32 [[TMP59]], [[TMP60]] -; AVX-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP59]], i32 [[TMP60]] -; AVX-NEXT: [[TMP63:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 21), align 4 -; AVX-NEXT: [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], [[TMP63]] -; AVX-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], i32 [[TMP62]], i32 [[TMP63]] -; AVX-NEXT: [[TMP66:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 22), align 8 -; AVX-NEXT: [[TMP67:%.*]] = icmp sgt i32 [[TMP65]], [[TMP66]] -; AVX-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], i32 [[TMP65]], i32 [[TMP66]] -; AVX-NEXT: [[TMP69:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 
23), align 4 -; AVX-NEXT: [[TMP70:%.*]] = icmp sgt i32 [[TMP68]], [[TMP69]] -; AVX-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP68]], i32 [[TMP69]] -; AVX-NEXT: [[TMP72:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 24), align 16 -; AVX-NEXT: [[TMP73:%.*]] = icmp sgt i32 [[TMP71]], [[TMP72]] -; AVX-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], i32 [[TMP71]], i32 [[TMP72]] -; AVX-NEXT: [[TMP75:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 25), align 4 -; AVX-NEXT: [[TMP76:%.*]] = icmp sgt i32 [[TMP74]], [[TMP75]] -; AVX-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], i32 [[TMP74]], i32 [[TMP75]] -; AVX-NEXT: [[TMP78:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 26), align 8 -; AVX-NEXT: [[TMP79:%.*]] = icmp sgt i32 [[TMP77]], [[TMP78]] -; AVX-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], i32 [[TMP77]], i32 [[TMP78]] -; AVX-NEXT: [[TMP81:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 27), align 4 -; AVX-NEXT: [[TMP82:%.*]] = icmp sgt i32 [[TMP80]], [[TMP81]] -; AVX-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], i32 [[TMP80]], i32 [[TMP81]] -; AVX-NEXT: [[TMP84:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 28), align 16 -; AVX-NEXT: [[TMP85:%.*]] = icmp sgt i32 [[TMP83]], [[TMP84]] -; AVX-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], i32 [[TMP83]], i32 [[TMP84]] -; AVX-NEXT: [[TMP87:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 29), align 4 -; AVX-NEXT: [[TMP88:%.*]] = icmp sgt i32 [[TMP86]], [[TMP87]] -; AVX-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], i32 [[TMP86]], i32 [[TMP87]] -; AVX-NEXT: [[TMP90:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 30), align 8 -; AVX-NEXT: [[TMP91:%.*]] = icmp sgt i32 [[TMP89]], [[TMP90]] -; AVX-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], i32 [[TMP89]], i32 [[TMP90]] -; AVX-NEXT: [[TMP93:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 31), align 4 -; AVX-NEXT: [[TMP94:%.*]] = icmp sgt i32 [[TMP92]], [[TMP93]] -; AVX-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], i32 [[TMP92]], i32 [[TMP93]] -; AVX-NEXT: ret i32 [[TMP95]] +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16 +; AVX: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32> +; AVX-NEXT: [[TMP96:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]] +; AVX-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[BIN_RDX]], <32 x i32> undef, <32 x i32> +; AVX-NEXT: [[TMP97:%.*]] = icmp sgt <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[BIN_RDX2:%.*]] = select <32 x i1> [[TMP97]], <32 x i32> [[BIN_RDX]], <32 x i32> [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x i32> [[BIN_RDX2]], <32 x i32> undef, <32 x i32> +; AVX-NEXT: [[TMP98:%.*]] = icmp sgt <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x i32> [[BIN_RDX2]], <32 x i32> [[RDX_SHUF3]] +; AVX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x i32> [[BIN_RDX4]], <32 x i32> undef, <32 x i32> +; AVX-NEXT: [[TMP99:%.*]] = icmp sgt <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]] +; AVX-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x i32> [[BIN_RDX4]], <32 x i32> [[RDX_SHUF5]] +; AVX-NEXT: 
[[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[BIN_RDX6]], <32 x i32> undef, <32 x i32> +; AVX-NEXT: [[TMP100:%.*]] = icmp sgt <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]] +; AVX-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x i32> [[BIN_RDX6]], <32 x i32> [[RDX_SHUF7]] +; AVX-NEXT: [[TMP101:%.*]] = extractelement <32 x i32> [[BIN_RDX8]], i32 0 +; AVX: ret i32 [[TMP101]] ; ; AVX2-LABEL: @maxi32( -; AVX2-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; AVX2-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8 -; AVX2-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]] -; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]] -; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4 -; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] -; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] -; AVX2-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 -; AVX2-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] -; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 -; AVX2-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -; AVX2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]] -; AVX2-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX2-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -; AVX2-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]] -; AVX2-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; AVX2-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]] -; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]] -; AVX2-NEXT: [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16 -; AVX2-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]] -; AVX2-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]] -; AVX2-NEXT: [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4 -; AVX2-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]] -; AVX2-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]] -; AVX2-NEXT: [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8 -; AVX2-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]] -; AVX2-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]] -; AVX2-NEXT: [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4 -; AVX2-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]] -; AVX2-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]] -; AVX2-NEXT: [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), 
align 16 -; AVX2-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -; AVX2-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]] -; AVX2-NEXT: [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4 -; AVX2-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] -; AVX2-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]] -; AVX2-NEXT: [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8 -; AVX2-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] -; AVX2-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]] -; AVX2-NEXT: [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4 -; AVX2-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] -; AVX2-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]] -; AVX2-NEXT: [[TMP48:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 16), align 16 -; AVX2-NEXT: [[TMP49:%.*]] = icmp sgt i32 [[TMP47]], [[TMP48]] -; AVX2-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[TMP47]], i32 [[TMP48]] -; AVX2-NEXT: [[TMP51:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 17), align 4 -; AVX2-NEXT: [[TMP52:%.*]] = icmp sgt i32 [[TMP50]], [[TMP51]] -; AVX2-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], i32 [[TMP50]], i32 [[TMP51]] -; AVX2-NEXT: [[TMP54:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 18), align 8 -; AVX2-NEXT: [[TMP55:%.*]] = icmp sgt i32 [[TMP53]], [[TMP54]] -; AVX2-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 [[TMP53]], i32 [[TMP54]] -; AVX2-NEXT: [[TMP57:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 19), align 4 -; AVX2-NEXT: [[TMP58:%.*]] = icmp sgt i32 [[TMP56]], [[TMP57]] -; AVX2-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], i32 [[TMP56]], i32 [[TMP57]] -; AVX2-NEXT: [[TMP60:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 20), align 16 -; AVX2-NEXT: [[TMP61:%.*]] = icmp sgt i32 [[TMP59]], [[TMP60]] -; AVX2-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP59]], i32 [[TMP60]] -; AVX2-NEXT: [[TMP63:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 21), align 4 -; AVX2-NEXT: [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], [[TMP63]] -; AVX2-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], i32 [[TMP62]], i32 [[TMP63]] -; AVX2-NEXT: [[TMP66:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 22), align 8 -; AVX2-NEXT: [[TMP67:%.*]] = icmp sgt i32 [[TMP65]], [[TMP66]] -; AVX2-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], i32 [[TMP65]], i32 [[TMP66]] -; AVX2-NEXT: [[TMP69:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 23), align 4 -; AVX2-NEXT: [[TMP70:%.*]] = icmp sgt i32 [[TMP68]], [[TMP69]] -; AVX2-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP68]], i32 [[TMP69]] -; AVX2-NEXT: [[TMP72:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 24), align 16 -; AVX2-NEXT: [[TMP73:%.*]] = icmp sgt i32 [[TMP71]], [[TMP72]] -; AVX2-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], i32 [[TMP71]], i32 [[TMP72]] -; AVX2-NEXT: [[TMP75:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 25), align 4 -; AVX2-NEXT: [[TMP76:%.*]] = icmp sgt i32 [[TMP74]], [[TMP75]] -; 
AVX2-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], i32 [[TMP74]], i32 [[TMP75]] -; AVX2-NEXT: [[TMP78:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 26), align 8 -; AVX2-NEXT: [[TMP79:%.*]] = icmp sgt i32 [[TMP77]], [[TMP78]] -; AVX2-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], i32 [[TMP77]], i32 [[TMP78]] -; AVX2-NEXT: [[TMP81:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 27), align 4 -; AVX2-NEXT: [[TMP82:%.*]] = icmp sgt i32 [[TMP80]], [[TMP81]] -; AVX2-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], i32 [[TMP80]], i32 [[TMP81]] -; AVX2-NEXT: [[TMP84:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 28), align 16 -; AVX2-NEXT: [[TMP85:%.*]] = icmp sgt i32 [[TMP83]], [[TMP84]] -; AVX2-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], i32 [[TMP83]], i32 [[TMP84]] -; AVX2-NEXT: [[TMP87:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 29), align 4 -; AVX2-NEXT: [[TMP88:%.*]] = icmp sgt i32 [[TMP86]], [[TMP87]] -; AVX2-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], i32 [[TMP86]], i32 [[TMP87]] -; AVX2-NEXT: [[TMP90:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 30), align 8 -; AVX2-NEXT: [[TMP91:%.*]] = icmp sgt i32 [[TMP89]], [[TMP90]] -; AVX2-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], i32 [[TMP89]], i32 [[TMP90]] -; AVX2-NEXT: [[TMP93:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 31), align 4 -; AVX2-NEXT: [[TMP94:%.*]] = icmp sgt i32 [[TMP92]], [[TMP93]] -; AVX2-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], i32 [[TMP92]], i32 [[TMP93]] -; AVX2-NEXT: ret i32 [[TMP95]] +; AVX2-NEXT: [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16 +; AVX2: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32> +; AVX2-NEXT: [[TMP96:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]] +; AVX2-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]] +; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[BIN_RDX]], <32 x i32> undef, <32 x i32> +; AVX2-NEXT: [[TMP97:%.*]] = icmp sgt <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX2-NEXT: [[BIN_RDX2:%.*]] = select <32 x i1> [[TMP97]], <32 x i32> [[BIN_RDX]], <32 x i32> [[RDX_SHUF1]] +; AVX2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x i32> [[BIN_RDX2]], <32 x i32> undef, <32 x i32> +; AVX2-NEXT: [[TMP98:%.*]] = icmp sgt <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX2-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x i32> [[BIN_RDX2]], <32 x i32> [[RDX_SHUF3]] +; AVX2-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x i32> [[BIN_RDX4]], <32 x i32> undef, <32 x i32> +; AVX2-NEXT: [[TMP99:%.*]] = icmp sgt <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]] +; AVX2-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x i32> [[BIN_RDX4]], <32 x i32> [[RDX_SHUF5]] +; AVX2-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[BIN_RDX6]], <32 x i32> undef, <32 x i32> +; AVX2-NEXT: [[TMP100:%.*]] = icmp sgt <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]] +; AVX2-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x i32> [[BIN_RDX6]], <32 x i32> [[RDX_SHUF7]] +; AVX2-NEXT: [[TMP101:%.*]] = extractelement <32 x i32> [[BIN_RDX8]], i32 0 +; AVX2: ret i32 [[TMP101]] ; ; SKX-LABEL: @maxi32( -; SKX-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; SKX-NEXT: [[TMP3:%.*]] = load i32, i32* 
getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; SKX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; SKX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; SKX-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8 -; SKX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]] -; SKX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]] -; SKX-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4 -; SKX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] -; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] -; SKX-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 -; SKX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] -; SKX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; SKX-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 -; SKX-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -; SKX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]] -; SKX-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; SKX-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -; SKX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]] -; SKX-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; SKX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]] -; SKX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]] -; SKX-NEXT: [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16 -; SKX-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]] -; SKX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]] -; SKX-NEXT: [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4 -; SKX-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]] -; SKX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]] -; SKX-NEXT: [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8 -; SKX-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]] -; SKX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]] -; SKX-NEXT: [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4 -; SKX-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]] -; SKX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]] -; SKX-NEXT: [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16 -; SKX-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -; SKX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]] -; SKX-NEXT: [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4 -; SKX-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] -; SKX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]] -; SKX-NEXT: [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8 -; SKX-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] -; 
SKX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]] -; SKX-NEXT: [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4 -; SKX-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] -; SKX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]] -; SKX-NEXT: [[TMP48:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 16), align 16 -; SKX-NEXT: [[TMP49:%.*]] = icmp sgt i32 [[TMP47]], [[TMP48]] -; SKX-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[TMP47]], i32 [[TMP48]] -; SKX-NEXT: [[TMP51:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 17), align 4 -; SKX-NEXT: [[TMP52:%.*]] = icmp sgt i32 [[TMP50]], [[TMP51]] -; SKX-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], i32 [[TMP50]], i32 [[TMP51]] -; SKX-NEXT: [[TMP54:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 18), align 8 -; SKX-NEXT: [[TMP55:%.*]] = icmp sgt i32 [[TMP53]], [[TMP54]] -; SKX-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 [[TMP53]], i32 [[TMP54]] -; SKX-NEXT: [[TMP57:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 19), align 4 -; SKX-NEXT: [[TMP58:%.*]] = icmp sgt i32 [[TMP56]], [[TMP57]] -; SKX-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], i32 [[TMP56]], i32 [[TMP57]] -; SKX-NEXT: [[TMP60:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 20), align 16 -; SKX-NEXT: [[TMP61:%.*]] = icmp sgt i32 [[TMP59]], [[TMP60]] -; SKX-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP59]], i32 [[TMP60]] -; SKX-NEXT: [[TMP63:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 21), align 4 -; SKX-NEXT: [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], [[TMP63]] -; SKX-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], i32 [[TMP62]], i32 [[TMP63]] -; SKX-NEXT: [[TMP66:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 22), align 8 -; SKX-NEXT: [[TMP67:%.*]] = icmp sgt i32 [[TMP65]], [[TMP66]] -; SKX-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], i32 [[TMP65]], i32 [[TMP66]] -; SKX-NEXT: [[TMP69:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 23), align 4 -; SKX-NEXT: [[TMP70:%.*]] = icmp sgt i32 [[TMP68]], [[TMP69]] -; SKX-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP68]], i32 [[TMP69]] -; SKX-NEXT: [[TMP72:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 24), align 16 -; SKX-NEXT: [[TMP73:%.*]] = icmp sgt i32 [[TMP71]], [[TMP72]] -; SKX-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], i32 [[TMP71]], i32 [[TMP72]] -; SKX-NEXT: [[TMP75:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 25), align 4 -; SKX-NEXT: [[TMP76:%.*]] = icmp sgt i32 [[TMP74]], [[TMP75]] -; SKX-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], i32 [[TMP74]], i32 [[TMP75]] -; SKX-NEXT: [[TMP78:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 26), align 8 -; SKX-NEXT: [[TMP79:%.*]] = icmp sgt i32 [[TMP77]], [[TMP78]] -; SKX-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], i32 [[TMP77]], i32 [[TMP78]] -; SKX-NEXT: [[TMP81:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 27), align 4 -; SKX-NEXT: [[TMP82:%.*]] = icmp sgt i32 [[TMP80]], [[TMP81]] -; SKX-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], i32 [[TMP80]], i32 [[TMP81]] -; SKX-NEXT: [[TMP84:%.*]] = load i32, i32* 
getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 28), align 16 -; SKX-NEXT: [[TMP85:%.*]] = icmp sgt i32 [[TMP83]], [[TMP84]] -; SKX-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], i32 [[TMP83]], i32 [[TMP84]] -; SKX-NEXT: [[TMP87:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 29), align 4 -; SKX-NEXT: [[TMP88:%.*]] = icmp sgt i32 [[TMP86]], [[TMP87]] -; SKX-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], i32 [[TMP86]], i32 [[TMP87]] -; SKX-NEXT: [[TMP90:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 30), align 8 -; SKX-NEXT: [[TMP91:%.*]] = icmp sgt i32 [[TMP89]], [[TMP90]] -; SKX-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], i32 [[TMP89]], i32 [[TMP90]] -; SKX-NEXT: [[TMP93:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 31), align 4 -; SKX-NEXT: [[TMP94:%.*]] = icmp sgt i32 [[TMP92]], [[TMP93]] -; SKX-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], i32 [[TMP92]], i32 [[TMP93]] -; SKX-NEXT: ret i32 [[TMP95]] +; SKX-NEXT: [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16 +; SKX: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32> +; SKX-NEXT: [[TMP96:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]] +; SKX-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]] +; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[BIN_RDX]], <32 x i32> undef, <32 x i32> +; SKX-NEXT: [[TMP97:%.*]] = icmp sgt <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; SKX-NEXT: [[BIN_RDX2:%.*]] = select <32 x i1> [[TMP97]], <32 x i32> [[BIN_RDX]], <32 x i32> [[RDX_SHUF1]] +; SKX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x i32> [[BIN_RDX2]], <32 x i32> undef, <32 x i32> +; SKX-NEXT: [[TMP98:%.*]] = icmp sgt <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; SKX-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x i32> [[BIN_RDX2]], <32 x i32> [[RDX_SHUF3]] +; SKX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x i32> [[BIN_RDX4]], <32 x i32> undef, <32 x i32> +; SKX-NEXT: [[TMP99:%.*]] = icmp sgt <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]] +; SKX-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x i32> [[BIN_RDX4]], <32 x i32> [[RDX_SHUF5]] +; SKX-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[BIN_RDX6]], <32 x i32> undef, <32 x i32> +; SKX-NEXT: [[TMP100:%.*]] = icmp sgt <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]] +; SKX-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x i32> [[BIN_RDX6]], <32 x i32> [[RDX_SHUF7]] +; SKX-NEXT: [[TMP101:%.*]] = extractelement <32 x i32> [[BIN_RDX8]], i32 0 +; SKX: ret i32 [[TMP101]] ; %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 @@ -892,79 +455,46 @@ ; CHECK-NEXT: ret float [[TMP23]] ; ; AVX-LABEL: @maxf8( -; AVX-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 -; AVX-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 -; AVX-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] -; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 -; AVX-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] -; 
AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] -; AVX-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 -; AVX-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] -; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]] -; AVX-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 -; AVX-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] -; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]] -; AVX-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 -; AVX-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]] -; AVX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] -; AVX-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 -; AVX-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] -; AVX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] -; AVX-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 -; AVX-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] -; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] -; AVX-NEXT: ret float [[TMP23]] +; AVX-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr1 to <8 x float>*), align 16 +; AVX: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> undef, <8 x i32> +; AVX-NEXT: [[TMP24:%.*]] = fcmp fast ogt <8 x float> [[TMP2]], [[RDX_SHUF]] +; AVX-NEXT: [[BIN_RDX:%.*]] = select <8 x i1> [[TMP24]], <8 x float> [[TMP2]], <8 x float> [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> +; AVX-NEXT: [[TMP25:%.*]] = fcmp fast ogt <8 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[BIN_RDX2:%.*]] = select <8 x i1> [[TMP25]], <8 x float> [[BIN_RDX]], <8 x float> [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> +; AVX-NEXT: [[TMP26:%.*]] = fcmp fast ogt <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[BIN_RDX4:%.*]] = select <8 x i1> [[TMP26]], <8 x float> [[BIN_RDX2]], <8 x float> [[RDX_SHUF3]] +; AVX-NEXT: [[TMP27:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 +; AVX: ret float [[TMP27]] ; ; AVX2-LABEL: @maxf8( -; AVX2-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 -; AVX2-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 -; AVX2-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] -; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] -; AVX2-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 -; AVX2-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] -; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] -; AVX2-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 -; AVX2-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] -; AVX2-NEXT: [[TMP11:%.*]] = select i1 
[[TMP10]], float [[TMP8]], float [[TMP9]] -; AVX2-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 -; AVX2-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] -; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]] -; AVX2-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 -; AVX2-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]] -; AVX2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] -; AVX2-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 -; AVX2-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] -; AVX2-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] -; AVX2-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 -; AVX2-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] -; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] -; AVX2-NEXT: ret float [[TMP23]] +; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr1 to <8 x float>*), align 16 +; AVX2: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> undef, <8 x i32> +; AVX2-NEXT: [[TMP24:%.*]] = fcmp fast ogt <8 x float> [[TMP2]], [[RDX_SHUF]] +; AVX2-NEXT: [[BIN_RDX:%.*]] = select <8 x i1> [[TMP24]], <8 x float> [[TMP2]], <8 x float> [[RDX_SHUF]] +; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> +; AVX2-NEXT: [[TMP25:%.*]] = fcmp fast ogt <8 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX2-NEXT: [[BIN_RDX2:%.*]] = select <8 x i1> [[TMP25]], <8 x float> [[BIN_RDX]], <8 x float> [[RDX_SHUF1]] +; AVX2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> +; AVX2-NEXT: [[TMP26:%.*]] = fcmp fast ogt <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX2-NEXT: [[BIN_RDX4:%.*]] = select <8 x i1> [[TMP26]], <8 x float> [[BIN_RDX2]], <8 x float> [[RDX_SHUF3]] +; AVX2-NEXT: [[TMP27:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 +; AVX2: ret float [[TMP27]] ; ; SKX-LABEL: @maxf8( -; SKX-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 -; SKX-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 -; SKX-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] -; SKX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] -; SKX-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 -; SKX-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] -; SKX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] -; SKX-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 -; SKX-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] -; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]] -; SKX-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 -; SKX-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] -; SKX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float 
[[TMP11]], float [[TMP12]] -; SKX-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 -; SKX-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]] -; SKX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] -; SKX-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 -; SKX-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] -; SKX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] -; SKX-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 -; SKX-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] -; SKX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] -; SKX-NEXT: ret float [[TMP23]] +; SKX-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr1 to <8 x float>*), align 16 +; SKX: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> undef, <8 x i32> +; SKX-NEXT: [[TMP24:%.*]] = fcmp fast ogt <8 x float> [[TMP2]], [[RDX_SHUF]] +; SKX-NEXT: [[BIN_RDX:%.*]] = select <8 x i1> [[TMP24]], <8 x float> [[TMP2]], <8 x float> [[RDX_SHUF]] +; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> +; SKX-NEXT: [[TMP25:%.*]] = fcmp fast ogt <8 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; SKX-NEXT: [[BIN_RDX2:%.*]] = select <8 x i1> [[TMP25]], <8 x float> [[BIN_RDX]], <8 x float> [[RDX_SHUF1]] +; SKX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> +; SKX-NEXT: [[TMP26:%.*]] = fcmp fast ogt <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; SKX-NEXT: [[BIN_RDX4:%.*]] = select <8 x i1> [[TMP26]], <8 x float> [[BIN_RDX2]], <8 x float> [[RDX_SHUF3]] +; SKX-NEXT: [[TMP27:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 +; SKX: ret float [[TMP27]] ; %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 @@ -1042,151 +572,55 @@ ; CHECK-NEXT: ret float [[TMP47]] ; ; AVX-LABEL: @maxf16( -; AVX-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 -; AVX-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 -; AVX-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] -; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 -; AVX-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] -; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] -; AVX-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 -; AVX-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] -; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]] -; AVX-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 -; AVX-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] -; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]] -; AVX-NEXT: 
[[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 -; AVX-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]] -; AVX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] -; AVX-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 -; AVX-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] -; AVX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] -; AVX-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 -; AVX-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] -; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] -; AVX-NEXT: [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16 -; AVX-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]] -; AVX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]] -; AVX-NEXT: [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4 -; AVX-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]] -; AVX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]] -; AVX-NEXT: [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8 -; AVX-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]] -; AVX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]] -; AVX-NEXT: [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4 -; AVX-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]] -; AVX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]] -; AVX-NEXT: [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16 -; AVX-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]] -; AVX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]] -; AVX-NEXT: [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4 -; AVX-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]] -; AVX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]] -; AVX-NEXT: [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8 -; AVX-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]] -; AVX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]] -; AVX-NEXT: [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4 -; AVX-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]] -; AVX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]] -; AVX-NEXT: ret float [[TMP47]] +; AVX-NEXT: [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr1 to <16 x float>*), align 16 +; AVX: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> undef, <16 x i32> +; AVX-NEXT: [[TMP48:%.*]] = fcmp fast ogt <16 x float> [[TMP2]], [[RDX_SHUF]] +; AVX-NEXT: [[BIN_RDX:%.*]] = select <16 x i1> [[TMP48]], <16 x float> [[TMP2]], <16 x float> [[RDX_SHUF]] +; AVX-NEXT: 
[[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> +; AVX-NEXT: [[TMP49:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[BIN_RDX2:%.*]] = select <16 x i1> [[TMP49]], <16 x float> [[BIN_RDX]], <16 x float> [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32> +; AVX-NEXT: [[TMP50:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[BIN_RDX4:%.*]] = select <16 x i1> [[TMP50]], <16 x float> [[BIN_RDX2]], <16 x float> [[RDX_SHUF3]] +; AVX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32> +; AVX-NEXT: [[TMP51:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX4]], [[RDX_SHUF5]] +; AVX-NEXT: [[BIN_RDX6:%.*]] = select <16 x i1> [[TMP51]], <16 x float> [[BIN_RDX4]], <16 x float> [[RDX_SHUF5]] +; AVX-NEXT: [[TMP52:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0 +; AVX: ret float [[TMP52]] ; ; AVX2-LABEL: @maxf16( -; AVX2-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 -; AVX2-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 -; AVX2-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] -; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] -; AVX2-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 -; AVX2-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] -; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] -; AVX2-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 -; AVX2-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] -; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]] -; AVX2-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 -; AVX2-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] -; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]] -; AVX2-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 -; AVX2-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]] -; AVX2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] -; AVX2-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 -; AVX2-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] -; AVX2-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] -; AVX2-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 -; AVX2-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] -; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] -; AVX2-NEXT: [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16 -; AVX2-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]] -; AVX2-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]] -; AVX2-NEXT: [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4 
-; AVX2-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]] -; AVX2-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]] -; AVX2-NEXT: [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8 -; AVX2-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]] -; AVX2-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]] -; AVX2-NEXT: [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4 -; AVX2-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]] -; AVX2-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]] -; AVX2-NEXT: [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16 -; AVX2-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]] -; AVX2-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]] -; AVX2-NEXT: [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4 -; AVX2-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]] -; AVX2-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]] -; AVX2-NEXT: [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8 -; AVX2-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]] -; AVX2-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]] -; AVX2-NEXT: [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4 -; AVX2-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]] -; AVX2-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]] -; AVX2-NEXT: ret float [[TMP47]] +; AVX2-NEXT: [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr1 to <16 x float>*), align 16 +; AVX2: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> undef, <16 x i32> +; AVX2-NEXT: [[TMP48:%.*]] = fcmp fast ogt <16 x float> [[TMP2]], [[RDX_SHUF]] +; AVX2-NEXT: [[BIN_RDX:%.*]] = select <16 x i1> [[TMP48]], <16 x float> [[TMP2]], <16 x float> [[RDX_SHUF]] +; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> +; AVX2-NEXT: [[TMP49:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX2-NEXT: [[BIN_RDX2:%.*]] = select <16 x i1> [[TMP49]], <16 x float> [[BIN_RDX]], <16 x float> [[RDX_SHUF1]] +; AVX2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32> +; AVX2-NEXT: [[TMP50:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX2-NEXT: [[BIN_RDX4:%.*]] = select <16 x i1> [[TMP50]], <16 x float> [[BIN_RDX2]], <16 x float> [[RDX_SHUF3]] +; AVX2-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32> +; AVX2-NEXT: [[TMP51:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX4]], [[RDX_SHUF5]] +; AVX2-NEXT: [[BIN_RDX6:%.*]] = select <16 x i1> [[TMP51]], <16 x float> [[BIN_RDX4]], <16 x float> [[RDX_SHUF5]] +; AVX2-NEXT: [[TMP52:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0 +; AVX2: ret float [[TMP52]] ; ; SKX-LABEL: @maxf16( -; SKX-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 -; SKX-NEXT: [[TMP3:%.*]] = load float, float* getelementptr 
inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 -; SKX-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] -; SKX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] -; SKX-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 -; SKX-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] -; SKX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] -; SKX-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 -; SKX-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] -; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]] -; SKX-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 -; SKX-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] -; SKX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]] -; SKX-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 -; SKX-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]] -; SKX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] -; SKX-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 -; SKX-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] -; SKX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] -; SKX-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 -; SKX-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] -; SKX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] -; SKX-NEXT: [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16 -; SKX-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]] -; SKX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]] -; SKX-NEXT: [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4 -; SKX-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]] -; SKX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]] -; SKX-NEXT: [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8 -; SKX-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]] -; SKX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]] -; SKX-NEXT: [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4 -; SKX-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]] -; SKX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]] -; SKX-NEXT: [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16 -; SKX-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]] -; SKX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]] -; SKX-NEXT: [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4 -; SKX-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]] -; SKX-NEXT: [[TMP41:%.*]] = 
select i1 [[TMP40]], float [[TMP38]], float [[TMP39]] -; SKX-NEXT: [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8 -; SKX-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]] -; SKX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]] -; SKX-NEXT: [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4 -; SKX-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]] -; SKX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]] -; SKX-NEXT: ret float [[TMP47]] +; SKX-NEXT: [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr1 to <16 x float>*), align 16 +; SKX: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> undef, <16 x i32> +; SKX-NEXT: [[TMP48:%.*]] = fcmp fast ogt <16 x float> [[TMP2]], [[RDX_SHUF]] +; SKX-NEXT: [[BIN_RDX:%.*]] = select <16 x i1> [[TMP48]], <16 x float> [[TMP2]], <16 x float> [[RDX_SHUF]] +; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> +; SKX-NEXT: [[TMP49:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; SKX-NEXT: [[BIN_RDX2:%.*]] = select <16 x i1> [[TMP49]], <16 x float> [[BIN_RDX]], <16 x float> [[RDX_SHUF1]] +; SKX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32> +; SKX-NEXT: [[TMP50:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; SKX-NEXT: [[BIN_RDX4:%.*]] = select <16 x i1> [[TMP50]], <16 x float> [[BIN_RDX2]], <16 x float> [[RDX_SHUF3]] +; SKX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32> +; SKX-NEXT: [[TMP51:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX4]], [[RDX_SHUF5]] +; SKX-NEXT: [[BIN_RDX6:%.*]] = select <16 x i1> [[TMP51]], <16 x float> [[BIN_RDX4]], <16 x float> [[RDX_SHUF5]] +; SKX-NEXT: [[TMP52:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0 +; SKX: ret float [[TMP52]] ; %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 @@ -1336,295 +770,64 @@ ; CHECK-NEXT: ret float [[TMP95]] ; ; AVX-LABEL: @maxf32( -; AVX-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 -; AVX-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 -; AVX-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] -; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 -; AVX-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] -; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] -; AVX-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 -; AVX-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] -; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]] -; AVX-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 -; AVX-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] -; AVX-NEXT: [[TMP14:%.*]] = 
select i1 [[TMP13]], float [[TMP11]], float [[TMP12]] -; AVX-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 -; AVX-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]] -; AVX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] -; AVX-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 -; AVX-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] -; AVX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] -; AVX-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 -; AVX-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] -; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] -; AVX-NEXT: [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16 -; AVX-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]] -; AVX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]] -; AVX-NEXT: [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4 -; AVX-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]] -; AVX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]] -; AVX-NEXT: [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8 -; AVX-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]] -; AVX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]] -; AVX-NEXT: [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4 -; AVX-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]] -; AVX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]] -; AVX-NEXT: [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16 -; AVX-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]] -; AVX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]] -; AVX-NEXT: [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4 -; AVX-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]] -; AVX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]] -; AVX-NEXT: [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8 -; AVX-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]] -; AVX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]] -; AVX-NEXT: [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4 -; AVX-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]] -; AVX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]] -; AVX-NEXT: [[TMP48:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 16), align 16 -; AVX-NEXT: [[TMP49:%.*]] = fcmp fast ogt float [[TMP47]], [[TMP48]] -; AVX-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], float [[TMP47]], float [[TMP48]] -; AVX-NEXT: [[TMP51:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x 
float]* @arr1, i64 0, i64 17), align 4 -; AVX-NEXT: [[TMP52:%.*]] = fcmp fast ogt float [[TMP50]], [[TMP51]] -; AVX-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], float [[TMP50]], float [[TMP51]] -; AVX-NEXT: [[TMP54:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 18), align 8 -; AVX-NEXT: [[TMP55:%.*]] = fcmp fast ogt float [[TMP53]], [[TMP54]] -; AVX-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], float [[TMP53]], float [[TMP54]] -; AVX-NEXT: [[TMP57:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 19), align 4 -; AVX-NEXT: [[TMP58:%.*]] = fcmp fast ogt float [[TMP56]], [[TMP57]] -; AVX-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], float [[TMP56]], float [[TMP57]] -; AVX-NEXT: [[TMP60:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 20), align 16 -; AVX-NEXT: [[TMP61:%.*]] = fcmp fast ogt float [[TMP59]], [[TMP60]] -; AVX-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], float [[TMP59]], float [[TMP60]] -; AVX-NEXT: [[TMP63:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 21), align 4 -; AVX-NEXT: [[TMP64:%.*]] = fcmp fast ogt float [[TMP62]], [[TMP63]] -; AVX-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], float [[TMP62]], float [[TMP63]] -; AVX-NEXT: [[TMP66:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 22), align 8 -; AVX-NEXT: [[TMP67:%.*]] = fcmp fast ogt float [[TMP65]], [[TMP66]] -; AVX-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], float [[TMP65]], float [[TMP66]] -; AVX-NEXT: [[TMP69:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 23), align 4 -; AVX-NEXT: [[TMP70:%.*]] = fcmp fast ogt float [[TMP68]], [[TMP69]] -; AVX-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], float [[TMP68]], float [[TMP69]] -; AVX-NEXT: [[TMP72:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 24), align 16 -; AVX-NEXT: [[TMP73:%.*]] = fcmp fast ogt float [[TMP71]], [[TMP72]] -; AVX-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], float [[TMP71]], float [[TMP72]] -; AVX-NEXT: [[TMP75:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 25), align 4 -; AVX-NEXT: [[TMP76:%.*]] = fcmp fast ogt float [[TMP74]], [[TMP75]] -; AVX-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], float [[TMP74]], float [[TMP75]] -; AVX-NEXT: [[TMP78:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 26), align 8 -; AVX-NEXT: [[TMP79:%.*]] = fcmp fast ogt float [[TMP77]], [[TMP78]] -; AVX-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], float [[TMP77]], float [[TMP78]] -; AVX-NEXT: [[TMP81:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 27), align 4 -; AVX-NEXT: [[TMP82:%.*]] = fcmp fast ogt float [[TMP80]], [[TMP81]] -; AVX-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], float [[TMP80]], float [[TMP81]] -; AVX-NEXT: [[TMP84:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 28), align 16 -; AVX-NEXT: [[TMP85:%.*]] = fcmp fast ogt float [[TMP83]], [[TMP84]] -; AVX-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], float [[TMP83]], float [[TMP84]] -; AVX-NEXT: [[TMP87:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 29), align 4 -; AVX-NEXT: [[TMP88:%.*]] = fcmp fast ogt float [[TMP86]], [[TMP87]] -; AVX-NEXT: [[TMP89:%.*]] = 
select i1 [[TMP88]], float [[TMP86]], float [[TMP87]] -; AVX-NEXT: [[TMP90:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 30), align 8 -; AVX-NEXT: [[TMP91:%.*]] = fcmp fast ogt float [[TMP89]], [[TMP90]] -; AVX-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], float [[TMP89]], float [[TMP90]] -; AVX-NEXT: [[TMP93:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 31), align 4 -; AVX-NEXT: [[TMP94:%.*]] = fcmp fast ogt float [[TMP92]], [[TMP93]] -; AVX-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], float [[TMP92]], float [[TMP93]] -; AVX-NEXT: ret float [[TMP95]] +; AVX-NEXT: [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16 +; AVX: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP2]], <32 x float> undef, <32 x i32> +; AVX-NEXT: [[TMP96:%.*]] = fcmp fast ogt <32 x float> [[TMP2]], [[RDX_SHUF]] +; AVX-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x float> [[TMP2]], <32 x float> [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> +; AVX-NEXT: [[TMP97:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[BIN_RDX2:%.*]] = select <32 x i1> [[TMP97]], <32 x float> [[BIN_RDX]], <32 x float> [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> +; AVX-NEXT: [[TMP98:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x float> [[BIN_RDX2]], <32 x float> [[RDX_SHUF3]] +; AVX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> +; AVX-NEXT: [[TMP99:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]] +; AVX-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x float> [[BIN_RDX4]], <32 x float> [[RDX_SHUF5]] +; AVX-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> +; AVX-NEXT: [[TMP100:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]] +; AVX-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x float> [[BIN_RDX6]], <32 x float> [[RDX_SHUF7]] +; AVX-NEXT: [[TMP101:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0 +; AVX: ret float [[TMP101]] ; ; AVX2-LABEL: @maxf32( -; AVX2-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 -; AVX2-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 -; AVX2-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] -; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] -; AVX2-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 -; AVX2-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] -; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] -; AVX2-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 -; AVX2-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] -; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]] -; AVX2-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 -; AVX2-NEXT: [[TMP13:%.*]] = fcmp fast ogt float 
[[TMP11]], [[TMP12]] -; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]] -; AVX2-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 -; AVX2-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]] -; AVX2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] -; AVX2-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 -; AVX2-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] -; AVX2-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] -; AVX2-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 -; AVX2-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] -; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] -; AVX2-NEXT: [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16 -; AVX2-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]] -; AVX2-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]] -; AVX2-NEXT: [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4 -; AVX2-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]] -; AVX2-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]] -; AVX2-NEXT: [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8 -; AVX2-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]] -; AVX2-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]] -; AVX2-NEXT: [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4 -; AVX2-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]] -; AVX2-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]] -; AVX2-NEXT: [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16 -; AVX2-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]] -; AVX2-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]] -; AVX2-NEXT: [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4 -; AVX2-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]] -; AVX2-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]] -; AVX2-NEXT: [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8 -; AVX2-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]] -; AVX2-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]] -; AVX2-NEXT: [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4 -; AVX2-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]] -; AVX2-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]] -; AVX2-NEXT: [[TMP48:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 16), align 16 -; AVX2-NEXT: [[TMP49:%.*]] = fcmp fast ogt float [[TMP47]], [[TMP48]] -; AVX2-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], float [[TMP47]], float [[TMP48]] -; 
AVX2-NEXT: [[TMP51:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 17), align 4 -; AVX2-NEXT: [[TMP52:%.*]] = fcmp fast ogt float [[TMP50]], [[TMP51]] -; AVX2-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], float [[TMP50]], float [[TMP51]] -; AVX2-NEXT: [[TMP54:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 18), align 8 -; AVX2-NEXT: [[TMP55:%.*]] = fcmp fast ogt float [[TMP53]], [[TMP54]] -; AVX2-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], float [[TMP53]], float [[TMP54]] -; AVX2-NEXT: [[TMP57:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 19), align 4 -; AVX2-NEXT: [[TMP58:%.*]] = fcmp fast ogt float [[TMP56]], [[TMP57]] -; AVX2-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], float [[TMP56]], float [[TMP57]] -; AVX2-NEXT: [[TMP60:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 20), align 16 -; AVX2-NEXT: [[TMP61:%.*]] = fcmp fast ogt float [[TMP59]], [[TMP60]] -; AVX2-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], float [[TMP59]], float [[TMP60]] -; AVX2-NEXT: [[TMP63:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 21), align 4 -; AVX2-NEXT: [[TMP64:%.*]] = fcmp fast ogt float [[TMP62]], [[TMP63]] -; AVX2-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], float [[TMP62]], float [[TMP63]] -; AVX2-NEXT: [[TMP66:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 22), align 8 -; AVX2-NEXT: [[TMP67:%.*]] = fcmp fast ogt float [[TMP65]], [[TMP66]] -; AVX2-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], float [[TMP65]], float [[TMP66]] -; AVX2-NEXT: [[TMP69:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 23), align 4 -; AVX2-NEXT: [[TMP70:%.*]] = fcmp fast ogt float [[TMP68]], [[TMP69]] -; AVX2-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], float [[TMP68]], float [[TMP69]] -; AVX2-NEXT: [[TMP72:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 24), align 16 -; AVX2-NEXT: [[TMP73:%.*]] = fcmp fast ogt float [[TMP71]], [[TMP72]] -; AVX2-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], float [[TMP71]], float [[TMP72]] -; AVX2-NEXT: [[TMP75:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 25), align 4 -; AVX2-NEXT: [[TMP76:%.*]] = fcmp fast ogt float [[TMP74]], [[TMP75]] -; AVX2-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], float [[TMP74]], float [[TMP75]] -; AVX2-NEXT: [[TMP78:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 26), align 8 -; AVX2-NEXT: [[TMP79:%.*]] = fcmp fast ogt float [[TMP77]], [[TMP78]] -; AVX2-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], float [[TMP77]], float [[TMP78]] -; AVX2-NEXT: [[TMP81:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 27), align 4 -; AVX2-NEXT: [[TMP82:%.*]] = fcmp fast ogt float [[TMP80]], [[TMP81]] -; AVX2-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], float [[TMP80]], float [[TMP81]] -; AVX2-NEXT: [[TMP84:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 28), align 16 -; AVX2-NEXT: [[TMP85:%.*]] = fcmp fast ogt float [[TMP83]], [[TMP84]] -; AVX2-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], float [[TMP83]], float [[TMP84]] -; AVX2-NEXT: [[TMP87:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, 
i64 0, i64 29), align 4 -; AVX2-NEXT: [[TMP88:%.*]] = fcmp fast ogt float [[TMP86]], [[TMP87]] -; AVX2-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], float [[TMP86]], float [[TMP87]] -; AVX2-NEXT: [[TMP90:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 30), align 8 -; AVX2-NEXT: [[TMP91:%.*]] = fcmp fast ogt float [[TMP89]], [[TMP90]] -; AVX2-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], float [[TMP89]], float [[TMP90]] -; AVX2-NEXT: [[TMP93:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 31), align 4 -; AVX2-NEXT: [[TMP94:%.*]] = fcmp fast ogt float [[TMP92]], [[TMP93]] -; AVX2-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], float [[TMP92]], float [[TMP93]] -; AVX2-NEXT: ret float [[TMP95]] +; AVX2-NEXT: [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16 +; AVX2: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP2]], <32 x float> undef, <32 x i32> +; AVX2-NEXT: [[TMP96:%.*]] = fcmp fast ogt <32 x float> [[TMP2]], [[RDX_SHUF]] +; AVX2-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x float> [[TMP2]], <32 x float> [[RDX_SHUF]] +; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> +; AVX2-NEXT: [[TMP97:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX2-NEXT: [[BIN_RDX2:%.*]] = select <32 x i1> [[TMP97]], <32 x float> [[BIN_RDX]], <32 x float> [[RDX_SHUF1]] +; AVX2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> +; AVX2-NEXT: [[TMP98:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX2-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x float> [[BIN_RDX2]], <32 x float> [[RDX_SHUF3]] +; AVX2-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> +; AVX2-NEXT: [[TMP99:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]] +; AVX2-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x float> [[BIN_RDX4]], <32 x float> [[RDX_SHUF5]] +; AVX2-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> +; AVX2-NEXT: [[TMP100:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]] +; AVX2-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x float> [[BIN_RDX6]], <32 x float> [[RDX_SHUF7]] +; AVX2-NEXT: [[TMP101:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0 +; AVX2: ret float [[TMP101]] ; ; SKX-LABEL: @maxf32( -; SKX-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 -; SKX-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 -; SKX-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] -; SKX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] -; SKX-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 -; SKX-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] -; SKX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] -; SKX-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 -; SKX-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] -; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]] -; SKX-NEXT: [[TMP12:%.*]] = load float, float* 
getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 -; SKX-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] -; SKX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]] -; SKX-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 -; SKX-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]] -; SKX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] -; SKX-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 -; SKX-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] -; SKX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] -; SKX-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 -; SKX-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] -; SKX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] -; SKX-NEXT: [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16 -; SKX-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]] -; SKX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]] -; SKX-NEXT: [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4 -; SKX-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]] -; SKX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]] -; SKX-NEXT: [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8 -; SKX-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]] -; SKX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]] -; SKX-NEXT: [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4 -; SKX-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]] -; SKX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]] -; SKX-NEXT: [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16 -; SKX-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]] -; SKX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]] -; SKX-NEXT: [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4 -; SKX-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]] -; SKX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]] -; SKX-NEXT: [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8 -; SKX-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]] -; SKX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]] -; SKX-NEXT: [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4 -; SKX-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]] -; SKX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]] -; SKX-NEXT: [[TMP48:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 16), align 16 -; SKX-NEXT: [[TMP49:%.*]] = fcmp fast ogt float [[TMP47]], 
[[TMP48]] -; SKX-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], float [[TMP47]], float [[TMP48]] -; SKX-NEXT: [[TMP51:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 17), align 4 -; SKX-NEXT: [[TMP52:%.*]] = fcmp fast ogt float [[TMP50]], [[TMP51]] -; SKX-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], float [[TMP50]], float [[TMP51]] -; SKX-NEXT: [[TMP54:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 18), align 8 -; SKX-NEXT: [[TMP55:%.*]] = fcmp fast ogt float [[TMP53]], [[TMP54]] -; SKX-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], float [[TMP53]], float [[TMP54]] -; SKX-NEXT: [[TMP57:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 19), align 4 -; SKX-NEXT: [[TMP58:%.*]] = fcmp fast ogt float [[TMP56]], [[TMP57]] -; SKX-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], float [[TMP56]], float [[TMP57]] -; SKX-NEXT: [[TMP60:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 20), align 16 -; SKX-NEXT: [[TMP61:%.*]] = fcmp fast ogt float [[TMP59]], [[TMP60]] -; SKX-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], float [[TMP59]], float [[TMP60]] -; SKX-NEXT: [[TMP63:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 21), align 4 -; SKX-NEXT: [[TMP64:%.*]] = fcmp fast ogt float [[TMP62]], [[TMP63]] -; SKX-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], float [[TMP62]], float [[TMP63]] -; SKX-NEXT: [[TMP66:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 22), align 8 -; SKX-NEXT: [[TMP67:%.*]] = fcmp fast ogt float [[TMP65]], [[TMP66]] -; SKX-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], float [[TMP65]], float [[TMP66]] -; SKX-NEXT: [[TMP69:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 23), align 4 -; SKX-NEXT: [[TMP70:%.*]] = fcmp fast ogt float [[TMP68]], [[TMP69]] -; SKX-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], float [[TMP68]], float [[TMP69]] -; SKX-NEXT: [[TMP72:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 24), align 16 -; SKX-NEXT: [[TMP73:%.*]] = fcmp fast ogt float [[TMP71]], [[TMP72]] -; SKX-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], float [[TMP71]], float [[TMP72]] -; SKX-NEXT: [[TMP75:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 25), align 4 -; SKX-NEXT: [[TMP76:%.*]] = fcmp fast ogt float [[TMP74]], [[TMP75]] -; SKX-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], float [[TMP74]], float [[TMP75]] -; SKX-NEXT: [[TMP78:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 26), align 8 -; SKX-NEXT: [[TMP79:%.*]] = fcmp fast ogt float [[TMP77]], [[TMP78]] -; SKX-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], float [[TMP77]], float [[TMP78]] -; SKX-NEXT: [[TMP81:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 27), align 4 -; SKX-NEXT: [[TMP82:%.*]] = fcmp fast ogt float [[TMP80]], [[TMP81]] -; SKX-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], float [[TMP80]], float [[TMP81]] -; SKX-NEXT: [[TMP84:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 28), align 16 -; SKX-NEXT: [[TMP85:%.*]] = fcmp fast ogt float [[TMP83]], [[TMP84]] -; SKX-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], float [[TMP83]], float [[TMP84]] -; SKX-NEXT: [[TMP87:%.*]] = load float, float* 
getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 29), align 4 -; SKX-NEXT: [[TMP88:%.*]] = fcmp fast ogt float [[TMP86]], [[TMP87]] -; SKX-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], float [[TMP86]], float [[TMP87]] -; SKX-NEXT: [[TMP90:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 30), align 8 -; SKX-NEXT: [[TMP91:%.*]] = fcmp fast ogt float [[TMP89]], [[TMP90]] -; SKX-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], float [[TMP89]], float [[TMP90]] -; SKX-NEXT: [[TMP93:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 31), align 4 -; SKX-NEXT: [[TMP94:%.*]] = fcmp fast ogt float [[TMP92]], [[TMP93]] -; SKX-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], float [[TMP92]], float [[TMP93]] -; SKX-NEXT: ret float [[TMP95]] +; SKX-NEXT: [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16 +; SKX: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP2]], <32 x float> undef, <32 x i32> +; SKX-NEXT: [[TMP96:%.*]] = fcmp fast ogt <32 x float> [[TMP2]], [[RDX_SHUF]] +; SKX-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x float> [[TMP2]], <32 x float> [[RDX_SHUF]] +; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> +; SKX-NEXT: [[TMP97:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; SKX-NEXT: [[BIN_RDX2:%.*]] = select <32 x i1> [[TMP97]], <32 x float> [[BIN_RDX]], <32 x float> [[RDX_SHUF1]] +; SKX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> +; SKX-NEXT: [[TMP98:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; SKX-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x float> [[BIN_RDX2]], <32 x float> [[RDX_SHUF3]] +; SKX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> +; SKX-NEXT: [[TMP99:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]] +; SKX-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x float> [[BIN_RDX4]], <32 x float> [[RDX_SHUF5]] +; SKX-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> +; SKX-NEXT: [[TMP100:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]] +; SKX-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x float> [[BIN_RDX6]], <32 x float> [[RDX_SHUF7]] +; SKX-NEXT: [[TMP101:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0 +; SKX: ret float [[TMP101]] ; %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 Index: test/Transforms/SLPVectorizer/X86/horizontal.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/horizontal.ll +++ test/Transforms/SLPVectorizer/X86/horizontal.ll @@ -817,22 +817,22 @@ define void @i32_red_call(i32 %val) { ; CHECK-LABEL: @i32_red_call( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8 -; 
CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]] -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4 -; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16 -; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP4]], [[ADD_2]] -; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4 -; CHECK-NEXT: [[ADD_4:%.*]] = add nsw i32 [[TMP5]], [[ADD_3]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8 -; CHECK-NEXT: [[ADD_5:%.*]] = add nsw i32 [[TMP6]], [[ADD_4]] -; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4 -; CHECK-NEXT: [[ADD_6:%.*]] = add nsw i32 [[TMP7]], [[ADD_5]] -; CHECK-NEXT: [[RES:%.*]] = call i32 @foobar(i32 [[ADD_6]]) +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 undef, undef +; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 undef, [[ADD]] +; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 undef, [[ADD_1]] +; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 undef, [[ADD_2]] +; CHECK-NEXT: [[ADD_4:%.*]] = add nsw i32 undef, [[ADD_3]] +; CHECK-NEXT: [[ADD_5:%.*]] = add nsw i32 undef, [[ADD_4]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = add nsw <8 x i32> [[TMP0]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = add nsw <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[BIN_RDX4:%.*]] = add nsw <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; CHECK-NEXT: [[ADD_6:%.*]] = add nsw i32 undef, [[ADD_5]] +; CHECK-NEXT: [[RES:%.*]] = call i32 @foobar(i32 [[TMP1]]) ; CHECK-NEXT: ret void ; entry: @@ -858,22 +858,22 @@ define void @i32_red_invoke(i32 %val) personality i32 (...)* @__gxx_personality_v0 { ; CHECK-LABEL: @i32_red_invoke( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8 -; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]] -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4 -; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16 -; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP4]], [[ADD_2]] -; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4 -; CHECK-NEXT: [[ADD_4:%.*]] = add nsw i32 [[TMP5]], [[ADD_3]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, 
i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8 -; CHECK-NEXT: [[ADD_5:%.*]] = add nsw i32 [[TMP6]], [[ADD_4]] -; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4 -; CHECK-NEXT: [[ADD_6:%.*]] = add nsw i32 [[TMP7]], [[ADD_5]] -; CHECK-NEXT: [[RES:%.*]] = invoke i32 @foobar(i32 [[ADD_6]]) +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 undef, undef +; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 undef, [[ADD]] +; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 undef, [[ADD_1]] +; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 undef, [[ADD_2]] +; CHECK-NEXT: [[ADD_4:%.*]] = add nsw i32 undef, [[ADD_3]] +; CHECK-NEXT: [[ADD_5:%.*]] = add nsw i32 undef, [[ADD_4]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = add nsw <8 x i32> [[TMP0]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = add nsw <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[BIN_RDX4:%.*]] = add nsw <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; CHECK-NEXT: [[ADD_6:%.*]] = add nsw i32 undef, [[ADD_5]] +; CHECK-NEXT: [[RES:%.*]] = invoke i32 @foobar(i32 [[TMP1]]) ; CHECK-NEXT: to label [[NORMAL:%.*]] unwind label [[EXCEPTION:%.*]] ; CHECK: exception: ; CHECK-NEXT: [[CLEANUP:%.*]] = landingpad i8 Index: test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -303,24 +303,30 @@ ; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> %b, i32 1 ; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x float> %b, i32 2 ; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x float> %b, i32 3 -; CHECK-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0 -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C3]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]] -; CHECK-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A3]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B3]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP5]], <2 x float> [[TMP7]] -; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[S0]], i32 0 -; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 -; CHECK-NEXT: 
[[RC:%.*]] = insertelement <4 x float> undef, float [[TMP9]], i32 2 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 -; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP10]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> undef, float [[A0]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> undef, float [[B0]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 +; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 +; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP18]], i32 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i32 0 +; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP19]], i32 2 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i32 1 +; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP20]], i32 3 ; CHECK-NEXT: ret <4 x float> [[RD]] ; ; ZEROTHRESH-LABEL: @simple_select_no_users(