Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -603,7 +603,9 @@
   /// Split:
   ///  (v0, v1, v2, v3)
   ///  ((v0+v2), (v1+v3), undef, undef)
-  int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) const;
+  int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
+                                 bool IsPairwiseForm) const;
+  int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm) const;
 
   /// \returns The cost of Intrinsic instructions. Types analysis only.
   int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
@@ -797,8 +799,10 @@
                                            ArrayRef<unsigned> Indices,
                                            unsigned Alignment,
                                            unsigned AddressSpace) = 0;
-  virtual int getReductionCost(unsigned Opcode, Type *Ty,
-                               bool IsPairwiseForm) = 0;
+  virtual int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
+                                         bool IsPairwiseForm) = 0;
+  virtual int getMinMaxReductionCost(Type *Ty, Type *CondTy,
+                                     bool IsPairwiseForm) = 0;
   virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                     ArrayRef<Type *> Tys,
                                     FastMathFlags FMF) = 0;
@@ -1039,9 +1043,13 @@
     return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                            Alignment, AddressSpace);
   }
-  int getReductionCost(unsigned Opcode, Type *Ty,
-                       bool IsPairwiseForm) override {
-    return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm);
+  int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
+                                 bool IsPairwiseForm) override {
+    return Impl.getArithmeticReductionCost(Opcode, Ty, IsPairwiseForm);
+  }
+  int getMinMaxReductionCost(Type *Ty, Type *CondTy,
+                             bool IsPairwiseForm) override {
+    return Impl.getMinMaxReductionCost(Ty, CondTy, IsPairwiseForm);
   }
   int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                             ArrayRef<Type *> Tys, FastMathFlags FMF) override {
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -377,7 +377,9 @@
     return 0;
   }
 
-  unsigned getReductionCost(unsigned, Type *, bool) { return 1; }
+  unsigned getArithmeticReductionCost(unsigned, Type *, bool) { return 1; }
+
+  unsigned getMinMaxReductionCost(Type *, Type *, bool) { return 1; }
 
   unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { return 0; }
 
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -931,46 +931,47 @@
     return 0;
   }
 
-  unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise) {
+  /// Try to calculate arithmetic and shuffle op costs for reduction
+  /// operations. We're assuming that reduction operations are performed in
+  /// the following way:
+  /// 1. Non-pairwise reduction
+  /// %val1 = shufflevector <n x t> %val, <n x t> %undef,
+  /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n - 1, i32 undef, ..., i32 undef>
+  ///            \----------------v-------------/  \----------v------------/
+  ///                            n/2 elements               n/2 elements
+  /// %red1 = op <n x t> %val, <n x t> %val1
+  /// After this operation we have a vector %red1 where only the first n/2
+  /// elements are meaningful, the second n/2 elements are undefined and can be
+  /// dropped. All other operations are actually working with the vector of
+  /// length n/2, not n, though the real vector length is still n.
+  /// %val2 = shufflevector <n x t> %red1, <n x t> %undef,
+  /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2 - 1, i32 undef, ..., i32 undef>
+  ///            \----------------v-------------/  \----------v------------/
+  ///                            n/4 elements               3*n/4 elements
+  /// %red2 = op <n x t> %red1, <n x t> %val2  - working with the vector of
+  /// length n/2, the resulting vector has length n/4, etc.
+  /// 2. Pairwise reduction:
+  /// Everything is the same except for an additional shuffle operation which
+  /// is used to produce operands for pairwise kind of reductions.
+  /// %val1 = shufflevector <n x t> %val, <n x t> %undef,
+  /// <n x i32> <i32 0, i32 2, ..., i32 n-2, i32 undef, ..., i32 undef>
+  ///            \-------------v----------/  \----------v------------/
+  ///                   n/2 elements               n/2 elements
+  /// %val2 = shufflevector <n x t> %val, <n x t> %undef,
+  /// <n x i32> <i32 1, i32 3, ..., i32 n-1, i32 undef, ..., i32 undef>
+  ///            \-------------v----------/  \----------v------------/
+  ///                   n/2 elements               n/2 elements
+  /// %red1 = op <n x t> %val1, <n x t> %val2
+  /// Again, the operation is performed on an <n x t> vector, but the
+  /// resulting vector %red1 is an <n/2 x t> vector.
+  ///
+  /// The cost model should take into account that the actual length of the
+  /// vector is reduced on each iteration.
+  unsigned getArithmeticReductionCost(unsigned Opcode, Type *Ty,
+                                      bool IsPairwise) {
     assert(Ty->isVectorTy() && "Expect a vector type");
     Type *ScalarTy = Ty->getVectorElementType();
     unsigned NumVecElts = Ty->getVectorNumElements();
     unsigned NumReduxLevels = Log2_32(NumVecElts);
-    // Try to calculate arithmetic and shuffle op costs for reduction operations.
-    // We're assuming that reduction operation are performing the following way:
-    // 1. Non-pairwise reduction
-    // %val1 = shufflevector <n x t> %val, <n x t> %undef,
-    // <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
-    //            \----------------v-------------/  \----------v------------/
-    //                            n/2 elements               n/2 elements
-    // %red1 = op <n x t> %val, <n x t> val1
-    // After this operation we have a vector %red1 with only maningfull the
-    // first n/2 elements, the second n/2 elements are undefined and can be
-    // dropped. All other operations are actually working with the vector of
-    // length n/2, not n. though the real vector length is still n.
-    // %val2 = shufflevector <n x t> %red1, <n x t> %undef,
-    // <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
-    //            \----------------v-------------/  \----------v------------/
-    //                            n/4 elements               3*n/4 elements
-    // %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of
-    // length n/2, the resulting vector has length n/4 etc.
-    // 2. Pairwise reduction:
-    // Everything is the same except for an additional shuffle operation which
-    // is used to produce operands for pairwise kind of reductions.
-    // %val1 = shufflevector <n x t> %val, <n x t> %undef,
-    // <n x i32> <i32 0, i32 2, ..., i32 n-2, i32 undef, ..., i32 undef>
-    //            \-------------v----------/  \----------v------------/
-    //                   n/2 elements               n/2 elements
-    // %val2 = shufflevector <n x t> %val, <n x t> %undef,
-    // <n x i32> <i32 1, i32 3, ..., i32 n-1, i32 undef, ..., i32 undef>
-    //            \-------------v----------/  \----------v------------/
-    //                   n/2 elements               n/2 elements
-    // %red1 = op <n x t> %val1, <n x t> val2
-    // Again, the operation is performed on <n x t> vector, but the resulting
-    // vector %red1 is <n/2 x t> vector.
-    //
-    // The cost model should take into account that the actual length of the
-    // vector is reduced on each iteration.
     unsigned ArithCost = 0;
     unsigned ShuffleCost = 0;
     auto *ConcreteTTI = static_cast<T *>(this);
@@ -1001,6 +1002,131 @@
     return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
   }
+  /// Try to calculate the cost of performing a vector min/max reduction. The
+  /// reduction takes the same non-pairwise or pairwise shape documented above
+  /// for getArithmeticReductionCost, except that each reduction 'op' is a
+  /// cmp + select pair, with \p CondTy being the type of the cmp result at
+  /// the original vector width.
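+  /// For example, a splitting (non-pairwise) smin reduction of <8 x i32> on a
+  /// target whose widest legal vector is <4 x i32> takes Log2_32(8) = 3
+  /// levels: one shuffle + cmp + select round at <8 x i32> (after which the
+  /// live lanes fit in the legal type) and two more rounds at <4 x i32>, plus
+  /// the final scalarization charged at the end of this function.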
+  unsigned getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwise) {
+    assert(Ty->isVectorTy() && "Expect a vector type");
+    Type *ScalarTy = Ty->getVectorElementType();
+    Type *ScalarCondTy = CondTy->getVectorElementType();
+    unsigned NumVecElts = Ty->getVectorNumElements();
+    unsigned NumReduxLevels = Log2_32(NumVecElts);
+    unsigned CmpOpcode;
+    if (ScalarTy->isFloatingPointTy())
+      CmpOpcode = Instruction::FCmp;
+    else {
+      assert(Ty->isIntOrIntVectorTy() &&
+             "expecting floating point or integer type for min/max reduction");
+      CmpOpcode = Instruction::ICmp;
+    }
+    unsigned MinMaxCost = 0;
+    unsigned ShuffleCost = 0;
+    auto *ConcreteTTI = static_cast<T *>(this);
+    std::pair<unsigned, MVT> LT =
+        ConcreteTTI->getTLI()->getTypeLegalizationCost(DL, Ty);
+    unsigned LongVectorCount = 0;
+    unsigned MVTLen =
+        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
+    while (NumVecElts > MVTLen) {
+      NumVecElts /= 2;
+      // Assume the pairwise shuffles add a cost.
+      ShuffleCost += (IsPairwise + 1) *
+                     ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
+                                                 NumVecElts, Ty);
+      MinMaxCost +=
+          ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy) +
+          ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy);
+      Ty = VectorType::get(ScalarTy, NumVecElts);
+      CondTy = VectorType::get(ScalarCondTy, NumVecElts);
+      ++LongVectorCount;
+    }
+    // The minimal length of the vector is limited by the real length of vector
+    // operations performed on the current platform. That's why several final
+    // reduction operations are performed on vectors of the same
+    // architecture-dependent length.
+    ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) *
+                   ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
+                                               NumVecElts, Ty);
+    MinMaxCost +=
+        (NumReduxLevels - LongVectorCount) *
+        (ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy) +
+         ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy));
+    // Need 3 extractelement instructions for scalarization + an additional
+    // scalar select instruction.
+    return ShuffleCost + MinMaxCost +
+           3 * getScalarizationOverhead(Ty, /*Insert=*/false,
+                                        /*Extract=*/true) +
+           ConcreteTTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
+                                           ScalarCondTy);
+  }
+
   unsigned getVectorSplitCost() { return 1; }
 
   /// @}
Index: include/llvm/Transforms/Vectorize/SLPVectorizer.h
===================================================================
--- include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -83,7 +83,7 @@
                         bool AllowReorder = false);
 
   /// \brief Try to vectorize a chain that may start at the operands of \V;
-  bool tryToVectorize(BinaryOperator *V, slpvectorizer::BoUpSLP &R);
+  bool tryToVectorize(Instruction *I, slpvectorizer::BoUpSLP &R);
 
   /// \brief Vectorize the store instructions collected in Stores.
 bool vectorizeStoreChains(slpvectorizer::BoUpSLP &R);
Index: lib/Analysis/CostModel.cpp
===================================================================
--- lib/Analysis/CostModel.cpp
+++ lib/Analysis/CostModel.cpp
@@ -24,12 +24,14 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
+using namespace PatternMatch;
 
 #define CM_NAME "cost-model"
 #define DEBUG_TYPE CM_NAME
@@ -183,22 +185,48 @@
   return Mask == ActualMask;
 }
 
-static bool matchPairwiseReductionAtLevel(const BinaryOperator *BinOp,
-                                          unsigned Level, unsigned NumLevels) {
+static unsigned getReductionOpcode(Value *V, Value *&L, Value *&R,
+                                   Type *&CondTy) {
+  L = nullptr;
+  R = nullptr;
+  CondTy = nullptr;
+  if (m_BinOp(m_Value(L), m_Value(R)).match(V))
+    return cast<BinaryOperator>(V)->getOpcode();
+  if (auto *SI = dyn_cast<SelectInst>(V))
+    if (m_UMin(m_Value(L), m_Value(R)).match(SI) ||
+        m_SMin(m_Value(L), m_Value(R)).match(SI) ||
+        m_SMax(m_Value(L), m_Value(R)).match(SI) ||
+        m_UMax(m_Value(L), m_Value(R)).match(SI) ||
+        m_OrdFMin(m_Value(L), m_Value(R)).match(SI) ||
+        m_OrdFMax(m_Value(L), m_Value(R)).match(SI) ||
+        m_UnordFMin(m_Value(L), m_Value(R)).match(SI) ||
+        m_UnordFMax(m_Value(L), m_Value(R)).match(SI)) {
+      auto *CI = cast<CmpInst>(SI->getCondition());
+      CondTy = CI->getType();
+      return CI->getOpcode();
+    }
+  return 0;
+}
+
+static bool matchPairwiseReductionAtLevel(Value *V, unsigned Level,
+                                          unsigned NumLevels) {
   // Match one level of pairwise operations.
   // %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
   //       <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   // %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
   //       <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
   // %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
-  if (BinOp == nullptr)
+  if (!V)
     return false;
 
-  assert(BinOp->getType()->isVectorTy() && "Expecting a vector type");
+  assert(V->getType()->isVectorTy() && "Expecting a vector type");
 
-  unsigned Opcode = BinOp->getOpcode();
-  Value *L = BinOp->getOperand(0);
-  Value *R = BinOp->getOperand(1);
+  Type *CondTy;
+  Value *L;
+  Value *R;
+  unsigned Opcode = getReductionOpcode(V, L, R, CondTy);
+  if (!Opcode)
+    return false;
 
   ShuffleVectorInst *LS = dyn_cast<ShuffleVectorInst>(L);
   if (!LS && Level)
     return false;
@@ -239,20 +267,16 @@
 
   // Check that the next levels binary operation exists and matches with the
   // current one.
-  BinaryOperator *NextLevelBinOp = nullptr;
-  if (Level + 1 != NumLevels) {
-    if (!(NextLevelBinOp = dyn_cast<BinaryOperator>(NextLevelOp)))
+  if (Level + 1 != NumLevels)
+    if (Opcode != getReductionOpcode(NextLevelOp, L, R, CondTy))
       return false;
-    else if (NextLevelBinOp->getOpcode() != Opcode)
-      return false;
-  }
 
   // Shuffle mask for pairwise operation must match.
-  if (matchPairwiseShuffleMask(LS, true, Level)) {
-    if (!matchPairwiseShuffleMask(RS, false, Level))
+  if (matchPairwiseShuffleMask(LS, /*IsLeft=*/true, Level)) {
+    if (!matchPairwiseShuffleMask(RS, /*IsLeft=*/false, Level))
       return false;
-  } else if (matchPairwiseShuffleMask(RS, true, Level)) {
-    if (!matchPairwiseShuffleMask(LS, false, Level))
+  } else if (matchPairwiseShuffleMask(RS, /*IsLeft=*/true, Level)) {
+    if (!matchPairwiseShuffleMask(LS, /*IsLeft=*/false, Level))
       return false;
   } else
     return false;
@@ -261,11 +285,11 @@
     return true;
 
   // Match next level.
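+  // The next level may now be a select-based min/max rather than a binary
+  // operator, so recurse on the bare Value and let getReductionOpcode
+  // re-match it.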
-  return matchPairwiseReductionAtLevel(NextLevelBinOp, Level, NumLevels);
+  return matchPairwiseReductionAtLevel(NextLevelOp, Level, NumLevels);
 }
 
 static bool matchPairwiseReduction(const ExtractElementInst *ReduxRoot,
-                                   unsigned &Opcode, Type *&Ty) {
+                                   unsigned &Opcode, Type *&Ty, Type *&CondTy) {
   if (!EnableReduxCost)
     return false;
@@ -277,11 +301,14 @@
   if (Idx != 0)
     return false;
 
-  BinaryOperator *RdxStart = dyn_cast<BinaryOperator>(ReduxRoot->getOperand(0));
-  if (!RdxStart)
+  Value *L;
+  Value *R;
+  Value *RdxStart = ReduxRoot->getOperand(0);
+  unsigned RdxOpcode = getReductionOpcode(RdxStart, L, R, CondTy);
+  if (RdxOpcode == 0)
     return false;
 
-  Type *VecTy = ReduxRoot->getOperand(0)->getType();
+  Type *VecTy = RdxStart->getType();
   unsigned NumVecElems = VecTy->getVectorNumElements();
   if (!isPowerOf2_32(NumVecElems))
     return false;
@@ -307,17 +334,14 @@
   if (!matchPairwiseReductionAtLevel(RdxStart, 0, Log2_32(NumVecElems)))
     return false;
 
-  Opcode = RdxStart->getOpcode();
+  Opcode = RdxOpcode;
   Ty = VecTy;
 
   return true;
 }
 
 static std::pair<Value *, ShuffleVectorInst *>
-getShuffleAndOtherOprd(BinaryOperator *B) {
-
-  Value *L = B->getOperand(0);
-  Value *R = B->getOperand(1);
+getShuffleAndOtherOprd(Value *L, Value *R) {
   ShuffleVectorInst *S = nullptr;
 
   if ((S = dyn_cast<ShuffleVectorInst>(L)))
@@ -328,7 +352,9 @@
 }
 
 static bool matchVectorSplittingReduction(const ExtractElementInst *ReduxRoot,
-                                          unsigned &Opcode, Type *&Ty) {
+                                          unsigned &Opcode, Type *&Ty,
+                                          Type *&CondTy) {
+  CondTy = nullptr;
   if (!EnableReduxCost)
     return false;
@@ -340,10 +366,12 @@
   if (Idx != 0)
     return false;
 
-  BinaryOperator *RdxStart = dyn_cast<BinaryOperator>(ReduxRoot->getOperand(0));
-  if (!RdxStart)
+  Value *L;
+  Value *R;
+  Value *RdxStart = ReduxRoot->getOperand(0);
+  unsigned RdxOpcode = getReductionOpcode(RdxStart, L, R, CondTy);
+  if (RdxOpcode == 0)
     return false;
-  unsigned RdxOpcode = RdxStart->getOpcode();
 
   Type *VecTy = ReduxRoot->getOperand(0)->getType();
   unsigned NumVecElems = VecTy->getVectorNumElements();
@@ -367,15 +395,13 @@
   unsigned NumVecElemsRemain = NumVecElems;
   while (NumVecElemsRemain - 1) {
     // Check for the right reduction operation.
-    BinaryOperator *BinOp;
-    if (!(BinOp = dyn_cast<BinaryOperator>(RdxOp)))
-      return false;
-    if (BinOp->getOpcode() != RdxOpcode)
+    Value *Op = RdxOp;
+    if (getReductionOpcode(Op, L, R, CondTy) != RdxOpcode)
       return false;
 
     Value *NextRdxOp;
     ShuffleVectorInst *Shuffle;
-    std::tie(NextRdxOp, Shuffle) = getShuffleAndOtherOprd(BinOp);
+    std::tie(NextRdxOp, Shuffle) = getShuffleAndOtherOprd(L, R);
 
     // Check the current reduction operation and the shuffle use the same value.
     if (Shuffle == nullptr)
       return false;
@@ -494,11 +520,22 @@
     // adds followed by a extractelement).
     unsigned ReduxOpCode;
     Type *ReduxType;
-
-    if (matchVectorSplittingReduction(EEI, ReduxOpCode, ReduxType))
-      return TTI->getReductionCost(ReduxOpCode, ReduxType, false);
-    else if (matchPairwiseReduction(EEI, ReduxOpCode, ReduxType))
-      return TTI->getReductionCost(ReduxOpCode, ReduxType, true);
+    Type *CondType;
+
+    if (matchVectorSplittingReduction(EEI, ReduxOpCode, ReduxType, CondType)) {
+      return CondType
+                 ? TTI->getMinMaxReductionCost(ReduxType, CondType,
+                                               /*IsPairwiseForm=*/false)
+                 : TTI->getArithmeticReductionCost(ReduxOpCode, ReduxType,
+                                                   /*IsPairwiseForm=*/false);
+    }
+    if (matchPairwiseReduction(EEI, ReduxOpCode, ReduxType, CondType)) {
+      return CondType
+                 ? TTI->getMinMaxReductionCost(ReduxType, CondType,
+                                               /*IsPairwiseForm=*/true)
+                 : TTI->getArithmeticReductionCost(ReduxOpCode, ReduxType,
+                                                   /*IsPairwiseForm=*/true);
+    }
 
     return TTI->getVectorInstrCost(I->getOpcode(),
                                    EEI->getOperand(0)->getType(), Idx);
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -397,9 +397,16 @@
   return Cost;
 }
 
-int TargetTransformInfo::getReductionCost(unsigned Opcode, Type *Ty,
-                                          bool IsPairwiseForm) const {
-  int Cost = TTIImpl->getReductionCost(Opcode, Ty, IsPairwiseForm);
+int TargetTransformInfo::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
+                                                    bool IsPairwiseForm) const {
+  int Cost = TTIImpl->getArithmeticReductionCost(Opcode, Ty, IsPairwiseForm);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
+}
+
+int TargetTransformInfo::getMinMaxReductionCost(Type *Ty, Type *CondTy,
+                                                bool IsPairwiseForm) const {
+  int Cost = TTIImpl->getMinMaxReductionCost(Ty, CondTy, IsPairwiseForm);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -80,7 +80,10 @@
   int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                             ArrayRef<Value *> Args, FastMathFlags FMF);
 
-  int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm);
+  int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
+                                 bool IsPairwiseForm);
+
+  int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm);
 
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                  unsigned Factor, ArrayRef<unsigned> Indices,
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1712,8 +1712,8 @@
   return BaseT::getAddressComputationCost(Ty, SE, Ptr);
 }
 
-int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
-                                 bool IsPairwise) {
+int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
+                                           bool IsPairwise) {
 
   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
 
@@ -1781,7 +1781,121 @@
     return LT.first * Entry->Cost;
   }
 
-  return BaseT::getReductionCost(Opcode, ValTy, IsPairwise);
+  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise);
+}
+
+int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
+                                       bool IsPairwise) {
+
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+
+  MVT MTy = LT.second;
+
+  int ISD = ValTy->isIntOrIntVectorTy() ? ISD::SMIN : ISD::FMINNUM;
+
+  // We use the Intel Architecture Code Analyzer (IACA) to measure the
+  // throughput and use it as the cost.
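+  // SMIN and FMINNUM stand in for every min/max flavor here: this hook only
+  // sees the element type, so the tables below implicitly assume that
+  // umin/umax/smax (resp. fmax) reductions cost the same as smin (resp.
+  // fmin) ones.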
+ + static const CostTblEntry SSE42CostTblPairWise[] = { + { ISD::FMINNUM, MVT::v2f64, 3 }, + { ISD::FMINNUM, MVT::v4f32, 2 }, + { ISD::SMIN, MVT::v2i64, 7 }, // The data reported by the IACA is "6.8" + { ISD::SMIN, MVT::v4i32, 1 }, // The data reported by the IACA is "1.5" + { ISD::SMIN, MVT::v8i16, 2 }, + }; + + static const CostTblEntry AVX1CostTblPairWise[] = { + { ISD::FMINNUM, MVT::v4f32, 1 }, + { ISD::FMINNUM, MVT::v4f64, 1 }, + { ISD::FMINNUM, MVT::v8f32, 2 }, + { ISD::SMIN, MVT::v2i64, 3 }, + { ISD::SMIN, MVT::v4i32, 1 }, + { ISD::SMIN, MVT::v8i16, 1 }, + { ISD::SMIN, MVT::v8i32, 3 }, + }; + + static const CostTblEntry AVX2CostTblPairWise[] = { + { ISD::SMIN, MVT::v4i64, 2 }, + { ISD::SMIN, MVT::v8i32, 1 }, + { ISD::SMIN, MVT::v16i16, 1 }, + { ISD::SMIN, MVT::v32i8, 2 }, + }; + + static const CostTblEntry AVX512CostTblPairWise[] = { + { ISD::FMINNUM, MVT::v8f64, 1 }, + { ISD::FMINNUM, MVT::v16f32, 2 }, + { ISD::SMIN, MVT::v8i64, 2 }, + { ISD::SMIN, MVT::v16i32, 1 }, + }; + + static const CostTblEntry SSE42CostTblNoPairWise[] = { + { ISD::FMINNUM, MVT::v2f64, 3 }, + { ISD::FMINNUM, MVT::v4f32, 3 }, + { ISD::SMIN, MVT::v2i64, 7 }, // The data reported by the IACA is "6.8" + { ISD::SMIN, MVT::v4i32, 1 }, // The data reported by the IACA is "1.5" + { ISD::SMIN, MVT::v8i16, 1 }, // The data reported by the IACA is "1.5" + }; + + static const CostTblEntry AVX1CostTblNoPairWise[] = { + { ISD::FMINNUM, MVT::v4f32, 1 }, + { ISD::FMINNUM, MVT::v4f64, 1 }, + { ISD::FMINNUM, MVT::v8f32, 1 }, + { ISD::SMIN, MVT::v2i64, 3 }, + { ISD::SMIN, MVT::v4i32, 1 }, + { ISD::SMIN, MVT::v8i16, 1 }, + { ISD::SMIN, MVT::v8i32, 2 }, + }; + + static const CostTblEntry AVX2CostTblNoPairWise[] = { + { ISD::SMIN, MVT::v4i64, 1 }, + { ISD::SMIN, MVT::v8i32, 1 }, + { ISD::SMIN, MVT::v16i16, 1 }, + { ISD::SMIN, MVT::v32i8, 1 }, + }; + + static const CostTblEntry AVX512CostTblNoPairWise[] = { + { ISD::FMINNUM, MVT::v8f64, 1 }, + { ISD::FMINNUM, MVT::v16f32, 2 }, + { ISD::SMIN, MVT::v8i64, 1 }, + { ISD::SMIN, MVT::v16i32, 1 }, + }; + + if (IsPairwise) { + if (ST->hasAVX512()) + if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + } else { + if (ST->hasAVX512()) + if (const auto *Entry = + CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + } + + return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise); } /// \brief Calculate the cost of materializing a 64-bit value. 
This helper
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -33,6 +33,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/NoFolder.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
 #include "llvm/IR/Verifier.h"
@@ -45,6 +46,7 @@
 #include <memory>
 
 using namespace llvm;
+using namespace llvm::PatternMatch;
 using namespace slpvectorizer;
 
 #define SV_NAME "slp-vectorizer"
@@ -4022,15 +4024,18 @@
   return Changed;
 }
 
-bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
-  if (!V)
+bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
+  if (!I)
+    return false;
+
+  if (!isa<BinaryOperator>(I) && !isa<SelectInst>(I))
     return false;
 
-  Value *P = V->getParent();
+  Value *P = I->getParent();
 
   // Vectorize in current basic block only.
-  auto *Op0 = dyn_cast<Instruction>(V->getOperand(0));
-  auto *Op1 = dyn_cast<Instruction>(V->getOperand(1));
+  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
+  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
   if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
     return false;
 
@@ -4124,7 +4129,127 @@
   SmallVector<Value *, 16> ReductionOps;
   SmallVector<Value *, 32> ReducedVals;
 
-  BinaryOperator *ReductionRoot;
+  struct OperationData {
+    enum MinMaxIntFloat {
+      IntMin = Instruction::BinaryOpsEnd,
+      IntUMin,
+      FloatMin,
+      IntMax,
+      IntUMax,
+      FloatMax
+    };
+    bool Validity = false;
+    unsigned Opcode = 0;
+    Value *LHS = nullptr;
+    Value *RHS = nullptr;
+    Type *CondTy = nullptr;
+
+  public:
+    OperationData() = default;
+    OperationData(Value *V) {
+      if (auto *I = dyn_cast<Instruction>(V)) {
+        Validity = true;
+        Opcode = I->getOpcode();
+      }
+    }
+    OperationData(unsigned Opcode, Value *LHS, Value *RHS)
+        : Validity(true), Opcode(Opcode), LHS(LHS), RHS(RHS) {}
+    OperationData(Value *LHS, Value *RHS, Type *CondTy, bool IsMaximum,
+                  bool IsUnsigned = false)
+        : Validity(true), LHS(LHS), RHS(RHS), CondTy(CondTy) {
+      if (LHS->getType()->isIntegerTy()) {
+        if (IsUnsigned)
+          Opcode = IsMaximum ? IntUMax : IntUMin;
+        else
+          Opcode = IsMaximum ? IntMax : IntMin;
+      } else
+        Opcode = IsMaximum ? FloatMax : FloatMin;
+    }
+    operator bool() const { return Validity; }
+    bool isBinOp() const {
+      return Validity && LHS && RHS && Instruction::isBinaryOp(Opcode);
+    }
+    bool isMinMax() const {
+      return Validity && LHS && RHS && Opcode >= IntMin && Opcode <= FloatMax;
+    }
+    bool isVectorizable() const { return Validity && LHS && RHS; }
+    bool operator==(const OperationData &OD) {
+      return this == &OD || (Validity == OD.Validity && (!LHS == !OD.LHS) &&
+                             (!RHS == !OD.RHS) && Opcode == OD.Opcode);
+    }
+    bool operator!=(const OperationData &OD) { return !(*this == OD); }
+    void clear() {
+      Validity = false;
+      LHS = nullptr;
+      RHS = nullptr;
+      Opcode = 0;
+    }
+    unsigned getOpcode() const {
+      assert(isVectorizable());
+      if (isBinOp())
+        return Opcode;
+      switch (Opcode) {
+      case FloatMax:
+      case FloatMin:
+        return Instruction::FCmp;
+      case IntMin:
+      case IntUMin:
+      case IntMax:
+      case IntUMax:
+        return Instruction::ICmp;
+      default:
+        break;
+      }
+      llvm_unreachable("Unexpected opcode");
+    }
+    Value *getLHS() const { return LHS; }
+    Value *getRHS() const { return RHS; }
+    Type *getConditionType() const { return CondTy; }
+    bool isFloatMinMax() const {
+      return isMinMax() && (Opcode == FloatMin || Opcode == FloatMax);
+    }
+    bool isIntMinMax() const {
+      return isMinMax() && (Opcode == IntMin || Opcode == IntMax ||
+                            Opcode == IntUMin || Opcode == IntUMax);
+    }
+    Value *createOp(IRBuilder<> &Builder, Value *L, Value *R,
+                    const Twine &Name = "") const {
+      if (isBinOp()) {
+        assert(Opcode == Instruction::FAdd || Opcode == Instruction::Add);
+        if (Opcode == Instruction::FAdd)
+          return Builder.CreateFAdd(L, R, Name);
+        return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
+      }
+      assert(Opcode >= OperationData::IntMin &&
+             Opcode <= OperationData::FloatMax);
+      Value *Cmp;
+      switch (Opcode) {
+      case OperationData::IntMin:
+        Cmp = Builder.CreateICmpSLT(L, R);
+        break;
+      case OperationData::IntUMin:
+        Cmp = Builder.CreateICmpULT(L, R);
+        break;
+      case OperationData::FloatMin:
+        Cmp = Builder.CreateFCmpOLT(L, R);
+        break;
+      case OperationData::IntMax:
+        Cmp = Builder.CreateICmpSGT(L, R);
+        break;
+      case OperationData::IntUMax:
+        Cmp = Builder.CreateICmpUGT(L, R);
+        break;
+      case OperationData::FloatMax:
+        Cmp = Builder.CreateFCmpOGT(L, R);
+        break;
+      default:
+        llvm_unreachable("Unknown operation");
+      }
+      return Builder.CreateSelect(Cmp, L, R, Name);
+    }
+  };
+
+  Instruction *ReductionRoot;
   // After successfull horizontal reduction vectorization attempt for PHI node
   // vectorizer tries to update root binary op by combining vectorized tree and
   // the ReductionPHI node. But during vectorization this ReductionPHI can be
@@ -4134,14 +4259,48 @@
   // is destroyed" crash upon PHI node deletion.
   WeakVH ReductionPHI;
 
-  /// The opcode of the reduction.
-  unsigned ReductionOpcode;
-  /// The opcode of the values we perform a reduction on.
-  unsigned ReducedValueOpcode;
+  /// The operation data of the reduction operation.
+  OperationData ReductionData;
+  /// The operation data of the values we perform a reduction on.
+  OperationData ReducedValueData;
   /// Should we model this reduction as a pairwise reduction tree or a tree that
   /// splits the vector in halves and adds those halves.
   bool IsPairwiseReduction;
 
+  static OperationData getOperationData(Value *V) {
+    if (!V)
+      return OperationData();
+
+    Value *LHS;
+    Value *RHS;
+    if (m_BinOp(m_Value(LHS), m_Value(RHS)).match(V))
+      return {cast<BinaryOperator>(V)->getOpcode(), LHS, RHS};
+    if (auto *Select = dyn_cast<SelectInst>(V)) {
+      // Look for a min/max pattern.
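+      // A min/max is a select whose condition is a cmp of the same two
+      // operands, e.g.
+      //   %cond = icmp slt i32 %a, %b
+      //   %min  = select i1 %cond, i32 %a, i32 %b
+      // The matchers below cover the signed, unsigned and (un)ordered
+      // floating point variants of this pattern.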
+      if (m_UMin(m_Value(LHS), m_Value(RHS)).match(Select))
+        return {LHS, RHS, Select->getCondition()->getType(),
+                /*IsMaximum=*/false, /*IsUnsigned=*/true};
+      else if (m_SMin(m_Value(LHS), m_Value(RHS)).match(Select))
+        return {LHS, RHS, Select->getCondition()->getType(),
+                /*IsMaximum=*/false, /*IsUnsigned=*/false};
+      else if (m_OrdFMin(m_Value(LHS), m_Value(RHS)).match(Select) ||
+               m_UnordFMin(m_Value(LHS), m_Value(RHS)).match(Select))
+        return {LHS, RHS, Select->getCondition()->getType(),
+                /*IsMaximum=*/false};
+      else if (m_UMax(m_Value(LHS), m_Value(RHS)).match(Select))
+        return {LHS, RHS, Select->getCondition()->getType(), /*IsMaximum=*/true,
+                /*IsUnsigned=*/true};
+      else if (m_SMax(m_Value(LHS), m_Value(RHS)).match(Select))
+        return {LHS, RHS, Select->getCondition()->getType(), /*IsMaximum=*/true,
+                /*IsUnsigned=*/false};
+      else if (m_OrdFMax(m_Value(LHS), m_Value(RHS)).match(Select) ||
+               m_UnordFMax(m_Value(LHS), m_Value(RHS)).match(Select))
+        return {LHS, RHS, Select->getCondition()->getType(),
+                /*IsMaximum=*/true};
+    }
+    return {V};
+  }
+
 public:
   /// The width of one full horizontal reduction operation.
   unsigned ReduxWidth;
@@ -4151,29 +4310,32 @@
   unsigned MinVecRegSize;
 
   HorizontalReduction(unsigned MinVecRegSize)
-      : ReductionRoot(nullptr), ReductionOpcode(0), ReducedValueOpcode(0),
-        IsPairwiseReduction(false), ReduxWidth(0),
+      : ReductionRoot(nullptr), IsPairwiseReduction(false), ReduxWidth(0),
         MinVecRegSize(MinVecRegSize) {}
 
   /// \brief Try to find a reduction tree.
-  bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
+  bool matchAssociativeReduction(PHINode *Phi, Instruction *B) {
     assert((!Phi || is_contained(Phi->operands(), B)) &&
            "Thi phi needs to use the binary operator");
 
+    ReductionData = getOperationData(B);
+
     // We could have a initial reductions that is not an add.
     //  r *= v1 + v2 + v3 + v4
     // In such a case start looking for a tree rooted in the first '+'.
-    if (Phi) {
-      if (B->getOperand(0) == Phi) {
+    if (Phi && ReductionData.isVectorizable()) {
+      if (ReductionData.getLHS() == Phi) {
         Phi = nullptr;
-        B = dyn_cast<Instruction>(B->getOperand(1));
+        B = dyn_cast<Instruction>(ReductionData.getRHS());
+        ReductionData = getOperationData(B);
-      } else if (B->getOperand(1) == Phi) {
+      } else if (ReductionData.getRHS() == Phi) {
        Phi = nullptr;
-        B = dyn_cast<Instruction>(B->getOperand(0));
+        B = dyn_cast<Instruction>(ReductionData.getLHS());
+        ReductionData = getOperationData(B);
      }
    }
 
-    if (!B)
+    if (!B || !ReductionData.isVectorizable())
       return false;
 
     Type *Ty = B->getType();
@@ -4181,8 +4343,7 @@
       return false;
 
     const DataLayout &DL = B->getModule()->getDataLayout();
-    ReductionOpcode = B->getOpcode();
-    ReducedValueOpcode = 0;
+    ReducedValueData.clear();
     // FIXME: Register size should be a parameter to this function, so we can
     // try different vectorization factors.
     ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
@@ -4192,19 +4353,22 @@
     if (ReduxWidth < 4)
       return false;
 
-    // We currently only support adds.
-    if (ReductionOpcode != Instruction::Add &&
-        ReductionOpcode != Instruction::FAdd)
+    // We currently only support adds and min/max.
+    if (ReductionData.getOpcode() != Instruction::Add &&
+        ReductionData.getOpcode() != Instruction::FAdd &&
+        !ReductionData.isMinMax())
       return false;
 
     // Post order traverse the reduction tree starting at B. We only handle
    // true trees containing only binary operators or selects.
+    bool IsBinOp = ReductionData.isBinOp();
     SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
-    Stack.push_back(std::make_pair(B, 0));
+    Stack.push_back(std::make_pair(B, IsBinOp ? 0 : 1));
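+    // The second member of the pair is the index of the first operand to
+    // visit: binary operators start at operand 0, while for a min/max select
+    // operand 0 is the cmp condition, which must be skipped.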
     while (!Stack.empty()) {
       Instruction *TreeN = Stack.back().first;
       unsigned EdgeToVist = Stack.back().second++;
-      bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
+      OperationData OpData = getOperationData(TreeN);
+      bool IsReducedValue = OpData != ReductionData;
 
       // Only handle trees in the current basic block.
       if (TreeN->getParent() != B->getParent())
@@ -4212,22 +4376,27 @@
 
       // Each tree node needs to have one user except for the ultimate
       // reduction.
-      if (!TreeN->hasOneUse() && TreeN != B)
+      if (!TreeN->hasOneUse() && (IsBinOp || !TreeN->hasNUses(2)) && TreeN != B)
         return false;
 
       // Postorder vist.
-      if (EdgeToVist == 2 || IsReducedValue) {
+      if (((IsBinOp && EdgeToVist == 2) ||
+           (OpData && OpData.isMinMax() && EdgeToVist == 3)) ||
+          IsReducedValue) {
         if (IsReducedValue) {
           // Make sure that the opcodes of the operations that we are going to
           // reduce match.
-          if (!ReducedValueOpcode)
-            ReducedValueOpcode = TreeN->getOpcode();
-          else if (ReducedValueOpcode != TreeN->getOpcode())
+          if (!ReducedValueData)
+            ReducedValueData = OpData;
+          else if (ReducedValueData != OpData)
             return false;
           ReducedVals.push_back(TreeN);
         } else {
           // We need to be able to reassociate the adds.
-          if (!TreeN->isAssociative())
+          if (!TreeN->isAssociative() &&
+              !(OpData.isFloatMinMax() &&
+                cast<Instruction>(TreeN->getOperand(0))->hasUnsafeAlgebra()) &&
+              !OpData.isIntMinMax())
             return false;
           ReductionOps.push_back(TreeN);
         }
@@ -4240,15 +4409,16 @@
       Value *NextV = TreeN->getOperand(EdgeToVist);
       if (NextV != Phi) {
         auto *I = dyn_cast<Instruction>(NextV);
+        OpData = getOperationData(I);
         // Continue analysis if the next operand is a reduction operation or
         // (possibly) a reduced value. If the reduced value opcode is not set,
         // the first met operation != reduction operation is considered as the
         // reduced value class.
-        if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode ||
-                  I->getOpcode() == ReductionOpcode)) {
-          if (!ReducedValueOpcode && I->getOpcode() != ReductionOpcode)
-            ReducedValueOpcode = I->getOpcode();
-          Stack.push_back(std::make_pair(I, 0));
+        if (I && (!ReducedValueData || OpData == ReducedValueData ||
+                  OpData == ReductionData)) {
+          if (!ReducedValueData && OpData != ReductionData)
+            ReducedValueData = OpData;
+          Stack.push_back(std::make_pair(I, OpData.isMinMax() ? 1 : 0));
           continue;
         }
         return false;
@@ -4302,8 +4472,8 @@
       Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
       if (VectorizedTree) {
         Builder.SetCurrentDebugLocation(Loc);
-        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
-                                     ReducedSubTree, "bin.rdx");
+        VectorizedTree = ReductionData.createOp(Builder, VectorizedTree,
+                                                ReducedSubTree, "bin.rdx");
       } else
         VectorizedTree = ReducedSubTree;
     }
@@ -4313,14 +4483,22 @@
       for (; i < NumReducedVals; ++i) {
         Builder.SetCurrentDebugLocation(
            cast<Instruction>(ReducedVals[i])->getDebugLoc());
-        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
-                                     ReducedVals[i]);
+        VectorizedTree =
+            ReductionData.createOp(Builder, VectorizedTree, ReducedVals[i]);
       }
 
       // Update users.
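+      // A binary-op root is patched directly below; a min/max root is a
+      // select, so both the feeding cmp and the select itself have to be
+      // updated.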
       if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {
         assert(ReductionRoot && "Need a reduction operation");
-        ReductionRoot->setOperand(0, VectorizedTree);
-        ReductionRoot->setOperand(1, ReductionPHI);
+        if (ReductionData.isBinOp()) {
+          ReductionRoot->setOperand(0, VectorizedTree);
+          ReductionRoot->setOperand(1, ReductionPHI);
+        } else {
+          auto *Cmp = cast<Instruction>(ReductionRoot->getOperand(0));
+          Cmp->setOperand(0, VectorizedTree);
+          Cmp->setOperand(1, ReductionPHI);
+          ReductionRoot->setOperand(1, VectorizedTree);
+          ReductionRoot->setOperand(2, ReductionPHI);
+        }
       } else
         ReductionRoot->replaceAllUsesWith(VectorizedTree);
     }
@@ -4336,16 +4514,39 @@
   int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
     Type *ScalarTy = FirstReducedVal->getType();
     Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
-
-    int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
-    int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
+    Type *ScalarCondTy = ReductionData.getConditionType();
+    Type *VecCondTy =
+        ScalarCondTy ? VectorType::get(ScalarCondTy, ReduxWidth) : nullptr;
+
+    int PairwiseRdxCost =
+        ReductionData.isMinMax()
+            ? TTI->getMinMaxReductionCost(VecTy, VecCondTy,
+                                          /*IsPairwiseForm=*/true)
+            : TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
+                                              /*IsPairwiseForm=*/true);
+    int SplittingRdxCost =
+        ReductionData.isMinMax()
+            ? TTI->getMinMaxReductionCost(VecTy, VecCondTy,
+                                          /*IsPairwiseForm=*/false)
+            : TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
+                                              /*IsPairwiseForm=*/false);
 
     IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
     int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
 
-    int ScalarReduxCost =
-        (ReduxWidth - 1) *
-        TTI->getArithmeticInstrCost(ReductionOpcode, ScalarTy);
+    int ScalarReduxCost;
+    if (ReductionData.isBinOp()) {
+      ScalarReduxCost =
+          (ReduxWidth - 1) *
+          TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);
+    } else {
+      assert(ReductionData.isMinMax());
+      ScalarReduxCost =
+          (ReduxWidth - 1) *
+          (TTI->getCmpSelInstrCost(ReductionData.getOpcode(), ScalarTy) +
+           TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
+                                   ScalarCondTy));
+    }
 
     DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
                  << " for reduction that starts with " << *FirstReducedVal
@@ -4356,13 +4557,6 @@
     return VecReduxCost - ScalarReduxCost;
   }
 
-  static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
-                            Value *R, const Twine &Name = "") {
-    if (Opcode == Instruction::FAdd)
-      return Builder.CreateFAdd(L, R, Name);
-    return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
-  }
-
   /// \brief Emit a horizontal reduction of the vectorized value.
   Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
     assert(VectorizedValue && "Need to have a vectorized tree node");
@@ -4382,14 +4576,14 @@
         Value *RightShuf = Builder.CreateShuffleVector(
             TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
             "rdx.shuf.r");
-        TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
-                             "bin.rdx");
+        TmpVec =
+            ReductionData.createOp(Builder, LeftShuf, RightShuf, "bin.rdx");
       } else {
         Value *UpperHalf =
             createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
         Value *Shuf = Builder.CreateShuffleVector(
            TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
-        TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
+        TmpVec = ReductionData.createOp(Builder, TmpVec, Shuf, "bin.rdx");
       }
     }
 
@@ -4560,10 +4754,10 @@
 /// if it can be done.
 /// \returns true if a horizontal reduction was matched and reduced.
 /// \returns false if a horizontal reduction was not matched.
-static bool canBeVectorized(
-    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
-    TargetTransformInfo *TTI,
-    const function_ref<bool(BinaryOperator *, BoUpSLP &)> Vectorize) {
+static bool
+canBeVectorized(PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
+                TargetTransformInfo *TTI,
+                const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
   if (!ShouldVectorizeHor)
     return false;
@@ -4588,9 +4782,11 @@
     }
     if (Stack.back().isInitial()) {
       Stack.back().clearInitial();
-      if (auto *BI = dyn_cast<BinaryOperator>(Inst)) {
+      auto *BI = dyn_cast<BinaryOperator>(Inst);
+      auto *SI = dyn_cast<SelectInst>(Inst);
+      if (BI || SI) {
         HorizontalReduction HorRdx(R.getMinVecRegSize());
-        if (HorRdx.matchAssociativeReduction(P, BI)) {
+        if (HorRdx.matchAssociativeReduction(P, Inst)) {
           // If there is a sufficient number of reduction values, reduce
           // to a nearby power-of-2. Can safely generate oversized
           // vectors and rely on the backend to split them to legal sizes.
@@ -4603,7 +4799,7 @@
           continue;
         }
       }
-      if (P) {
+      if (P && BI) {
        Inst = dyn_cast<Instruction>(BI->getOperand(0));
         if (Inst == P)
          Inst = dyn_cast<Instruction>(BI->getOperand(1));
@@ -4614,7 +4810,7 @@
       }
     }
     P = nullptr;
-    if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) {
+    if (Vectorize(Inst, R)) {
       Res = true;
       continue;
     }
@@ -4645,8 +4841,8 @@
   P = nullptr;
   // Try to match and vectorize a horizontal reduction.
   return canBeVectorized(P, I, BB, R, TTI,
-                         [this](BinaryOperator *BI, BoUpSLP &R) -> bool {
-                           return tryToVectorize(BI, R);
+                         [this](Instruction *I, BoUpSLP &R) -> bool {
+                           return tryToVectorize(I, R);
                          });
 }
 
@@ -4755,27 +4951,16 @@
     }
 
     // Try to vectorize trees that start at compare instructions.
-    if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
-      if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
+    if (auto *BI = dyn_cast<BranchInst>(it)) {
+      if (!BI->isConditional())
+        continue;
+
+      if (vectorizeRootInstruction(nullptr, BI->getCondition(), BB, R, TTI)) {
         Changed = true;
-        // We would like to start over since some instructions are deleted
-        // and the iterator may become invalid value.
         it = BB->begin();
         e = BB->end();
         continue;
       }
-
-      for (int I = 0; I < 2; ++I) {
-        if (vectorizeRootInstruction(nullptr, CI->getOperand(I), BB, R, TTI)) {
-          Changed = true;
-          // We would like to start over since some instructions are deleted
-          // and the iterator may become invalid value.
-          it = BB->begin();
-          e = BB->end();
-          break;
-        }
-      }
-      continue;
     }
 
     // Try to vectorize trees that start at insertelement instructions.
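Before the updated tests, it may help to see the exact IR shape a splitting min/max reduction step takes: one shufflevector followed by a cmp and a select, mirroring OperationData::createOp() and emitReduction() above. The sketch below builds one such step with LLVM's IRBuilder; it is illustrative only, and the helper name and its parameters are not part of this patch:

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/IR/IRBuilder.h"

  using namespace llvm;

  // Emit one splitting-reduction level of a signed-min reduction. Vec has
  // Width lanes; after this level only the first Width >> (Level + 1) lanes
  // remain meaningful, and after Log2(Width) levels lane 0 holds the result.
  static Value *emitSMinLevel(IRBuilder<> &Builder, Value *Vec, unsigned Width,
                              unsigned Level) {
    Type *Int32Ty = Builder.getInt32Ty();
    unsigned Live = Width >> (Level + 1); // lanes still live after this level
    SmallVector<Constant *, 32> Mask;
    for (unsigned I = 0; I != Width; ++I)
      Mask.push_back(I < Live ? ConstantInt::get(Int32Ty, Live + I)
                              : UndefValue::get(Int32Ty));
    // Move the upper live half of the vector down into the lower lanes ...
    Value *Shuf = Builder.CreateShuffleVector(
        Vec, UndefValue::get(Vec->getType()), ConstantVector::get(Mask),
        "rdx.shuf");
    // ... and keep the lane-wise minimum via icmp + select, the same pattern
    // the cost model above prices as one cmp plus one select per level.
    Value *Cmp = Builder.CreateICmpSLT(Vec, Shuf);
    return Builder.CreateSelect(Cmp, Vec, Shuf, "bin.rdx");
  }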
Index: test/Transforms/SLPVectorizer/X86/horizontal-list.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/horizontal-list.ll
+++ test/Transforms/SLPVectorizer/X86/horizontal-list.ll
@@ -236,26 +236,18 @@
 define float @bar() {
 ; CHECK-LABEL: @bar(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
-; CHECK-NEXT:    [[CMP4:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float [[TMP3]], float [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
-; CHECK-NEXT:    [[MUL3_1:%.*]] = fmul fast float [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], [[MUL3_1]]
-; CHECK-NEXT:    [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float [[MUL3_1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
-; CHECK-NEXT:    [[MUL3_2:%.*]] = fmul fast float [[TMP8]], [[TMP7]]
-; CHECK-NEXT:    [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], [[MUL3_2]]
-; CHECK-NEXT:    [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float [[MUL3_2]]
-; CHECK-NEXT:    store float [[MAX_0_MUL3_2]], float* @res, align 4
-; CHECK-NEXT:    ret float [[MAX_0_MUL3_2]]
+; CHECK:         [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]]
+; CHECK:         [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP7:%.*]] = fcmp fast ogt <4 x float> [[TMP2]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[TMP2]], <4 x float> [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP8:%.*]] = fcmp fast ogt <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = select <4 x i1> [[TMP8]], <4 x float> [[BIN_RDX]], <4 x float> [[RDX_SHUF1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK:         store float [[TMP9]], float* @res, align 4
+; CHECK-NEXT:    ret float [[TMP9]]
 ;
 entry:
   %0 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
Index: test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
+++ test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@@ -34,79 +34,46 @@
 ; CHECK-NEXT:    ret i32 [[TMP23]]
 ;
 ; AVX-LABEL: @maxi8(
-; AVX-NEXT:
[[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; AVX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8 -; AVX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]] -; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]] -; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4 -; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] -; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] -; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 -; AVX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] -; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 -; AVX-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -; AVX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]] -; AVX-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -; AVX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]] -; AVX-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; AVX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]] -; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]] -; AVX-NEXT: ret i32 [[TMP23]] +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16 +; AVX: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[TMP24:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[RDX_SHUF]] +; AVX-NEXT: [[BIN_RDX:%.*]] = select <8 x i1> [[TMP24]], <8 x i32> [[TMP2]], <8 x i32> [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[TMP25:%.*]] = icmp sgt <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[BIN_RDX2:%.*]] = select <8 x i1> [[TMP25]], <8 x i32> [[BIN_RDX]], <8 x i32> [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[TMP26:%.*]] = icmp sgt <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[BIN_RDX4:%.*]] = select <8 x i1> [[TMP26]], <8 x i32> [[BIN_RDX2]], <8 x i32> [[RDX_SHUF3]] +; AVX-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; AVX: ret i32 [[TMP27]] ; ; AVX2-LABEL: @maxi8( -; AVX2-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; AVX2-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8 -; AVX2-NEXT: [[TMP7:%.*]] = icmp sgt i32 
[[TMP5]], [[TMP6]]
-; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
-; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
-; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
-; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
-; AVX2-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; AVX2-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
-; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
-; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; AVX2-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
-; AVX2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
-; AVX2-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; AVX2-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
-; AVX2-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
-; AVX2-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; AVX2-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
-; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
-; AVX2-NEXT: ret i32 [[TMP23]]
+; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16
+; AVX2: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32>
+; AVX2-NEXT: [[TMP24:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[RDX_SHUF]]
+; AVX2-NEXT: [[BIN_RDX:%.*]] = select <8 x i1> [[TMP24]], <8 x i32> [[TMP2]], <8 x i32> [[RDX_SHUF]]
+; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32>
+; AVX2-NEXT: [[TMP25:%.*]] = icmp sgt <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; AVX2-NEXT: [[BIN_RDX2:%.*]] = select <8 x i1> [[TMP25]], <8 x i32> [[BIN_RDX]], <8 x i32> [[RDX_SHUF1]]
+; AVX2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32>
+; AVX2-NEXT: [[TMP26:%.*]] = icmp sgt <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; AVX2-NEXT: [[BIN_RDX4:%.*]] = select <8 x i1> [[TMP26]], <8 x i32> [[BIN_RDX2]], <8 x i32> [[RDX_SHUF3]]
+; AVX2-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; AVX2: ret i32 [[TMP27]]
;
; SKX-LABEL: @maxi8(
-; SKX-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
-; SKX-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
-; SKX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
-; SKX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
-; SKX-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
-; SKX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
-; SKX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
-; SKX-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
-; SKX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
-; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
-; SKX-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; SKX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
-; SKX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
-; SKX-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; SKX-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
-; SKX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
-; SKX-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; SKX-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
-; SKX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
-; SKX-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; SKX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
-; SKX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
-; SKX-NEXT: ret i32 [[TMP23]]
+; SKX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16
+; SKX: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32>
+; SKX-NEXT: [[TMP24:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[RDX_SHUF]]
+; SKX-NEXT: [[BIN_RDX:%.*]] = select <8 x i1> [[TMP24]], <8 x i32> [[TMP2]], <8 x i32> [[RDX_SHUF]]
+; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32>
+; SKX-NEXT: [[TMP25:%.*]] = icmp sgt <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; SKX-NEXT: [[BIN_RDX2:%.*]] = select <8 x i1> [[TMP25]], <8 x i32> [[BIN_RDX]], <8 x i32> [[RDX_SHUF1]]
+; SKX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32>
+; SKX-NEXT: [[TMP26:%.*]] = icmp sgt <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; SKX-NEXT: [[BIN_RDX4:%.*]] = select <8 x i1> [[TMP26]], <8 x i32> [[BIN_RDX2]], <8 x i32> [[RDX_SHUF3]]
+; SKX-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; SKX: ret i32 [[TMP27]]
;
%2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
%3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
@@ -184,151 +151,55 @@
; CHECK-NEXT: ret i32 [[TMP47]]
;
; AVX-LABEL: @maxi16(
-; AVX-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
-; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
-; AVX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
-; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
-; AVX-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
-; AVX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
-; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
-; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
-; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
-; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
-; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; AVX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
-; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
-; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; AVX-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
-; AVX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
-; AVX-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; AVX-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
-; AVX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
-; AVX-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; AVX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
-; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
-; AVX-NEXT: [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16
-; AVX-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]]
-; AVX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]]
-; AVX-NEXT: [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4
-; AVX-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]]
-; AVX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]]
-; AVX-NEXT: [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8
-; AVX-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]]
-; AVX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]]
-; AVX-NEXT: [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4
-; AVX-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]]
-; AVX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]]
-; AVX-NEXT: [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16
-; AVX-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
-; AVX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]]
-; AVX-NEXT: [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4
-; AVX-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
-; AVX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]]
-; AVX-NEXT: [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8
-; AVX-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
-; AVX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]]
-; AVX-NEXT: [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
-; AVX-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
-; AVX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]]
-; AVX-NEXT: ret i32 [[TMP47]]
+; AVX-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr to <16 x i32>*), align 16
+; AVX: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> undef, <16 x i32>
+; AVX-NEXT: [[TMP48:%.*]] = icmp sgt <16 x i32> [[TMP2]], [[RDX_SHUF]]
+; AVX-NEXT: [[BIN_RDX:%.*]] = select <16 x i1> [[TMP48]], <16 x i32> [[TMP2]], <16 x i32> [[RDX_SHUF]]
+; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[BIN_RDX]], <16 x i32> undef, <16 x i32>
+; AVX-NEXT: [[TMP49:%.*]] = icmp sgt <16 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; AVX-NEXT: [[BIN_RDX2:%.*]] = select <16 x i1> [[TMP49]], <16 x i32> [[BIN_RDX]], <16 x i32> [[RDX_SHUF1]]
+; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x i32> [[BIN_RDX2]], <16 x i32> undef, <16 x i32>
+; AVX-NEXT: [[TMP50:%.*]] = icmp sgt <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; AVX-NEXT: [[BIN_RDX4:%.*]] = select <16 x i1> [[TMP50]], <16 x i32> [[BIN_RDX2]], <16 x i32> [[RDX_SHUF3]]
+; AVX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x i32> [[BIN_RDX4]], <16 x i32> undef, <16 x i32>
+; AVX-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
+; AVX-NEXT: [[BIN_RDX6:%.*]] = select <16 x i1> [[TMP51]], <16 x i32> [[BIN_RDX4]], <16 x i32> [[RDX_SHUF5]]
+; AVX-NEXT: [[TMP52:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0
+; AVX: ret i32 [[TMP52]]
;
; AVX2-LABEL: @maxi16(
-; AVX2-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
-; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
-; AVX2-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
-; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
-; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
-; AVX2-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
-; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
-; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
-; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
-; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
-; AVX2-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; AVX2-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
-; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
-; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; AVX2-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
-; AVX2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
-; AVX2-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; AVX2-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
-; AVX2-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
-; AVX2-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; AVX2-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
-; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
-; AVX2-NEXT: [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16
-; AVX2-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]]
-; AVX2-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]]
-; AVX2-NEXT: [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4
-; AVX2-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]]
-; AVX2-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]]
-; AVX2-NEXT: [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8
-; AVX2-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]]
-; AVX2-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]]
-; AVX2-NEXT: [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4
-; AVX2-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]]
-; AVX2-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]]
-; AVX2-NEXT: [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16
-; AVX2-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
-; AVX2-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]]
-; AVX2-NEXT: [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4
-; AVX2-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
-; AVX2-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]]
-; AVX2-NEXT: [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8
-; AVX2-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
-; AVX2-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]]
-; AVX2-NEXT: [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
-; AVX2-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
-; AVX2-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]]
-; AVX2-NEXT: ret i32 [[TMP47]]
+; AVX2-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr to <16 x i32>*), align 16
+; AVX2: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> undef, <16 x i32>
+; AVX2-NEXT: [[TMP48:%.*]] = icmp sgt <16 x i32> [[TMP2]], [[RDX_SHUF]]
+; AVX2-NEXT: [[BIN_RDX:%.*]] = select <16 x i1> [[TMP48]], <16 x i32> [[TMP2]], <16 x i32> [[RDX_SHUF]]
+; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[BIN_RDX]], <16 x i32> undef, <16 x i32>
+; AVX2-NEXT: [[TMP49:%.*]] = icmp sgt <16 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; AVX2-NEXT: [[BIN_RDX2:%.*]] = select <16 x i1> [[TMP49]], <16 x i32> [[BIN_RDX]], <16 x i32> [[RDX_SHUF1]]
+; AVX2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x i32> [[BIN_RDX2]], <16 x i32> undef, <16 x i32>
+; AVX2-NEXT: [[TMP50:%.*]] = icmp sgt <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; AVX2-NEXT: [[BIN_RDX4:%.*]] = select <16 x i1> [[TMP50]], <16 x i32> [[BIN_RDX2]], <16 x i32> [[RDX_SHUF3]]
+; AVX2-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x i32> [[BIN_RDX4]], <16 x i32> undef, <16 x i32>
+; AVX2-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
+; AVX2-NEXT: [[BIN_RDX6:%.*]] = select <16 x i1> [[TMP51]], <16 x i32> [[BIN_RDX4]], <16 x i32> [[RDX_SHUF5]]
+; AVX2-NEXT: [[TMP52:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0
+; AVX2: ret i32 [[TMP52]]
;
; SKX-LABEL: @maxi16(
-; SKX-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
-; SKX-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
-; SKX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
-; SKX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
-; SKX-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
-; SKX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
-; SKX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
-; SKX-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
-; SKX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
-; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
-; SKX-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; SKX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
-; SKX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
-; SKX-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; SKX-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
-; SKX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
-; SKX-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; SKX-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
-; SKX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
-; SKX-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; SKX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
-; SKX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
-; SKX-NEXT: [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16
-; SKX-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]]
-; SKX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]]
-; SKX-NEXT: [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4
-; SKX-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]]
-; SKX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]]
-; SKX-NEXT: [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8
-; SKX-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]]
-; SKX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]]
-; SKX-NEXT: [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4
-; SKX-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]]
-; SKX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]]
-; SKX-NEXT: [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16
-; SKX-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
-; SKX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]]
-; SKX-NEXT: [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4
-; SKX-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
-; SKX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]]
-; SKX-NEXT: [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8
-; SKX-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
-; SKX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]]
-; SKX-NEXT: [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
-; SKX-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
-; SKX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]]
-; SKX-NEXT: ret i32 [[TMP47]]
+; SKX-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr to <16 x i32>*), align 16
+; SKX: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> undef, <16 x i32>
+; SKX-NEXT: [[TMP48:%.*]] = icmp sgt <16 x i32> [[TMP2]], [[RDX_SHUF]]
+; SKX-NEXT: [[BIN_RDX:%.*]] = select <16 x i1> [[TMP48]], <16 x i32> [[TMP2]], <16 x i32> [[RDX_SHUF]]
+; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[BIN_RDX]], <16 x i32> undef, <16 x i32>
+; SKX-NEXT: [[TMP49:%.*]] = icmp sgt <16 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; SKX-NEXT: [[BIN_RDX2:%.*]] = select <16 x i1> [[TMP49]], <16 x i32> [[BIN_RDX]], <16 x i32> [[RDX_SHUF1]]
+; SKX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x i32> [[BIN_RDX2]], <16 x i32> undef, <16 x i32>
+; SKX-NEXT: [[TMP50:%.*]] = icmp sgt <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; SKX-NEXT: [[BIN_RDX4:%.*]] = select <16 x i1> [[TMP50]], <16 x i32> [[BIN_RDX2]], <16 x i32> [[RDX_SHUF3]]
+; SKX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x i32> [[BIN_RDX4]], <16 x i32> undef, <16 x i32>
+; SKX-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
+; SKX-NEXT: [[BIN_RDX6:%.*]] = select <16 x i1> [[TMP51]], <16 x i32> [[BIN_RDX4]], <16 x i32> [[RDX_SHUF5]]
+; SKX-NEXT: [[TMP52:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0
+; SKX: ret i32 [[TMP52]]
;
%2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
%3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
@@ -381,392 +252,84 @@
define i32 @maxi32(i32) {
; CHECK-LABEL: @maxi32(
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
-; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
-; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
-; CHECK-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; CHECK-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
-; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; CHECK-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; CHECK-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
-; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; CHECK-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
-; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16
-; CHECK-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]]
-; CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4
-; CHECK-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]]
-; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]]
-; CHECK-NEXT: [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8
-; CHECK-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]]
-; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]]
-; CHECK-NEXT: [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4
-; CHECK-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]]
-; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]]
-; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16
-; CHECK-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
-; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]]
-; CHECK-NEXT: [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4
-; CHECK-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
-; CHECK-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]]
-; CHECK-NEXT: [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8
-; CHECK-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
-; CHECK-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]]
-; CHECK-NEXT: [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
-; CHECK-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
-; CHECK-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]]
-; CHECK-NEXT: [[TMP48:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 16), align 16
-; CHECK-NEXT: [[TMP49:%.*]] = icmp sgt i32 [[TMP47]], [[TMP48]]
-; CHECK-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[TMP47]], i32 [[TMP48]]
-; CHECK-NEXT: [[TMP51:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 17), align 4
-; CHECK-NEXT: [[TMP52:%.*]] = icmp sgt i32 [[TMP50]], [[TMP51]]
-; CHECK-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], i32 [[TMP50]], i32 [[TMP51]]
-; CHECK-NEXT: [[TMP54:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 18), align 8
-; CHECK-NEXT: [[TMP55:%.*]] = icmp sgt i32 [[TMP53]], [[TMP54]]
-; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 [[TMP53]], i32 [[TMP54]]
-; CHECK-NEXT: [[TMP57:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 19), align 4
-; CHECK-NEXT: [[TMP58:%.*]] = icmp sgt i32 [[TMP56]], [[TMP57]]
-; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], i32 [[TMP56]], i32 [[TMP57]]
-; CHECK-NEXT: [[TMP60:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 20), align 16
-; CHECK-NEXT: [[TMP61:%.*]] = icmp sgt i32 [[TMP59]], [[TMP60]]
-; CHECK-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP59]], i32 [[TMP60]]
-; CHECK-NEXT: [[TMP63:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 21), align 4
-; CHECK-NEXT: [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], [[TMP63]]
-; CHECK-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], i32 [[TMP62]], i32 [[TMP63]]
-; CHECK-NEXT: [[TMP66:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 22), align 8
-; CHECK-NEXT: [[TMP67:%.*]] = icmp sgt i32 [[TMP65]], [[TMP66]]
-; CHECK-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], i32 [[TMP65]], i32 [[TMP66]]
-; CHECK-NEXT: [[TMP69:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 23), align 4
-; CHECK-NEXT: [[TMP70:%.*]] = icmp sgt i32 [[TMP68]], [[TMP69]]
-; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP68]], i32 [[TMP69]]
-; CHECK-NEXT: [[TMP72:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 24), align 16
-; CHECK-NEXT: [[TMP73:%.*]] = icmp sgt i32 [[TMP71]], [[TMP72]]
-; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], i32 [[TMP71]], i32 [[TMP72]]
-; CHECK-NEXT: [[TMP75:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 25), align 4
-; CHECK-NEXT: [[TMP76:%.*]] = icmp sgt i32 [[TMP74]], [[TMP75]]
-; CHECK-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], i32 [[TMP74]], i32 [[TMP75]]
-; CHECK-NEXT: [[TMP78:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 26), align 8
-; CHECK-NEXT: [[TMP79:%.*]] = icmp sgt i32 [[TMP77]], [[TMP78]]
-; CHECK-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], i32 [[TMP77]], i32 [[TMP78]]
-; CHECK-NEXT: [[TMP81:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 27), align 4
-; CHECK-NEXT: [[TMP82:%.*]] = icmp sgt i32 [[TMP80]], [[TMP81]]
-; CHECK-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], i32 [[TMP80]], i32 [[TMP81]]
-; CHECK-NEXT: [[TMP84:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 28), align 16
-; CHECK-NEXT: [[TMP85:%.*]] = icmp sgt i32 [[TMP83]], [[TMP84]]
-; CHECK-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], i32 [[TMP83]], i32 [[TMP84]]
-; CHECK-NEXT: [[TMP87:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 29), align 4
-; CHECK-NEXT: [[TMP88:%.*]] = icmp sgt i32 [[TMP86]], [[TMP87]]
-; CHECK-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], i32 [[TMP86]], i32 [[TMP87]]
-; CHECK-NEXT: [[TMP90:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 30), align 8
-; CHECK-NEXT: [[TMP91:%.*]] = icmp sgt i32 [[TMP89]], [[TMP90]]
-; CHECK-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], i32 [[TMP89]], i32 [[TMP90]]
-; CHECK-NEXT: [[TMP93:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 31), align 4
-; CHECK-NEXT: [[TMP94:%.*]] = icmp sgt i32 [[TMP92]], [[TMP93]]
-; CHECK-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], i32 [[TMP92]], i32 [[TMP93]]
-; CHECK-NEXT: ret i32 [[TMP95]]
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16
+; CHECK: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32>
+; CHECK-NEXT: [[TMP96:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[BIN_RDX]], <32 x i32> undef, <32 x i32>
+; CHECK-NEXT: [[TMP97:%.*]] = icmp sgt <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = select <32 x i1> [[TMP97]], <32 x i32> [[BIN_RDX]], <32 x i32> [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x i32> [[BIN_RDX2]], <32 x i32> undef, <32 x i32>
+; CHECK-NEXT: [[TMP98:%.*]] = icmp sgt <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x i32> [[BIN_RDX2]], <32 x i32> [[RDX_SHUF3]]
+; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x i32> [[BIN_RDX4]], <32 x i32> undef, <32 x i32>
+; CHECK-NEXT: [[TMP99:%.*]] = icmp sgt <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
+; CHECK-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x i32> [[BIN_RDX4]], <32 x i32> [[RDX_SHUF5]]
+; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[BIN_RDX6]], <32 x i32> undef, <32 x i32>
+; CHECK-NEXT: [[TMP100:%.*]] = icmp sgt <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]]
+; CHECK-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x i32> [[BIN_RDX6]], <32 x i32> [[RDX_SHUF7]]
+; CHECK-NEXT: [[TMP101:%.*]] = extractelement <32 x i32> [[BIN_RDX8]], i32 0
+; CHECK: ret i32 [[TMP101]]
;
; AVX-LABEL: @maxi32(
-; AVX-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
-; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
-; AVX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
-; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
-; AVX-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
-; AVX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
-; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
-; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
-; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
-; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
-; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; AVX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
-; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
-; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; AVX-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
-; AVX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
-; AVX-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; AVX-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
-; AVX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
-; AVX-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; AVX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
-; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
-; AVX-NEXT: [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16
-; AVX-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]]
-; AVX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]]
-; AVX-NEXT: [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4
-; AVX-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]]
-; AVX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]]
-; AVX-NEXT: [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8
-; AVX-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]]
-; AVX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]]
-; AVX-NEXT: [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4
-; AVX-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]]
-; AVX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]]
-; AVX-NEXT: [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16
-; AVX-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
-; AVX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]]
-; AVX-NEXT: [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4
-; AVX-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
-; AVX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]]
-; AVX-NEXT: [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8
-; AVX-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
-; AVX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]]
-; AVX-NEXT: [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
-; AVX-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
-; AVX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]]
-; AVX-NEXT: [[TMP48:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 16), align 16
-; AVX-NEXT: [[TMP49:%.*]] = icmp sgt i32 [[TMP47]], [[TMP48]]
-; AVX-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[TMP47]], i32 [[TMP48]]
-; AVX-NEXT: [[TMP51:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 17), align 4
-; AVX-NEXT: [[TMP52:%.*]] = icmp sgt i32 [[TMP50]], [[TMP51]]
-; AVX-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], i32 [[TMP50]], i32 [[TMP51]]
-; AVX-NEXT: [[TMP54:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 18), align 8
-; AVX-NEXT: [[TMP55:%.*]] = icmp sgt i32 [[TMP53]], [[TMP54]]
-; AVX-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 [[TMP53]], i32 [[TMP54]]
-; AVX-NEXT: [[TMP57:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 19), align 4
-; AVX-NEXT: [[TMP58:%.*]] = icmp sgt i32 [[TMP56]], [[TMP57]]
-; AVX-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], i32 [[TMP56]], i32 [[TMP57]]
-; AVX-NEXT: [[TMP60:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 20), align 16
-; AVX-NEXT: [[TMP61:%.*]] = icmp sgt i32 [[TMP59]], [[TMP60]]
-; AVX-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP59]], i32 [[TMP60]]
-; AVX-NEXT: [[TMP63:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 21), align 4
-; AVX-NEXT: [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], [[TMP63]]
-; AVX-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], i32 [[TMP62]], i32 [[TMP63]]
-; AVX-NEXT: [[TMP66:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 22), align 8
-; AVX-NEXT: [[TMP67:%.*]] = icmp sgt i32 [[TMP65]], [[TMP66]]
-; AVX-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], i32 [[TMP65]], i32 [[TMP66]]
-; AVX-NEXT: [[TMP69:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 23), align 4
-; AVX-NEXT: [[TMP70:%.*]] = icmp sgt i32 [[TMP68]], [[TMP69]]
-; AVX-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP68]], i32 [[TMP69]]
-; AVX-NEXT: [[TMP72:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 24), align 16
-; AVX-NEXT: [[TMP73:%.*]] = icmp sgt i32 [[TMP71]], [[TMP72]]
-; AVX-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], i32 [[TMP71]], i32 [[TMP72]]
-; AVX-NEXT: [[TMP75:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 25), align 4
-; AVX-NEXT: [[TMP76:%.*]] = icmp sgt i32 [[TMP74]], [[TMP75]]
-; AVX-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], i32 [[TMP74]], i32 [[TMP75]]
-; AVX-NEXT: [[TMP78:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 26), align 8
-; AVX-NEXT: [[TMP79:%.*]] = icmp sgt i32 [[TMP77]], [[TMP78]]
-; AVX-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], i32 [[TMP77]], i32 [[TMP78]]
-; AVX-NEXT: [[TMP81:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 27), align 4
-; AVX-NEXT: [[TMP82:%.*]] = icmp sgt i32 [[TMP80]], [[TMP81]]
-; AVX-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], i32 [[TMP80]], i32 [[TMP81]]
-; AVX-NEXT: [[TMP84:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 28), align 16
-; AVX-NEXT: [[TMP85:%.*]] = icmp sgt i32 [[TMP83]], [[TMP84]]
-; AVX-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], i32 [[TMP83]], i32 [[TMP84]]
-; AVX-NEXT: [[TMP87:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 29), align 4
-; AVX-NEXT: [[TMP88:%.*]] = icmp sgt i32 [[TMP86]], [[TMP87]]
-; AVX-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], i32 [[TMP86]], i32 [[TMP87]]
-; AVX-NEXT: [[TMP90:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 30), align 8
-; AVX-NEXT: [[TMP91:%.*]] = icmp sgt i32 [[TMP89]], [[TMP90]]
-; AVX-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], i32 [[TMP89]], i32 [[TMP90]]
-; AVX-NEXT: [[TMP93:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 31), align 4
-; AVX-NEXT: [[TMP94:%.*]] = icmp sgt i32 [[TMP92]], [[TMP93]]
-; AVX-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], i32 [[TMP92]], i32 [[TMP93]]
-; AVX-NEXT: ret i32 [[TMP95]]
+; AVX-NEXT: [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16
+; AVX: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32>
+; AVX-NEXT: [[TMP96:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]]
+; AVX-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]]
+; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[BIN_RDX]], <32 x i32> undef, <32 x i32>
+; AVX-NEXT: [[TMP97:%.*]] = icmp sgt <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; AVX-NEXT: [[BIN_RDX2:%.*]] = select <32 x i1> [[TMP97]], <32 x i32> [[BIN_RDX]], <32 x i32> [[RDX_SHUF1]]
+; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x i32> [[BIN_RDX2]], <32 x i32> undef, <32 x i32>
+; AVX-NEXT: [[TMP98:%.*]] = icmp sgt <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; AVX-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x i32> [[BIN_RDX2]], <32 x i32> [[RDX_SHUF3]]
+; AVX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x i32> [[BIN_RDX4]], <32 x i32> undef, <32 x i32>
+; AVX-NEXT: [[TMP99:%.*]] = icmp sgt <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
+; AVX-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x i32> [[BIN_RDX4]], <32 x i32> [[RDX_SHUF5]]
+; AVX-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[BIN_RDX6]], <32 x i32> undef, <32 x i32>
+; AVX-NEXT: [[TMP100:%.*]] = icmp sgt <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]]
+; AVX-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x i32> [[BIN_RDX6]], <32 x i32> [[RDX_SHUF7]]
+; AVX-NEXT: [[TMP101:%.*]] = extractelement <32 x i32> [[BIN_RDX8]], i32 0
+; AVX: ret i32 [[TMP101]]
;
; AVX2-LABEL: @maxi32(
-; AVX2-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
-; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
-; AVX2-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
-; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
-; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
-; AVX2-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
-; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
-; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
-; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
-; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
-; AVX2-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; AVX2-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
-; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
-; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; AVX2-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
-; AVX2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
-; AVX2-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; AVX2-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
-; AVX2-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
-; AVX2-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; AVX2-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
-; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
-; AVX2-NEXT: [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16
-; AVX2-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]]
-; AVX2-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]]
-; AVX2-NEXT: [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4
-; AVX2-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]]
-; AVX2-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]]
-; AVX2-NEXT: [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8
-; AVX2-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]]
-; AVX2-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]]
-; AVX2-NEXT: [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4
-; AVX2-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]]
-; AVX2-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]]
-; AVX2-NEXT: [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16
-; AVX2-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
-; AVX2-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]]
-; AVX2-NEXT: [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4
-; AVX2-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
-; AVX2-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]]
-; AVX2-NEXT: [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8
-; AVX2-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
-; AVX2-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]]
-; AVX2-NEXT: [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
-; AVX2-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
-; AVX2-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]]
-; AVX2-NEXT: [[TMP48:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 16), align 16
-; AVX2-NEXT: [[TMP49:%.*]] = icmp sgt i32 [[TMP47]], [[TMP48]]
-; AVX2-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[TMP47]], i32 [[TMP48]]
-; AVX2-NEXT: [[TMP51:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 17), align 4
-; AVX2-NEXT: [[TMP52:%.*]] = icmp sgt i32 [[TMP50]], [[TMP51]]
-; AVX2-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], i32 [[TMP50]], i32 [[TMP51]]
-; AVX2-NEXT: [[TMP54:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 18), align 8
-; AVX2-NEXT: [[TMP55:%.*]] = icmp sgt i32 [[TMP53]], [[TMP54]]
-; AVX2-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 [[TMP53]], i32 [[TMP54]]
-; AVX2-NEXT: [[TMP57:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 19), align 4
-; AVX2-NEXT: [[TMP58:%.*]] = icmp sgt i32 [[TMP56]], [[TMP57]]
-; AVX2-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], i32 [[TMP56]], i32 [[TMP57]]
-; AVX2-NEXT: [[TMP60:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 20), align 16
-; AVX2-NEXT: [[TMP61:%.*]] = icmp sgt i32 [[TMP59]], [[TMP60]]
-; AVX2-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP59]], i32 [[TMP60]]
-; AVX2-NEXT: [[TMP63:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 21), align 4
-; AVX2-NEXT: [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], [[TMP63]]
-; AVX2-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], i32 [[TMP62]], i32 [[TMP63]]
-; AVX2-NEXT: [[TMP66:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 22), align 8
-; AVX2-NEXT: [[TMP67:%.*]] = icmp sgt i32 [[TMP65]], [[TMP66]]
-; AVX2-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], i32 [[TMP65]], i32 [[TMP66]]
-; AVX2-NEXT: [[TMP69:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 23), align 4
-; AVX2-NEXT: [[TMP70:%.*]] = icmp sgt i32 [[TMP68]], [[TMP69]]
-; AVX2-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP68]], i32 [[TMP69]]
-; AVX2-NEXT: [[TMP72:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 24), align 16
-; AVX2-NEXT: [[TMP73:%.*]] = icmp sgt i32 [[TMP71]], [[TMP72]]
-; AVX2-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], i32 [[TMP71]], i32 [[TMP72]]
-; AVX2-NEXT: [[TMP75:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 25), align 4
-; AVX2-NEXT: [[TMP76:%.*]] = icmp sgt i32 [[TMP74]], [[TMP75]]
-; AVX2-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], i32 [[TMP74]], i32 [[TMP75]]
-; AVX2-NEXT: [[TMP78:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 26), align 8
-; AVX2-NEXT: [[TMP79:%.*]] = icmp sgt i32 [[TMP77]], [[TMP78]]
-; AVX2-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], i32 [[TMP77]], i32 [[TMP78]]
-; AVX2-NEXT: [[TMP81:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 27), align 4
-; AVX2-NEXT: [[TMP82:%.*]] = icmp sgt i32 [[TMP80]], [[TMP81]]
-; AVX2-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], i32 [[TMP80]], i32 [[TMP81]]
-; AVX2-NEXT: [[TMP84:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 28), align 16
-; AVX2-NEXT: [[TMP85:%.*]] = icmp sgt i32 [[TMP83]], [[TMP84]]
-; AVX2-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], i32 [[TMP83]], i32 [[TMP84]]
-; AVX2-NEXT: [[TMP87:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 29), align 4
-; AVX2-NEXT: [[TMP88:%.*]] = icmp sgt i32 [[TMP86]], [[TMP87]]
-; AVX2-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], i32 [[TMP86]], i32 [[TMP87]]
-; AVX2-NEXT: [[TMP90:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 30), align 8
-; AVX2-NEXT: [[TMP91:%.*]] = icmp sgt i32 [[TMP89]], [[TMP90]]
-; AVX2-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], i32 [[TMP89]], i32 [[TMP90]]
-; AVX2-NEXT: [[TMP93:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 31), align 4
-; AVX2-NEXT: [[TMP94:%.*]] = icmp sgt i32 [[TMP92]], [[TMP93]]
-; AVX2-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], i32 [[TMP92]], i32 [[TMP93]]
-; AVX2-NEXT: ret i32 [[TMP95]]
+; AVX2-NEXT: [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16
+; AVX2: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32>
+; AVX2-NEXT: [[TMP96:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]]
+; AVX2-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]]
+; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[BIN_RDX]], <32 x i32> undef, <32 x i32>
+; AVX2-NEXT: [[TMP97:%.*]] = icmp sgt <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; AVX2-NEXT: [[BIN_RDX2:%.*]] = select <32 x i1> [[TMP97]], <32 x i32> [[BIN_RDX]], <32 x i32> [[RDX_SHUF1]]
+; AVX2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x i32> [[BIN_RDX2]], <32 x i32> undef, <32 x i32>
+; AVX2-NEXT: [[TMP98:%.*]] = icmp sgt <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; AVX2-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x i32> [[BIN_RDX2]], <32 x i32> [[RDX_SHUF3]]
+; AVX2-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x i32> [[BIN_RDX4]], <32 x i32> undef, <32 x i32>
+; AVX2-NEXT: [[TMP99:%.*]] = icmp sgt <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
+; AVX2-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x i32> [[BIN_RDX4]], <32 x i32> [[RDX_SHUF5]]
+; AVX2-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[BIN_RDX6]], <32 x i32> undef, <32 x i32>
+; AVX2-NEXT: [[TMP100:%.*]] = icmp sgt <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]]
+; AVX2-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x i32> [[BIN_RDX6]], <32 x i32> [[RDX_SHUF7]]
+; AVX2-NEXT: [[TMP101:%.*]] = extractelement <32 x i32> [[BIN_RDX8]], i32 0
+; AVX2: ret i32 [[TMP101]]
;
; SKX-LABEL: @maxi32(
-; SKX-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
-; SKX-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
-; SKX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
-; SKX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
-; SKX-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
-; SKX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
-; SKX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
-; SKX-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
-; SKX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
-; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
-; SKX-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; SKX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
-; SKX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
-; SKX-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; SKX-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
-; SKX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
-; SKX-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; SKX-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
-; SKX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
-; SKX-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; SKX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
-; SKX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
-; SKX-NEXT: [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16
-; SKX-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]]
-; SKX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]]
-; SKX-NEXT: [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4
-; SKX-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]]
-; SKX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]]
-; SKX-NEXT: [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8
-; SKX-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]]
-; SKX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]]
-; SKX-NEXT: [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4
-; SKX-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]]
-; SKX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]]
-; SKX-NEXT: [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16
-; SKX-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
-; SKX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]]
-; SKX-NEXT: [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4
-; SKX-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
-; SKX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]]
-; SKX-NEXT: [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8
-; SKX-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
-; SKX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]]
-; SKX-NEXT: [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
-; SKX-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
-; SKX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]]
-; SKX-NEXT: [[TMP48:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 16), align 16
-; SKX-NEXT: [[TMP49:%.*]] = icmp sgt i32 [[TMP47]], [[TMP48]]
-; SKX-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[TMP47]], i32 [[TMP48]]
-; SKX-NEXT: [[TMP51:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 17), align 4
-; SKX-NEXT: [[TMP52:%.*]] = icmp sgt i32 [[TMP50]], [[TMP51]]
-; SKX-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], i32 [[TMP50]], i32 [[TMP51]]
-; SKX-NEXT: [[TMP54:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 18), align 8
-; SKX-NEXT: [[TMP55:%.*]] = icmp sgt i32 [[TMP53]], [[TMP54]]
-; SKX-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 [[TMP53]], i32 [[TMP54]]
-; SKX-NEXT: [[TMP57:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 19), align 4
-; SKX-NEXT: [[TMP58:%.*]] = icmp sgt i32 [[TMP56]], [[TMP57]]
-; SKX-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], i32 [[TMP56]], i32 [[TMP57]]
-; SKX-NEXT: [[TMP60:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 20), align 16
-; SKX-NEXT: [[TMP61:%.*]] = icmp sgt i32 [[TMP59]], [[TMP60]]
-; SKX-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP59]], i32 [[TMP60]]
-; SKX-NEXT: [[TMP63:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 21), align 4
-; SKX-NEXT: [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], [[TMP63]]
-; SKX-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], i32 [[TMP62]], i32 [[TMP63]]
-; SKX-NEXT: [[TMP66:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 22), align 8
-; SKX-NEXT: [[TMP67:%.*]] = icmp sgt i32 [[TMP65]], [[TMP66]]
-; SKX-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], i32 [[TMP65]], i32 [[TMP66]]
-; SKX-NEXT: [[TMP69:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 23), align 4
-; SKX-NEXT: [[TMP70:%.*]] = icmp sgt i32 [[TMP68]], [[TMP69]]
-; SKX-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP68]], i32 [[TMP69]]
-; SKX-NEXT: [[TMP72:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 24), align 16
-; SKX-NEXT: [[TMP73:%.*]] = icmp sgt i32 [[TMP71]], [[TMP72]]
-; SKX-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], i32 [[TMP71]], i32 [[TMP72]]
-; SKX-NEXT: [[TMP75:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 25), align 4
-; SKX-NEXT: [[TMP76:%.*]] = icmp sgt i32 [[TMP74]], [[TMP75]]
-; SKX-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], i32 [[TMP74]], i32 [[TMP75]]
-; SKX-NEXT: [[TMP78:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 26), align 8
-; SKX-NEXT: [[TMP79:%.*]] = icmp sgt i32 [[TMP77]], [[TMP78]]
-; SKX-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], i32 [[TMP77]], i32 [[TMP78]]
-; SKX-NEXT: [[TMP81:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 27), align 4
-; SKX-NEXT: [[TMP82:%.*]] = icmp sgt i32 [[TMP80]], [[TMP81]]
-; SKX-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], i32 [[TMP80]], i32 [[TMP81]]
-; SKX-NEXT: [[TMP84:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 28), align 16
-; SKX-NEXT: [[TMP85:%.*]] = icmp sgt i32 [[TMP83]], [[TMP84]]
-; SKX-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], i32 [[TMP83]], i32 [[TMP84]]
-; SKX-NEXT: [[TMP87:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 29), align 4
-; SKX-NEXT: [[TMP88:%.*]] = icmp sgt i32 [[TMP86]], [[TMP87]]
-; SKX-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], i32 [[TMP86]], i32 [[TMP87]]
-; SKX-NEXT: [[TMP90:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 30), align 8
-; SKX-NEXT: [[TMP91:%.*]] = icmp sgt i32 [[TMP89]], [[TMP90]]
-; SKX-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], i32 [[TMP89]], i32 [[TMP90]]
-; SKX-NEXT: [[TMP93:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 31), align 4
-; SKX-NEXT: [[TMP94:%.*]] = icmp sgt i32 [[TMP92]], [[TMP93]]
-; SKX-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], i32 [[TMP92]], i32 [[TMP93]]
-; SKX-NEXT: ret i32 [[TMP95]]
+; SKX-NEXT: [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16
+; SKX: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32>
+; SKX-NEXT: [[TMP96:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]]
+; SKX-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]]
+; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[BIN_RDX]], <32 x i32> undef, <32 x i32>
+; SKX-NEXT: [[TMP97:%.*]] = icmp sgt <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; SKX-NEXT: [[BIN_RDX2:%.*]] = select <32 x i1> [[TMP97]], <32 x i32> [[BIN_RDX]], <32 x i32> [[RDX_SHUF1]]
+; SKX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x i32> [[BIN_RDX2]], <32 x i32> undef, <32 x i32>
+; SKX-NEXT: [[TMP98:%.*]] = icmp sgt <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; SKX-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x i32> [[BIN_RDX2]], <32 x i32> [[RDX_SHUF3]]
+; SKX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x i32> [[BIN_RDX4]], <32 x i32> undef, <32 x i32>
+; SKX-NEXT: [[TMP99:%.*]] = icmp sgt <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
+; SKX-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x i32> [[BIN_RDX4]], <32 x i32> [[RDX_SHUF5]]
+; SKX-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[BIN_RDX6]], <32 x i32> undef, <32 x i32>
+; SKX-NEXT: [[TMP100:%.*]] = icmp sgt <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]]
+; SKX-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x i32> [[BIN_RDX6]], <32 x i32> [[RDX_SHUF7]]
+; SKX-NEXT: [[TMP101:%.*]] = extractelement <32 x i32> [[BIN_RDX8]], i32 0
+; SKX: ret i32 [[TMP101]]
;
%2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
%3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
@@ -892,79 +455,46 @@
; CHECK-NEXT: ret float [[TMP23]]
;
; AVX-LABEL: @maxf8(
-; AVX-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
-; AVX-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
-; AVX-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
-; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]
-; AVX-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
-; AVX-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]
-; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
-; AVX-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
-; AVX-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]
-; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]
-; AVX-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
-; AVX-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]
-; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
-; AVX-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
-; AVX-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
-; AVX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
-; AVX-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
-; AVX-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
-; AVX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
-; AVX-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
-; AVX-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
-; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
-; AVX-NEXT: ret float [[TMP23]]
+; AVX-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr1 to <8 x float>*), align 16
+; AVX: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> undef, <8 x i32>
+; AVX-NEXT: [[TMP24:%.*]] = fcmp fast ogt <8 x float> [[TMP2]], [[RDX_SHUF]]
+; AVX-NEXT: [[BIN_RDX:%.*]] = select <8 x i1> [[TMP24]], <8 x float> [[TMP2]], <8 x float> [[RDX_SHUF]]
+; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32>
+; AVX-NEXT: [[TMP25:%.*]] = fcmp fast ogt <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; AVX-NEXT: [[BIN_RDX2:%.*]] = select <8 x i1> [[TMP25]], <8 x float> [[BIN_RDX]], <8 x float> [[RDX_SHUF1]]
+; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32>
+; AVX-NEXT: [[TMP26:%.*]] = fcmp fast ogt <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; AVX-NEXT: [[BIN_RDX4:%.*]] = select <8 x i1> [[TMP26]], <8 x float> [[BIN_RDX2]], <8 x float> [[RDX_SHUF3]]
+; AVX-NEXT: [[TMP27:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; AVX: ret float [[TMP27]]
;
; AVX2-LABEL: @maxf8(
-; AVX2-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
-; AVX2-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
-; AVX2-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
-; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]
-; AVX2-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
-; AVX2-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]
-; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
-; AVX2-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
-; AVX2-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]
-; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]
-; AVX2-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
-; AVX2-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]
-; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
-; AVX2-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
-; AVX2-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
-; AVX2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
-; AVX2-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
-; AVX2-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
-; AVX2-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
-; AVX2-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
-; AVX2-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
-; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
-; AVX2-NEXT: ret float [[TMP23]]
+; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr1 to <8 x float>*), align 16
+; AVX2: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> undef, <8 x i32>
+; AVX2-NEXT: [[TMP24:%.*]] = fcmp fast ogt <8 x float> [[TMP2]], [[RDX_SHUF]]
+; AVX2-NEXT: [[BIN_RDX:%.*]] = select <8 x i1> [[TMP24]], <8 x float> [[TMP2]], <8 x float> [[RDX_SHUF]]
+; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32>
+; AVX2-NEXT: [[TMP25:%.*]] = fcmp fast ogt <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; AVX2-NEXT: [[BIN_RDX2:%.*]] = select <8 x i1> [[TMP25]], <8 x float> [[BIN_RDX]], <8 x float> [[RDX_SHUF1]]
+; AVX2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32>
+; AVX2-NEXT: [[TMP26:%.*]] = fcmp fast ogt <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; AVX2-NEXT: [[BIN_RDX4:%.*]] = select <8 x i1> [[TMP26]], <8 x float> [[BIN_RDX2]], <8 x float> [[RDX_SHUF3]]
+; AVX2-NEXT: [[TMP27:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; AVX2: ret float [[TMP27]]
;
; SKX-LABEL: @maxf8(
-; SKX-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
-; SKX-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
-; SKX-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
-; SKX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]
-; SKX-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
-; SKX-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]
-; SKX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
-; SKX-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
-; SKX-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]
-; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]
-; SKX-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
-; SKX-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]
-; SKX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
-; SKX-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
-; SKX-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
-; SKX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
-; SKX-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1,
i64 0, i64 6), align 8 -; SKX-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] -; SKX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] -; SKX-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 -; SKX-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] -; SKX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] -; SKX-NEXT: ret float [[TMP23]] +; SKX-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr1 to <8 x float>*), align 16 +; SKX: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> undef, <8 x i32> +; SKX-NEXT: [[TMP24:%.*]] = fcmp fast ogt <8 x float> [[TMP2]], [[RDX_SHUF]] +; SKX-NEXT: [[BIN_RDX:%.*]] = select <8 x i1> [[TMP24]], <8 x float> [[TMP2]], <8 x float> [[RDX_SHUF]] +; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> +; SKX-NEXT: [[TMP25:%.*]] = fcmp fast ogt <8 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; SKX-NEXT: [[BIN_RDX2:%.*]] = select <8 x i1> [[TMP25]], <8 x float> [[BIN_RDX]], <8 x float> [[RDX_SHUF1]] +; SKX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> +; SKX-NEXT: [[TMP26:%.*]] = fcmp fast ogt <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; SKX-NEXT: [[BIN_RDX4:%.*]] = select <8 x i1> [[TMP26]], <8 x float> [[BIN_RDX2]], <8 x float> [[RDX_SHUF3]] +; SKX-NEXT: [[TMP27:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 +; SKX: ret float [[TMP27]] ; %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 @@ -1042,151 +572,55 @@ ; CHECK-NEXT: ret float [[TMP47]] ; ; AVX-LABEL: @maxf16( -; AVX-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 -; AVX-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 -; AVX-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] -; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 -; AVX-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] -; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] -; AVX-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 -; AVX-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] -; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]] -; AVX-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 -; AVX-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] -; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]] -; AVX-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 -; AVX-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]] -; AVX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] -; AVX-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 -; AVX-NEXT: [[TMP19:%.*]] 
= fcmp fast ogt float [[TMP17]], [[TMP18]] -; AVX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] -; AVX-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 -; AVX-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] -; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] -; AVX-NEXT: [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16 -; AVX-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]] -; AVX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]] -; AVX-NEXT: [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4 -; AVX-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]] -; AVX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]] -; AVX-NEXT: [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8 -; AVX-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]] -; AVX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]] -; AVX-NEXT: [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4 -; AVX-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]] -; AVX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]] -; AVX-NEXT: [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16 -; AVX-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]] -; AVX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]] -; AVX-NEXT: [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4 -; AVX-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]] -; AVX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]] -; AVX-NEXT: [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8 -; AVX-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]] -; AVX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]] -; AVX-NEXT: [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4 -; AVX-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]] -; AVX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]] -; AVX-NEXT: ret float [[TMP47]] +; AVX-NEXT: [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr1 to <16 x float>*), align 16 +; AVX: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> undef, <16 x i32> +; AVX-NEXT: [[TMP48:%.*]] = fcmp fast ogt <16 x float> [[TMP2]], [[RDX_SHUF]] +; AVX-NEXT: [[BIN_RDX:%.*]] = select <16 x i1> [[TMP48]], <16 x float> [[TMP2]], <16 x float> [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> +; AVX-NEXT: [[TMP49:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[BIN_RDX2:%.*]] = select <16 x i1> [[TMP49]], <16 x float> [[BIN_RDX]], <16 x float> [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32> +; AVX-NEXT: [[TMP50:%.*]] = fcmp 
fast ogt <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[BIN_RDX4:%.*]] = select <16 x i1> [[TMP50]], <16 x float> [[BIN_RDX2]], <16 x float> [[RDX_SHUF3]] +; AVX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32> +; AVX-NEXT: [[TMP51:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX4]], [[RDX_SHUF5]] +; AVX-NEXT: [[BIN_RDX6:%.*]] = select <16 x i1> [[TMP51]], <16 x float> [[BIN_RDX4]], <16 x float> [[RDX_SHUF5]] +; AVX-NEXT: [[TMP52:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0 +; AVX: ret float [[TMP52]] ; ; AVX2-LABEL: @maxf16( -; AVX2-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 -; AVX2-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 -; AVX2-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] -; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] -; AVX2-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 -; AVX2-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] -; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] -; AVX2-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 -; AVX2-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] -; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]] -; AVX2-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 -; AVX2-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] -; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]] -; AVX2-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 -; AVX2-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]] -; AVX2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] -; AVX2-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 -; AVX2-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] -; AVX2-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] -; AVX2-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 -; AVX2-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] -; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] -; AVX2-NEXT: [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16 -; AVX2-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]] -; AVX2-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]] -; AVX2-NEXT: [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4 -; AVX2-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]] -; AVX2-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]] -; AVX2-NEXT: [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8 -; AVX2-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]] -; AVX2-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], 
float [[TMP30]] -; AVX2-NEXT: [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4 -; AVX2-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]] -; AVX2-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]] -; AVX2-NEXT: [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16 -; AVX2-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]] -; AVX2-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]] -; AVX2-NEXT: [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4 -; AVX2-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]] -; AVX2-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]] -; AVX2-NEXT: [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8 -; AVX2-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]] -; AVX2-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]] -; AVX2-NEXT: [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4 -; AVX2-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]] -; AVX2-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]] -; AVX2-NEXT: ret float [[TMP47]] +; AVX2-NEXT: [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr1 to <16 x float>*), align 16 +; AVX2: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> undef, <16 x i32> +; AVX2-NEXT: [[TMP48:%.*]] = fcmp fast ogt <16 x float> [[TMP2]], [[RDX_SHUF]] +; AVX2-NEXT: [[BIN_RDX:%.*]] = select <16 x i1> [[TMP48]], <16 x float> [[TMP2]], <16 x float> [[RDX_SHUF]] +; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> +; AVX2-NEXT: [[TMP49:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX2-NEXT: [[BIN_RDX2:%.*]] = select <16 x i1> [[TMP49]], <16 x float> [[BIN_RDX]], <16 x float> [[RDX_SHUF1]] +; AVX2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32> +; AVX2-NEXT: [[TMP50:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX2-NEXT: [[BIN_RDX4:%.*]] = select <16 x i1> [[TMP50]], <16 x float> [[BIN_RDX2]], <16 x float> [[RDX_SHUF3]] +; AVX2-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32> +; AVX2-NEXT: [[TMP51:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX4]], [[RDX_SHUF5]] +; AVX2-NEXT: [[BIN_RDX6:%.*]] = select <16 x i1> [[TMP51]], <16 x float> [[BIN_RDX4]], <16 x float> [[RDX_SHUF5]] +; AVX2-NEXT: [[TMP52:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0 +; AVX2: ret float [[TMP52]] ; ; SKX-LABEL: @maxf16( -; SKX-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 -; SKX-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 -; SKX-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] -; SKX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] -; SKX-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 -; SKX-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] -; SKX-NEXT: 
[[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] -; SKX-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 -; SKX-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] -; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]] -; SKX-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 -; SKX-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] -; SKX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]] -; SKX-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 -; SKX-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]] -; SKX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] -; SKX-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 -; SKX-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] -; SKX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] -; SKX-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 -; SKX-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] -; SKX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] -; SKX-NEXT: [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16 -; SKX-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]] -; SKX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]] -; SKX-NEXT: [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4 -; SKX-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]] -; SKX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]] -; SKX-NEXT: [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8 -; SKX-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]] -; SKX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]] -; SKX-NEXT: [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4 -; SKX-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]] -; SKX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]] -; SKX-NEXT: [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16 -; SKX-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]] -; SKX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]] -; SKX-NEXT: [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4 -; SKX-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]] -; SKX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]] -; SKX-NEXT: [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8 -; SKX-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]] -; SKX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]] -; SKX-NEXT: [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 
x float]* @arr1, i64 0, i64 15), align 4 -; SKX-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]] -; SKX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]] -; SKX-NEXT: ret float [[TMP47]] +; SKX-NEXT: [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr1 to <16 x float>*), align 16 +; SKX: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> undef, <16 x i32> +; SKX-NEXT: [[TMP48:%.*]] = fcmp fast ogt <16 x float> [[TMP2]], [[RDX_SHUF]] +; SKX-NEXT: [[BIN_RDX:%.*]] = select <16 x i1> [[TMP48]], <16 x float> [[TMP2]], <16 x float> [[RDX_SHUF]] +; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> +; SKX-NEXT: [[TMP49:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; SKX-NEXT: [[BIN_RDX2:%.*]] = select <16 x i1> [[TMP49]], <16 x float> [[BIN_RDX]], <16 x float> [[RDX_SHUF1]] +; SKX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32> +; SKX-NEXT: [[TMP50:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; SKX-NEXT: [[BIN_RDX4:%.*]] = select <16 x i1> [[TMP50]], <16 x float> [[BIN_RDX2]], <16 x float> [[RDX_SHUF3]] +; SKX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32> +; SKX-NEXT: [[TMP51:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX4]], [[RDX_SHUF5]] +; SKX-NEXT: [[BIN_RDX6:%.*]] = select <16 x i1> [[TMP51]], <16 x float> [[BIN_RDX4]], <16 x float> [[RDX_SHUF5]] +; SKX-NEXT: [[TMP52:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0 +; SKX: ret float [[TMP52]] ; %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 @@ -1336,295 +770,64 @@ ; CHECK-NEXT: ret float [[TMP95]] ; ; AVX-LABEL: @maxf32( -; AVX-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 -; AVX-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 -; AVX-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] -; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 -; AVX-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] -; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] -; AVX-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 -; AVX-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] -; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]] -; AVX-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 -; AVX-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] -; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]] -; AVX-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 -; AVX-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]] -; AVX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] -; AVX-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x 
float]* @arr1, i64 0, i64 6), align 8 -; AVX-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] -; AVX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] -; AVX-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 -; AVX-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] -; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] -; AVX-NEXT: [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16 -; AVX-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]] -; AVX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]] -; AVX-NEXT: [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4 -; AVX-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]] -; AVX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]] -; AVX-NEXT: [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8 -; AVX-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]] -; AVX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]] -; AVX-NEXT: [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4 -; AVX-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]] -; AVX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]] -; AVX-NEXT: [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16 -; AVX-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]] -; AVX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]] -; AVX-NEXT: [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4 -; AVX-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]] -; AVX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]] -; AVX-NEXT: [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8 -; AVX-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]] -; AVX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]] -; AVX-NEXT: [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4 -; AVX-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]] -; AVX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]] -; AVX-NEXT: [[TMP48:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 16), align 16 -; AVX-NEXT: [[TMP49:%.*]] = fcmp fast ogt float [[TMP47]], [[TMP48]] -; AVX-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], float [[TMP47]], float [[TMP48]] -; AVX-NEXT: [[TMP51:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 17), align 4 -; AVX-NEXT: [[TMP52:%.*]] = fcmp fast ogt float [[TMP50]], [[TMP51]] -; AVX-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], float [[TMP50]], float [[TMP51]] -; AVX-NEXT: [[TMP54:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 18), align 8 -; AVX-NEXT: [[TMP55:%.*]] = fcmp fast ogt float [[TMP53]], [[TMP54]] -; AVX-NEXT: [[TMP56:%.*]] = select i1 
[[TMP55]], float [[TMP53]], float [[TMP54]] -; AVX-NEXT: [[TMP57:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 19), align 4 -; AVX-NEXT: [[TMP58:%.*]] = fcmp fast ogt float [[TMP56]], [[TMP57]] -; AVX-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], float [[TMP56]], float [[TMP57]] -; AVX-NEXT: [[TMP60:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 20), align 16 -; AVX-NEXT: [[TMP61:%.*]] = fcmp fast ogt float [[TMP59]], [[TMP60]] -; AVX-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], float [[TMP59]], float [[TMP60]] -; AVX-NEXT: [[TMP63:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 21), align 4 -; AVX-NEXT: [[TMP64:%.*]] = fcmp fast ogt float [[TMP62]], [[TMP63]] -; AVX-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], float [[TMP62]], float [[TMP63]] -; AVX-NEXT: [[TMP66:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 22), align 8 -; AVX-NEXT: [[TMP67:%.*]] = fcmp fast ogt float [[TMP65]], [[TMP66]] -; AVX-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], float [[TMP65]], float [[TMP66]] -; AVX-NEXT: [[TMP69:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 23), align 4 -; AVX-NEXT: [[TMP70:%.*]] = fcmp fast ogt float [[TMP68]], [[TMP69]] -; AVX-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], float [[TMP68]], float [[TMP69]] -; AVX-NEXT: [[TMP72:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 24), align 16 -; AVX-NEXT: [[TMP73:%.*]] = fcmp fast ogt float [[TMP71]], [[TMP72]] -; AVX-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], float [[TMP71]], float [[TMP72]] -; AVX-NEXT: [[TMP75:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 25), align 4 -; AVX-NEXT: [[TMP76:%.*]] = fcmp fast ogt float [[TMP74]], [[TMP75]] -; AVX-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], float [[TMP74]], float [[TMP75]] -; AVX-NEXT: [[TMP78:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 26), align 8 -; AVX-NEXT: [[TMP79:%.*]] = fcmp fast ogt float [[TMP77]], [[TMP78]] -; AVX-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], float [[TMP77]], float [[TMP78]] -; AVX-NEXT: [[TMP81:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 27), align 4 -; AVX-NEXT: [[TMP82:%.*]] = fcmp fast ogt float [[TMP80]], [[TMP81]] -; AVX-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], float [[TMP80]], float [[TMP81]] -; AVX-NEXT: [[TMP84:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 28), align 16 -; AVX-NEXT: [[TMP85:%.*]] = fcmp fast ogt float [[TMP83]], [[TMP84]] -; AVX-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], float [[TMP83]], float [[TMP84]] -; AVX-NEXT: [[TMP87:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 29), align 4 -; AVX-NEXT: [[TMP88:%.*]] = fcmp fast ogt float [[TMP86]], [[TMP87]] -; AVX-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], float [[TMP86]], float [[TMP87]] -; AVX-NEXT: [[TMP90:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 30), align 8 -; AVX-NEXT: [[TMP91:%.*]] = fcmp fast ogt float [[TMP89]], [[TMP90]] -; AVX-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], float [[TMP89]], float [[TMP90]] -; AVX-NEXT: [[TMP93:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* 
@arr1, i64 0, i64 31), align 4 -; AVX-NEXT: [[TMP94:%.*]] = fcmp fast ogt float [[TMP92]], [[TMP93]] -; AVX-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], float [[TMP92]], float [[TMP93]] -; AVX-NEXT: ret float [[TMP95]] +; AVX-NEXT: [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16 +; AVX: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP2]], <32 x float> undef, <32 x i32> +; AVX-NEXT: [[TMP96:%.*]] = fcmp fast ogt <32 x float> [[TMP2]], [[RDX_SHUF]] +; AVX-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x float> [[TMP2]], <32 x float> [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> +; AVX-NEXT: [[TMP97:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[BIN_RDX2:%.*]] = select <32 x i1> [[TMP97]], <32 x float> [[BIN_RDX]], <32 x float> [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> +; AVX-NEXT: [[TMP98:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x float> [[BIN_RDX2]], <32 x float> [[RDX_SHUF3]] +; AVX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> +; AVX-NEXT: [[TMP99:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]] +; AVX-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x float> [[BIN_RDX4]], <32 x float> [[RDX_SHUF5]] +; AVX-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> +; AVX-NEXT: [[TMP100:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]] +; AVX-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x float> [[BIN_RDX6]], <32 x float> [[RDX_SHUF7]] +; AVX-NEXT: [[TMP101:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0 +; AVX: ret float [[TMP101]] ; ; AVX2-LABEL: @maxf32( -; AVX2-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 -; AVX2-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 -; AVX2-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] -; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] -; AVX2-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 -; AVX2-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] -; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] -; AVX2-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 -; AVX2-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] -; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]] -; AVX2-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 -; AVX2-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] -; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]] -; AVX2-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 -; AVX2-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]] -; AVX2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] -; AVX2-NEXT: [[TMP18:%.*]] = load float, float* 
getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 -; AVX2-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] -; AVX2-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] -; AVX2-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 -; AVX2-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] -; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] -; AVX2-NEXT: [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16 -; AVX2-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]] -; AVX2-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]] -; AVX2-NEXT: [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4 -; AVX2-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]] -; AVX2-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]] -; AVX2-NEXT: [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8 -; AVX2-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]] -; AVX2-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]] -; AVX2-NEXT: [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4 -; AVX2-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]] -; AVX2-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]] -; AVX2-NEXT: [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16 -; AVX2-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]] -; AVX2-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]] -; AVX2-NEXT: [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4 -; AVX2-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]] -; AVX2-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]] -; AVX2-NEXT: [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8 -; AVX2-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]] -; AVX2-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]] -; AVX2-NEXT: [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4 -; AVX2-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]] -; AVX2-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]] -; AVX2-NEXT: [[TMP48:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 16), align 16 -; AVX2-NEXT: [[TMP49:%.*]] = fcmp fast ogt float [[TMP47]], [[TMP48]] -; AVX2-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], float [[TMP47]], float [[TMP48]] -; AVX2-NEXT: [[TMP51:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 17), align 4 -; AVX2-NEXT: [[TMP52:%.*]] = fcmp fast ogt float [[TMP50]], [[TMP51]] -; AVX2-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], float [[TMP50]], float [[TMP51]] -; AVX2-NEXT: [[TMP54:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 18), align 8 -; AVX2-NEXT: [[TMP55:%.*]] 
= fcmp fast ogt float [[TMP53]], [[TMP54]] -; AVX2-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], float [[TMP53]], float [[TMP54]] -; AVX2-NEXT: [[TMP57:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 19), align 4 -; AVX2-NEXT: [[TMP58:%.*]] = fcmp fast ogt float [[TMP56]], [[TMP57]] -; AVX2-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], float [[TMP56]], float [[TMP57]] -; AVX2-NEXT: [[TMP60:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 20), align 16 -; AVX2-NEXT: [[TMP61:%.*]] = fcmp fast ogt float [[TMP59]], [[TMP60]] -; AVX2-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], float [[TMP59]], float [[TMP60]] -; AVX2-NEXT: [[TMP63:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 21), align 4 -; AVX2-NEXT: [[TMP64:%.*]] = fcmp fast ogt float [[TMP62]], [[TMP63]] -; AVX2-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], float [[TMP62]], float [[TMP63]] -; AVX2-NEXT: [[TMP66:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 22), align 8 -; AVX2-NEXT: [[TMP67:%.*]] = fcmp fast ogt float [[TMP65]], [[TMP66]] -; AVX2-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], float [[TMP65]], float [[TMP66]] -; AVX2-NEXT: [[TMP69:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 23), align 4 -; AVX2-NEXT: [[TMP70:%.*]] = fcmp fast ogt float [[TMP68]], [[TMP69]] -; AVX2-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], float [[TMP68]], float [[TMP69]] -; AVX2-NEXT: [[TMP72:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 24), align 16 -; AVX2-NEXT: [[TMP73:%.*]] = fcmp fast ogt float [[TMP71]], [[TMP72]] -; AVX2-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], float [[TMP71]], float [[TMP72]] -; AVX2-NEXT: [[TMP75:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 25), align 4 -; AVX2-NEXT: [[TMP76:%.*]] = fcmp fast ogt float [[TMP74]], [[TMP75]] -; AVX2-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], float [[TMP74]], float [[TMP75]] -; AVX2-NEXT: [[TMP78:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 26), align 8 -; AVX2-NEXT: [[TMP79:%.*]] = fcmp fast ogt float [[TMP77]], [[TMP78]] -; AVX2-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], float [[TMP77]], float [[TMP78]] -; AVX2-NEXT: [[TMP81:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 27), align 4 -; AVX2-NEXT: [[TMP82:%.*]] = fcmp fast ogt float [[TMP80]], [[TMP81]] -; AVX2-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], float [[TMP80]], float [[TMP81]] -; AVX2-NEXT: [[TMP84:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 28), align 16 -; AVX2-NEXT: [[TMP85:%.*]] = fcmp fast ogt float [[TMP83]], [[TMP84]] -; AVX2-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], float [[TMP83]], float [[TMP84]] -; AVX2-NEXT: [[TMP87:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 29), align 4 -; AVX2-NEXT: [[TMP88:%.*]] = fcmp fast ogt float [[TMP86]], [[TMP87]] -; AVX2-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], float [[TMP86]], float [[TMP87]] -; AVX2-NEXT: [[TMP90:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 30), align 8 -; AVX2-NEXT: [[TMP91:%.*]] = fcmp fast ogt float [[TMP89]], [[TMP90]] -; AVX2-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], float 
[[TMP89]], float [[TMP90]] -; AVX2-NEXT: [[TMP93:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 31), align 4 -; AVX2-NEXT: [[TMP94:%.*]] = fcmp fast ogt float [[TMP92]], [[TMP93]] -; AVX2-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], float [[TMP92]], float [[TMP93]] -; AVX2-NEXT: ret float [[TMP95]] +; AVX2-NEXT: [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16 +; AVX2: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP2]], <32 x float> undef, <32 x i32> +; AVX2-NEXT: [[TMP96:%.*]] = fcmp fast ogt <32 x float> [[TMP2]], [[RDX_SHUF]] +; AVX2-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x float> [[TMP2]], <32 x float> [[RDX_SHUF]] +; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> +; AVX2-NEXT: [[TMP97:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX2-NEXT: [[BIN_RDX2:%.*]] = select <32 x i1> [[TMP97]], <32 x float> [[BIN_RDX]], <32 x float> [[RDX_SHUF1]] +; AVX2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> +; AVX2-NEXT: [[TMP98:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX2-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x float> [[BIN_RDX2]], <32 x float> [[RDX_SHUF3]] +; AVX2-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> +; AVX2-NEXT: [[TMP99:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]] +; AVX2-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x float> [[BIN_RDX4]], <32 x float> [[RDX_SHUF5]] +; AVX2-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> +; AVX2-NEXT: [[TMP100:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]] +; AVX2-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x float> [[BIN_RDX6]], <32 x float> [[RDX_SHUF7]] +; AVX2-NEXT: [[TMP101:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0 +; AVX2: ret float [[TMP101]] ; ; SKX-LABEL: @maxf32( -; SKX-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 -; SKX-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 -; SKX-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] -; SKX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] -; SKX-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 -; SKX-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] -; SKX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] -; SKX-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 -; SKX-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] -; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]] -; SKX-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 -; SKX-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] -; SKX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]] -; SKX-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 -; SKX-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]] -; 
SKX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] -; SKX-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 -; SKX-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] -; SKX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] -; SKX-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 -; SKX-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] -; SKX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] -; SKX-NEXT: [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16 -; SKX-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]] -; SKX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]] -; SKX-NEXT: [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4 -; SKX-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]] -; SKX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]] -; SKX-NEXT: [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8 -; SKX-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]] -; SKX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]] -; SKX-NEXT: [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4 -; SKX-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]] -; SKX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]] -; SKX-NEXT: [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16 -; SKX-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]] -; SKX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]] -; SKX-NEXT: [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4 -; SKX-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]] -; SKX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]] -; SKX-NEXT: [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8 -; SKX-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]] -; SKX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]] -; SKX-NEXT: [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4 -; SKX-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]] -; SKX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]] -; SKX-NEXT: [[TMP48:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 16), align 16 -; SKX-NEXT: [[TMP49:%.*]] = fcmp fast ogt float [[TMP47]], [[TMP48]] -; SKX-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], float [[TMP47]], float [[TMP48]] -; SKX-NEXT: [[TMP51:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 17), align 4 -; SKX-NEXT: [[TMP52:%.*]] = fcmp fast ogt float [[TMP50]], [[TMP51]] -; SKX-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], float [[TMP50]], float [[TMP51]] -; SKX-NEXT: [[TMP54:%.*]] = load float, float* getelementptr 
inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 18), align 8 -; SKX-NEXT: [[TMP55:%.*]] = fcmp fast ogt float [[TMP53]], [[TMP54]] -; SKX-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], float [[TMP53]], float [[TMP54]] -; SKX-NEXT: [[TMP57:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 19), align 4 -; SKX-NEXT: [[TMP58:%.*]] = fcmp fast ogt float [[TMP56]], [[TMP57]] -; SKX-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], float [[TMP56]], float [[TMP57]] -; SKX-NEXT: [[TMP60:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 20), align 16 -; SKX-NEXT: [[TMP61:%.*]] = fcmp fast ogt float [[TMP59]], [[TMP60]] -; SKX-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], float [[TMP59]], float [[TMP60]] -; SKX-NEXT: [[TMP63:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 21), align 4 -; SKX-NEXT: [[TMP64:%.*]] = fcmp fast ogt float [[TMP62]], [[TMP63]] -; SKX-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], float [[TMP62]], float [[TMP63]] -; SKX-NEXT: [[TMP66:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 22), align 8 -; SKX-NEXT: [[TMP67:%.*]] = fcmp fast ogt float [[TMP65]], [[TMP66]] -; SKX-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], float [[TMP65]], float [[TMP66]] -; SKX-NEXT: [[TMP69:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 23), align 4 -; SKX-NEXT: [[TMP70:%.*]] = fcmp fast ogt float [[TMP68]], [[TMP69]] -; SKX-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], float [[TMP68]], float [[TMP69]] -; SKX-NEXT: [[TMP72:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 24), align 16 -; SKX-NEXT: [[TMP73:%.*]] = fcmp fast ogt float [[TMP71]], [[TMP72]] -; SKX-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], float [[TMP71]], float [[TMP72]] -; SKX-NEXT: [[TMP75:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 25), align 4 -; SKX-NEXT: [[TMP76:%.*]] = fcmp fast ogt float [[TMP74]], [[TMP75]] -; SKX-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], float [[TMP74]], float [[TMP75]] -; SKX-NEXT: [[TMP78:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 26), align 8 -; SKX-NEXT: [[TMP79:%.*]] = fcmp fast ogt float [[TMP77]], [[TMP78]] -; SKX-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], float [[TMP77]], float [[TMP78]] -; SKX-NEXT: [[TMP81:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 27), align 4 -; SKX-NEXT: [[TMP82:%.*]] = fcmp fast ogt float [[TMP80]], [[TMP81]] -; SKX-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], float [[TMP80]], float [[TMP81]] -; SKX-NEXT: [[TMP84:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 28), align 16 -; SKX-NEXT: [[TMP85:%.*]] = fcmp fast ogt float [[TMP83]], [[TMP84]] -; SKX-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], float [[TMP83]], float [[TMP84]] -; SKX-NEXT: [[TMP87:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 29), align 4 -; SKX-NEXT: [[TMP88:%.*]] = fcmp fast ogt float [[TMP86]], [[TMP87]] -; SKX-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], float [[TMP86]], float [[TMP87]] -; SKX-NEXT: [[TMP90:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 30), align 8 -; SKX-NEXT: [[TMP91:%.*]] = fcmp fast ogt float [[TMP89]], [[TMP90]] -; 
SKX-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], float [[TMP89]], float [[TMP90]] -; SKX-NEXT: [[TMP93:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 31), align 4 -; SKX-NEXT: [[TMP94:%.*]] = fcmp fast ogt float [[TMP92]], [[TMP93]] -; SKX-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], float [[TMP92]], float [[TMP93]] -; SKX-NEXT: ret float [[TMP95]] +; SKX-NEXT: [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16 +; SKX: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP2]], <32 x float> undef, <32 x i32> +; SKX-NEXT: [[TMP96:%.*]] = fcmp fast ogt <32 x float> [[TMP2]], [[RDX_SHUF]] +; SKX-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x float> [[TMP2]], <32 x float> [[RDX_SHUF]] +; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> +; SKX-NEXT: [[TMP97:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; SKX-NEXT: [[BIN_RDX2:%.*]] = select <32 x i1> [[TMP97]], <32 x float> [[BIN_RDX]], <32 x float> [[RDX_SHUF1]] +; SKX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> +; SKX-NEXT: [[TMP98:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; SKX-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x float> [[BIN_RDX2]], <32 x float> [[RDX_SHUF3]] +; SKX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> +; SKX-NEXT: [[TMP99:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]] +; SKX-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x float> [[BIN_RDX4]], <32 x float> [[RDX_SHUF5]] +; SKX-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> +; SKX-NEXT: [[TMP100:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]] +; SKX-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x float> [[BIN_RDX6]], <32 x float> [[RDX_SHUF7]] +; SKX-NEXT: [[TMP101:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0 +; SKX: ret float [[TMP101]] ; %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4