Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -713,6 +713,7 @@ /// ((v0+v2), (v1+v3), undef, undef) int getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) const; + int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm) const; /// \returns The cost of Intrinsic instructions. Analyses the real arguments. /// Three cases are handled: 1. scalar instruction 2. vector instruction @@ -978,6 +979,8 @@ unsigned AddressSpace) = 0; virtual int getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) = 0; + virtual int getMinMaxReductionCost(Type *Ty, Type *CondTy, + bool IsPairwiseForm) = 0; virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys, FastMathFlags FMF, unsigned ScalarizationCostPassed) = 0; @@ -1286,6 +1289,10 @@ bool IsPairwiseForm) override { return Impl.getArithmeticReductionCost(Opcode, Ty, IsPairwiseForm); } + int getMinMaxReductionCost(Type *Ty, Type *CondTy, + bool IsPairwiseForm) override { + return Impl.getMinMaxReductionCost(Ty, CondTy, IsPairwiseForm); + } int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys, FastMathFlags FMF, unsigned ScalarizationCostPassed) override { return Impl.getIntrinsicInstrCost(ID, RetTy, Tys, FMF, Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -430,6 +430,8 @@ unsigned getArithmeticReductionCost(unsigned, Type *, bool) { return 1; } + unsigned getMinMaxReductionCost(Type *, Type *, bool) { return 1; } + unsigned getCostOfKeepingLiveOverCall(ArrayRef Tys) { return 0; } bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) { Index: include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- include/llvm/CodeGen/BasicTTIImpl.h +++ include/llvm/CodeGen/BasicTTIImpl.h @@ -1170,6 +1170,133 @@ return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true); } + /// Try to calculate arithmetic and shuffle op costs for reduction operations. + /// Try to calculate arithmetic and shuffle op costs for reduction operations. + /// We're assuming that reduction operation are performing the following way: + /// 1. Non-pairwise reduction + /// %val1 = shufflevector %val, %undef, + /// + /// \----------------v-------------/ \----------v------------/ + /// n/2 elements n/2 elements + /// %red1 = op %val, val1 + /// After this operation we have a vector %red1 where only the first n/2 + /// elements are meaningful, the second n/2 elements are undefined and can be + /// dropped. All other operations are actually working with the vector of + /// length n/2, not n, though the real vector length is still n. + /// %val2 = shufflevector %red1, %undef, + /// + /// \----------------v-------------/ \----------v------------/ + /// n/4 elements 3*n/4 elements + /// %red2 = op %red1, val2 - working with the vector of + /// length n/2, the resulting vector has length n/4 etc. + /// 2. Pairwise reduction: + /// Everything is the same except for an additional shuffle operation which + /// is used to produce operands for pairwise kind of reductions. 
+  /// %val1 = shufflevector %val, %undef, + /// + /// \-------------v----------/ \----------v------------/ + /// n/2 elements n/2 elements + /// %val2 = shufflevector %val, %undef, + /// + /// \-------------v----------/ \----------v------------/ + /// n/2 elements n/2 elements + /// %red1 = op %val1, val2 + /// Again, the operation is performed on vector, but the resulting + /// vector %red1 is vector. + /// + /// The cost model should take into account that the actual length of the + /// vector is reduced on each iteration. + unsigned getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwise) { + assert(Ty->isVectorTy() && "Expect a vector type"); + Type *ScalarTy = Ty->getVectorElementType(); + Type *ScalarCondTy = CondTy->getVectorElementType(); + unsigned NumVecElts = Ty->getVectorNumElements(); + unsigned NumReduxLevels = Log2_32(NumVecElts); + unsigned CmpOpcode; + if (Ty->getVectorElementType()->isFloatingPointTy()) + CmpOpcode = Instruction::FCmp; + else { + assert(Ty->isIntOrIntVectorTy() && + "expecting floating point or integer type for min/max reduction"); + CmpOpcode = Instruction::ICmp; + } + unsigned MinMaxCost = 0; + unsigned ShuffleCost = 0; + auto *ConcreteTTI = static_cast(this); + std::pair LT = + ConcreteTTI->getTLI()->getTypeLegalizationCost(DL, Ty); + unsigned LongVectorCount = 0; + unsigned MVTLen = + LT.second.isVector() ? LT.second.getVectorNumElements() : 1; + while (NumVecElts > MVTLen) { + NumVecElts /= 2; + // Assume the pairwise shuffles add a cost.
+ ShuffleCost += (IsPairwise + 1) * + ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty, + NumVecElts, Ty); + MinMaxCost += + ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) + + ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy, + nullptr); + Ty = VectorType::get(ScalarTy, NumVecElts); + CondTy = VectorType::get(ScalarCondTy, NumVecElts); + ++LongVectorCount; + } + // The minimal length of the vector is limited by the real length of vector + // operations performed on the current platform. That's why several final + // reduction opertions are perfomed on the vectors with the same + // architecture-dependent length. + ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) * + ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty, + NumVecElts, Ty); + MinMaxCost += + (NumReduxLevels - LongVectorCount) * + (ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) + + ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy, + nullptr)); + // Need 3 extractelement instructions for scalarization + an additional + // scalar select instruction. + return ShuffleCost + MinMaxCost + + 3 * getScalarizationOverhead(Ty, /*Insert=*/false, + /*Extract=*/true) + + static_cast(this)->getCmpSelInstrCost( + Instruction::Select, ScalarTy, ScalarCondTy, nullptr); + } + unsigned getVectorSplitCost() { return 1; } /// @} Index: include/llvm/Transforms/Vectorize/SLPVectorizer.h =================================================================== --- include/llvm/Transforms/Vectorize/SLPVectorizer.h +++ include/llvm/Transforms/Vectorize/SLPVectorizer.h @@ -100,6 +100,19 @@ slpvectorizer::BoUpSLP &R, TargetTransformInfo *TTI); + /// Try to vectorize trees that start at insertvalue instructions. + bool vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB, + slpvectorizer::BoUpSLP &R); + /// Try to vectorize trees that start at insertelement instructions. + bool vectorizeInsertElementInst(InsertElementInst *IEI, BasicBlock *BB, + slpvectorizer::BoUpSLP &R); + /// Try to vectorize trees that start at compare instructions. + bool vectorizeCmpInst(CmpInst *CI, BasicBlock *BB, slpvectorizer::BoUpSLP &R); + /// Tries to vectorize constructs started from CmpInst, InsertValueInst or + /// InsertElementInst instructions. + bool vectorizeSimpleInstructions(SmallVectorImpl &Instructions, + BasicBlock *BB, slpvectorizer::BoUpSLP &R); + /// \brief Scan the basic block and look for patterns that are likely to start /// a vectorization chain. bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R); Index: lib/Analysis/CostModel.cpp =================================================================== --- lib/Analysis/CostModel.cpp +++ lib/Analysis/CostModel.cpp @@ -186,26 +186,58 @@ } namespace { +/// Kind of the reduction data. +enum class ReductionKind { + NotReduction, /// Not a reduction. + ArithmeticReduction, /// Binary reduction data. + MinMaxReduction, /// Min/max reduction data. +}; /// Contains opcode + LHS/RHS parts of the reduction operations. 
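As an illustration only (not taken from the patch), the select-based pattern that getReductionData now classifies as ReductionKind::MinMaxReduction has the following shape; the function and value names are made up, and the recorded Opcode is the compare's opcode (ICmp here) rather than a binary operator:

define i32 @smax_example(i32 %a, i32 %b) {
  %cmp = icmp sgt i32 %a, %b
  %max = select i1 %cmp, i32 %a, i32 %b
  ret i32 %max
}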
struct ReductionData { - explicit ReductionData() = default; - ReductionData(unsigned Opcode, Value *LHS, Value *RHS) - : Opcode(Opcode), LHS(LHS), RHS(RHS) {} + ReductionData() = delete; + ReductionData(ReductionKind Kind, unsigned Opcode, Value *LHS, Value *RHS) + : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind) { + assert(Kind != ReductionKind::NotReduction && + "expected binary or min/max reduction only."); + } unsigned Opcode = 0; Value *LHS = nullptr; Value *RHS = nullptr; + ReductionKind Kind = ReductionKind::NotReduction; + bool isBinary() const { return Kind == ReductionKind::ArithmeticReduction; } + bool isMinMax() const { return Kind == ReductionKind::MinMaxReduction; } + bool hasSameData(ReductionData &RD) const { + return this == &RD || (Kind == RD.Kind && Opcode == RD.Opcode); + } }; } // namespace static Optional getReductionData(Instruction *I) { Value *L, *R; - if (m_BinOp(m_Value(L), m_Value(R)).match(I)) - return ReductionData(I->getOpcode(), L, R); + if (m_BinOp(m_Value(L), m_Value(R)).match(I)) { + return ReductionData(ReductionKind::ArithmeticReduction, I->getOpcode(), L, + R); + } + if (auto *SI = dyn_cast(I)) { + if (m_UMin(m_Value(L), m_Value(R)).match(SI) || + m_SMin(m_Value(L), m_Value(R)).match(SI) || + m_SMax(m_Value(L), m_Value(R)).match(SI) || + m_UMax(m_Value(L), m_Value(R)).match(SI) || + m_OrdFMin(m_Value(L), m_Value(R)).match(SI) || + m_OrdFMax(m_Value(L), m_Value(R)).match(SI) || + m_UnordFMin(m_Value(L), m_Value(R)).match(SI) || + m_UnordFMax(m_Value(L), m_Value(R)).match(SI)) { + auto *CI = cast(SI->getCondition()); + return ReductionData(ReductionKind::MinMaxReduction, CI->getOpcode(), L, + R); + } + } return llvm::None; } -static bool matchPairwiseReductionAtLevel(Instruction *I, unsigned Level, - unsigned NumLevels) { +static ReductionKind matchPairwiseReductionAtLevel(Instruction *I, + unsigned Level, + unsigned NumLevels) { // Match one level of pairwise operations. // %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, // <4 x i32> @@ -213,24 +245,24 @@ // <4 x i32> // %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 if (!I) - return false; + return ReductionKind::NotReduction; assert(I->getType()->isVectorTy() && "Expecting a vector type"); Optional RD = getReductionData(I); if (!RD) - return false; + return ReductionKind::NotReduction; ShuffleVectorInst *LS = dyn_cast(RD->LHS); if (!LS && Level) - return false; + return ReductionKind::NotReduction; ShuffleVectorInst *RS = dyn_cast(RD->RHS); if (!RS && Level) - return false; + return ReductionKind::NotReduction; // On level 0 we can omit one shufflevector instruction. if (!Level && !RS && !LS) - return false; + return ReductionKind::NotReduction; // Shuffle inputs must match. Value *NextLevelOpL = LS ? LS->getOperand(0) : nullptr; @@ -239,7 +271,7 @@ if (NextLevelOpR && NextLevelOpL) { // If we have two shuffles their operands must match. if (NextLevelOpL != NextLevelOpR) - return false; + return ReductionKind::NotReduction; NextLevelOp = NextLevelOpL; } else if (Level == 0 && (NextLevelOpR || NextLevelOpL)) { @@ -250,45 +282,45 @@ // %NextLevelOpL = shufflevector %R, <1, undef ...> // %BinOp = fadd %NextLevelOpL, %R if (NextLevelOpL && NextLevelOpL != RD->RHS) - return false; + return ReductionKind::NotReduction; else if (NextLevelOpR && NextLevelOpR != RD->LHS) - return false; + return ReductionKind::NotReduction; NextLevelOp = NextLevelOpL ? 
RD->RHS : RD->LHS; } else - return false; + return ReductionKind::NotReduction; // Check that the next levels binary operation exists and matches with the // current one. if (Level + 1 != NumLevels) { Optional NextLevelRD = getReductionData(cast(NextLevelOp)); - if (!NextLevelRD || RD->Opcode != NextLevelRD->Opcode) - return false; + if (!NextLevelRD || !RD->hasSameData(*NextLevelRD)) + return ReductionKind::NotReduction; } // Shuffle mask for pairwise operation must match. if (matchPairwiseShuffleMask(LS, /*IsLeft=*/true, Level)) { if (!matchPairwiseShuffleMask(RS, /*IsLeft=*/false, Level)) - return false; + return ReductionKind::NotReduction; } else if (matchPairwiseShuffleMask(RS, /*IsLeft=*/true, Level)) { if (!matchPairwiseShuffleMask(LS, /*IsLeft=*/false, Level)) - return false; + return ReductionKind::NotReduction; } else - return false; + return ReductionKind::NotReduction; if (++Level == NumLevels) - return true; + return RD->Kind; // Match next level. return matchPairwiseReductionAtLevel(cast(NextLevelOp), Level, NumLevels); } -static bool matchPairwiseReduction(const ExtractElementInst *ReduxRoot, - unsigned &Opcode, Type *&Ty) { +static ReductionKind matchPairwiseReduction(const ExtractElementInst *ReduxRoot, + unsigned &Opcode, Type *&Ty) { if (!EnableReduxCost) - return false; + return ReductionKind::NotReduction; // Need to extract the first element. ConstantInt *CI = dyn_cast(ReduxRoot->getOperand(1)); @@ -296,19 +328,19 @@ if (CI) Idx = CI->getZExtValue(); if (Idx != 0) - return false; + return ReductionKind::NotReduction; auto *RdxStart = dyn_cast(ReduxRoot->getOperand(0)); if (!RdxStart) - return false; + return ReductionKind::NotReduction; Optional RD = getReductionData(RdxStart); if (!RD) - return false; + return ReductionKind::NotReduction; Type *VecTy = RdxStart->getType(); unsigned NumVecElems = VecTy->getVectorNumElements(); if (!isPowerOf2_32(NumVecElems)) - return false; + return ReductionKind::NotReduction; // We look for a sequence of shuffle,shuffle,add triples like the following // that builds a pairwise reduction tree. @@ -328,13 +360,14 @@ // <4 x i32> // %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 // %r = extractelement <4 x float> %bin.rdx8, i32 0 - if (!matchPairwiseReductionAtLevel(RdxStart, 0, Log2_32(NumVecElems))) - return false; + if (matchPairwiseReductionAtLevel(RdxStart, 0, Log2_32(NumVecElems)) == + ReductionKind::NotReduction) + return ReductionKind::NotReduction; Opcode = RD->Opcode; Ty = VecTy; - return true; + return RD->Kind; } static std::pair @@ -348,10 +381,11 @@ return std::make_pair(L, S); } -static bool matchVectorSplittingReduction(const ExtractElementInst *ReduxRoot, - unsigned &Opcode, Type *&Ty) { +static ReductionKind +matchVectorSplittingReduction(const ExtractElementInst *ReduxRoot, + unsigned &Opcode, Type *&Ty) { if (!EnableReduxCost) - return false; + return ReductionKind::NotReduction; // Need to extract the first element. 
ConstantInt *CI = dyn_cast(ReduxRoot->getOperand(1)); @@ -359,19 +393,19 @@ if (CI) Idx = CI->getZExtValue(); if (Idx != 0) - return false; + return ReductionKind::NotReduction; auto *RdxStart = dyn_cast(ReduxRoot->getOperand(0)); if (!RdxStart) - return false; + return ReductionKind::NotReduction; Optional RD = getReductionData(RdxStart); if (!RD) - return false; + return ReductionKind::NotReduction; Type *VecTy = ReduxRoot->getOperand(0)->getType(); unsigned NumVecElems = VecTy->getVectorNumElements(); if (!isPowerOf2_32(NumVecElems)) - return false; + return ReductionKind::NotReduction; // We look for a sequence of shuffles and adds like the following matching one // fadd, shuffle vector pair at a time. @@ -391,10 +425,10 @@ while (NumVecElemsRemain - 1) { // Check for the right reduction operation. if (!RdxOp) - return false; + return ReductionKind::NotReduction; Optional RDLevel = getReductionData(RdxOp); - if (!RDLevel || RDLevel->Opcode != RD->Opcode) - return false; + if (!RDLevel || !RDLevel->hasSameData(*RD)) + return ReductionKind::NotReduction; Value *NextRdxOp; ShuffleVectorInst *Shuffle; @@ -403,9 +437,9 @@ // Check the current reduction operation and the shuffle use the same value. if (Shuffle == nullptr) - return false; + return ReductionKind::NotReduction; if (Shuffle->getOperand(0) != NextRdxOp) - return false; + return ReductionKind::NotReduction; // Check that shuffle masks matches. for (unsigned j = 0; j != MaskStart; ++j) @@ -415,7 +449,7 @@ SmallVector Mask = Shuffle->getShuffleMask(); if (ShuffleMask != Mask) - return false; + return ReductionKind::NotReduction; RdxOp = dyn_cast(NextRdxOp); NumVecElemsRemain /= 2; @@ -424,7 +458,7 @@ Opcode = RD->Opcode; Ty = VecTy; - return true; + return RD->Kind; } unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const { @@ -519,13 +553,28 @@ unsigned ReduxOpCode; Type *ReduxType; - if (matchVectorSplittingReduction(EEI, ReduxOpCode, ReduxType)) { + switch (matchVectorSplittingReduction(EEI, ReduxOpCode, ReduxType)) { + case ReductionKind::ArithmeticReduction: return TTI->getArithmeticReductionCost(ReduxOpCode, ReduxType, /*IsPairwiseForm=*/false); + case ReductionKind::MinMaxReduction: + return TTI->getMinMaxReductionCost(ReduxType, + CmpInst::makeCmpResultType(ReduxType), + /*IsPairwiseForm=*/false); + case ReductionKind::NotReduction: + break; } - if (matchPairwiseReduction(EEI, ReduxOpCode, ReduxType)) { + + switch (matchPairwiseReduction(EEI, ReduxOpCode, ReduxType)) { + case ReductionKind::ArithmeticReduction: return TTI->getArithmeticReductionCost(ReduxOpCode, ReduxType, /*IsPairwiseForm=*/true); + case ReductionKind::MinMaxReduction: + return TTI->getMinMaxReductionCost(ReduxType, + CmpInst::makeCmpResultType(ReduxType), + /*IsPairwiseForm=*/true); + case ReductionKind::NotReduction: + break; } return TTI->getVectorInstrCost(I->getOpcode(), Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -479,6 +479,13 @@ return Cost; } +int TargetTransformInfo::getMinMaxReductionCost(Type *Ty, Type *CondTy, + bool IsPairwiseForm) const { + int Cost = TTIImpl->getMinMaxReductionCost(Ty, CondTy, IsPairwiseForm); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; +} + unsigned TargetTransformInfo::getCostOfKeepingLiveOverCall(ArrayRef Tys) const { return TTIImpl->getCostOfKeepingLiveOverCall(Tys); Index: lib/Target/X86/X86TargetTransformInfo.h 
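For context, a minimal sketch (not one of the patch's tests) of a splitting-form min/max reduction tree that matchVectorSplittingReduction now reports as a MinMaxReduction; the 4-wide type, the smin choice, and the names are illustrative:

define i32 @smin_reduction_example(<4 x i32> %v) {
  %rdx.shuf = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %rdx.cmp = icmp slt <4 x i32> %v, %rdx.shuf
  %rdx.sel = select <4 x i1> %rdx.cmp, <4 x i32> %v, <4 x i32> %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x i32> %rdx.sel, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %rdx.cmp1 = icmp slt <4 x i32> %rdx.sel, %rdx.shuf1
  %rdx.sel1 = select <4 x i1> %rdx.cmp1, <4 x i32> %rdx.sel, <4 x i32> %rdx.shuf1
  %r = extractelement <4 x i32> %rdx.sel1, i32 0
  ret i32 %r
}

With this patch, the extractelement at the root of such a tree is priced through the new getMinMaxReductionCost hook instead of as a plain vector extract.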
=================================================================== --- lib/Target/X86/X86TargetTransformInfo.h +++ lib/Target/X86/X86TargetTransformInfo.h @@ -88,6 +88,8 @@ int getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm); + int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm); + int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace); Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -1877,6 +1877,111 @@ return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise); } +int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy, + bool IsPairwise) { + std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); + + MVT MTy = LT.second; + + int ISD = ValTy->isIntOrIntVectorTy() ? ISD::SMIN : ISD::FMINNUM; + + // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput + // and make it as the cost. + + static const CostTblEntry SSE42CostTblPairWise[] = { + {ISD::FMINNUM, MVT::v2f64, 3}, {ISD::FMINNUM, MVT::v4f32, 2}, + {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8" + {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5" + {ISD::SMIN, MVT::v8i16, 2}, + }; + + static const CostTblEntry AVX1CostTblPairWise[] = { + {ISD::FMINNUM, MVT::v4f32, 1}, {ISD::FMINNUM, MVT::v4f64, 1}, + {ISD::FMINNUM, MVT::v8f32, 2}, {ISD::SMIN, MVT::v2i64, 3}, + {ISD::SMIN, MVT::v4i32, 1}, {ISD::SMIN, MVT::v8i16, 1}, + {ISD::SMIN, MVT::v8i32, 3}, + }; + + static const CostTblEntry AVX2CostTblPairWise[] = { + {ISD::SMIN, MVT::v4i64, 2}, + {ISD::SMIN, MVT::v8i32, 1}, + {ISD::SMIN, MVT::v16i16, 1}, + {ISD::SMIN, MVT::v32i8, 2}, + }; + + static const CostTblEntry AVX512CostTblPairWise[] = { + {ISD::FMINNUM, MVT::v8f64, 1}, + {ISD::FMINNUM, MVT::v16f32, 2}, + {ISD::SMIN, MVT::v8i64, 2}, + {ISD::SMIN, MVT::v16i32, 1}, + }; + + static const CostTblEntry SSE42CostTblNoPairWise[] = { + {ISD::FMINNUM, MVT::v2f64, 3}, {ISD::FMINNUM, MVT::v4f32, 3}, + {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8" + {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5" + {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5" + }; + + static const CostTblEntry AVX1CostTblNoPairWise[] = { + {ISD::FMINNUM, MVT::v4f32, 1}, {ISD::FMINNUM, MVT::v4f64, 1}, + {ISD::FMINNUM, MVT::v8f32, 1}, {ISD::SMIN, MVT::v2i64, 3}, + {ISD::SMIN, MVT::v4i32, 1}, {ISD::SMIN, MVT::v8i16, 1}, + {ISD::SMIN, MVT::v8i32, 2}, + }; + + static const CostTblEntry AVX2CostTblNoPairWise[] = { + {ISD::SMIN, MVT::v4i64, 1}, + {ISD::SMIN, MVT::v8i32, 1}, + {ISD::SMIN, MVT::v16i16, 1}, + {ISD::SMIN, MVT::v32i8, 1}, + }; + + static const CostTblEntry AVX512CostTblNoPairWise[] = { + {ISD::FMINNUM, MVT::v8f64, 1}, + {ISD::FMINNUM, MVT::v16f32, 2}, + {ISD::SMIN, MVT::v8i64, 1}, + {ISD::SMIN, MVT::v16i32, 1}, + }; + + if (IsPairwise) { + if (ST->hasAVX512()) + if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasSSE42()) + if (const auto *Entry = 
CostTableLookup(SSE42CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + } else { + if (ST->hasAVX512()) + if (const auto *Entry = + CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + } + + return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise); +} + /// \brief Calculate the cost of materializing a 64-bit value. This helper /// method might only calculate a fraction of a larger immediate. Therefore it /// is valid to return a cost of ZERO. Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4356,7 +4356,7 @@ if (!I) return false; - if (!isa(I)) + if (!isa(I) && !isa(I)) return false; Value *P = I->getParent(); @@ -4459,59 +4459,126 @@ // Use map vector to make stable output. MapVector ExtraArgs; + /// Kind of the reduction data. + enum class ReductionKind { + NotReduction, /// Not a reduction. + ArithmeticReduction, /// Binary reduction data. + MinReduction, /// Minimum reduction data. + UMinReduction, /// Unsigned minimum reduction data. + MaxReduction, /// Maximum reduction data. + UMaxReduction, /// Unsigned maximum reduction data. + }; /// Contains info about operation, like its opcode, left and right operands. - struct OperationData { - /// true if the operation is a reduced value, false if reduction operation. - bool IsReducedValue = false; + class OperationData { /// Opcode of the instruction. unsigned Opcode = 0; /// Left operand of the reduction operation. Value *LHS = nullptr; /// Right operand of the reduction operation. Value *RHS = nullptr; + /// Kind of the reduction operation. + ReductionKind Kind = ReductionKind::NotReduction; + /// True if float point min/max reduction has no NaNs. + bool NoNaN = false; /// Checks if the reduction operation can be vectorized. bool isVectorizable() const { return LHS && RHS && - // We currently only support adds. - (Opcode == Instruction::Add || Opcode == Instruction::FAdd); + // We currently only support adds && min/max reductions. + ((Kind == ReductionKind::ArithmeticReduction && + (Opcode == Instruction::Add || Opcode == Instruction::FAdd)) || + ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && + (Kind == ReductionKind::MinReduction || + Kind == ReductionKind::MaxReduction)) || + (Opcode == Instruction::ICmp && + (Kind == ReductionKind::UMinReduction || + Kind == ReductionKind::UMaxReduction))); } public: explicit OperationData() = default; /// Construction for reduced values. They are identified by opcode only and /// don't have associated LHS/RHS values. - explicit OperationData(Value *V) : IsReducedValue(true) { + explicit OperationData(Value *V) : Kind(ReductionKind::NotReduction) { if (auto *I = dyn_cast(V)) Opcode = I->getOpcode(); } - /// Constructor for binary reduction operations with opcode and its left and + /// Constructor for reduction operations with opcode and its left and /// right operands. 
- OperationData(unsigned Opcode, Value *LHS, Value *RHS) - : IsReducedValue(false), Opcode(Opcode), LHS(LHS), RHS(RHS) {} + OperationData(unsigned Opcode, Value *LHS, Value *RHS, ReductionKind Kind, + bool NoNaN = false) + : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind), NoNaN(NoNaN) { + assert(Kind != ReductionKind::NotReduction && + "One of the reduction operations is expected."); + } explicit operator bool() const { return Opcode; } /// Get the index of the first operand. unsigned getFirstOperandIndex() const { assert(!!*this && "The opcode is not set."); + switch (Kind) { + case ReductionKind::MinReduction: + case ReductionKind::UMinReduction: + case ReductionKind::MaxReduction: + case ReductionKind::UMaxReduction: + return 1; + case ReductionKind::ArithmeticReduction: + case ReductionKind::NotReduction: + break; + } return 0; } /// Total number of operands in the reduction operation. unsigned getNumberOfOperands() const { - assert(!IsReducedValue && !!*this && LHS && RHS && + assert(Kind != ReductionKind::NotReduction && !!*this && LHS && RHS && "Expected reduction operation."); - return 2; + switch (Kind) { + case ReductionKind::ArithmeticReduction: + return 2; + case ReductionKind::MinReduction: + case ReductionKind::UMinReduction: + case ReductionKind::MaxReduction: + case ReductionKind::UMaxReduction: + return 3; + case ReductionKind::NotReduction: + llvm_unreachable("Reduction kind is not set"); + } } /// Expected number of uses for reduction operations/reduced values. unsigned getRequiredNumberOfUses() const { - assert(!IsReducedValue && !!*this && LHS && RHS && + assert(Kind != ReductionKind::NotReduction && !!*this && LHS && RHS && "Expected reduction operation."); - return 1; + switch (Kind) { + case ReductionKind::ArithmeticReduction: + return 1; + case ReductionKind::MinReduction: + case ReductionKind::UMinReduction: + case ReductionKind::MaxReduction: + case ReductionKind::UMaxReduction: + return 2; + case ReductionKind::NotReduction: + llvm_unreachable("Reduction kind is not set"); + } } /// Checks if instruction is associative and can be vectorized. bool isAssociative(Instruction *I) const { - assert(!IsReducedValue && *this && LHS && RHS && + assert(Kind != ReductionKind::NotReduction && *this && LHS && RHS && "Expected reduction operation."); - return I->isAssociative(); + switch (Kind) { + case ReductionKind::ArithmeticReduction: + return I->isAssociative(); + case ReductionKind::MinReduction: + case ReductionKind::MaxReduction: + return Opcode == Instruction::ICmp || + cast(I->getOperand(0))->hasUnsafeAlgebra(); + case ReductionKind::UMinReduction: + case ReductionKind::UMaxReduction: + assert(Opcode == Instruction::ICmp && + "Only integer compare operation is expected."); + return true; + case ReductionKind::NotReduction: + break; + } + llvm_unreachable("Reduction kind is not set"); } /// Checks if the reduction operation can be vectorized. bool isVectorizable(Instruction *I) const { @@ -4521,33 +4588,97 @@ /// Checks if two operation data are both a reduction op or both a reduced /// value. 
bool operator==(const OperationData &OD) { - assert(((IsReducedValue != OD.IsReducedValue) || - ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) && + assert(((Kind != OD.Kind) || ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) && "One of the comparing operations is incorrect."); - return this == &OD || - (IsReducedValue == OD.IsReducedValue && Opcode == OD.Opcode); + return this == &OD || (Kind == OD.Kind && Opcode == OD.Opcode); } bool operator!=(const OperationData &OD) { return !(*this == OD); } void clear() { - IsReducedValue = false; Opcode = 0; LHS = nullptr; RHS = nullptr; + Kind = ReductionKind::NotReduction; + NoNaN = false; } /// Get the opcode of the reduction operation. unsigned getOpcode() const { assert(isVectorizable() && "Expected vectorizable operation."); return Opcode; } + /// Get kind of reduction data. + ReductionKind getKind() const { return Kind; } Value *getLHS() const { return LHS; } Value *getRHS() const { return RHS; } + Type *getConditionType() const { + switch (Kind) { + case ReductionKind::ArithmeticReduction: + return nullptr; + case ReductionKind::MinReduction: + case ReductionKind::MaxReduction: + case ReductionKind::UMinReduction: + case ReductionKind::UMaxReduction: + return CmpInst::makeCmpResultType(LHS->getType()); + case ReductionKind::NotReduction: + break; + } + llvm_unreachable("Reduction kind is not set"); + } /// Creates reduction operation with the current opcode. Value *createOp(IRBuilder<> &Builder, const Twine &Name = "") const { - assert(!IsReducedValue && - (Opcode == Instruction::FAdd || Opcode == Instruction::Add) && - "Expected add|fadd reduction operation."); - return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS, - Name); + assert(isVectorizable() && + "Expected add|fadd or min/max reduction operation."); + Value *Cmp; + switch (Kind) { + case ReductionKind::ArithmeticReduction: + return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS, + Name); + case ReductionKind::MinReduction: + Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS) + : Builder.CreateFCmpOLT(LHS, RHS); + break; + case ReductionKind::MaxReduction: + Cmp = Opcode == Instruction::ICmp ? 
Builder.CreateICmpSGT(LHS, RHS) + : Builder.CreateFCmpOGT(LHS, RHS); + break; + case ReductionKind::UMinReduction: + assert(Opcode == Instruction::ICmp && "Expected integer types."); + Cmp = Builder.CreateICmpULT(LHS, RHS); + break; + case ReductionKind::UMaxReduction: + assert(Opcode == Instruction::ICmp && "Expected integer types."); + Cmp = Builder.CreateICmpUGT(LHS, RHS); + break; + case ReductionKind::NotReduction: + llvm_unreachable("Unknown reduction operation."); + } + return Builder.CreateSelect(Cmp, LHS, RHS, Name); + } + TargetTransformInfo::ReductionFlags getFlags() const { + TargetTransformInfo::ReductionFlags Flags; + Flags.NoNaN = NoNaN; + switch (Kind) { + case ReductionKind::ArithmeticReduction: + break; + case ReductionKind::MinReduction: + Flags.IsSigned = Opcode == Instruction::ICmp; + Flags.IsMaxOp = false; + break; + case ReductionKind::MaxReduction: + Flags.IsSigned = Opcode == Instruction::ICmp; + Flags.IsMaxOp = true; + break; + case ReductionKind::UMinReduction: + Flags.IsSigned = false; + Flags.IsMaxOp = false; + break; + case ReductionKind::UMaxReduction: + Flags.IsSigned = false; + Flags.IsMaxOp = true; + break; + case ReductionKind::NotReduction: + llvm_unreachable("Reduction kind is not set"); + } + return Flags; } }; @@ -4587,8 +4718,36 @@ Value *LHS; Value *RHS; - if (m_BinOp(m_Value(LHS), m_Value(RHS)).match(V)) - return OperationData(cast(V)->getOpcode(), LHS, RHS); + if (m_BinOp(m_Value(LHS), m_Value(RHS)).match(V)) { + return OperationData(cast(V)->getOpcode(), LHS, RHS, + ReductionKind::ArithmeticReduction); + } + if (auto *Select = dyn_cast(V)) { + // Look for a min/max pattern. + if (m_UMin(m_Value(LHS), m_Value(RHS)).match(Select)) { + return OperationData(Instruction::ICmp, LHS, RHS, + ReductionKind::UMinReduction); + } else if (m_SMin(m_Value(LHS), m_Value(RHS)).match(Select)) { + return OperationData(Instruction::ICmp, LHS, RHS, + ReductionKind::MinReduction); + } else if (m_OrdFMin(m_Value(LHS), m_Value(RHS)).match(Select) || + m_UnordFMin(m_Value(LHS), m_Value(RHS)).match(Select)) { + return OperationData( + Instruction::FCmp, LHS, RHS, ReductionKind::MinReduction, + cast(Select->getCondition())->hasNoNaNs()); + } else if (m_UMax(m_Value(LHS), m_Value(RHS)).match(Select)) { + return OperationData(Instruction::ICmp, LHS, RHS, + ReductionKind::UMaxReduction); + } else if (m_SMax(m_Value(LHS), m_Value(RHS)).match(Select)) { + return OperationData(Instruction::ICmp, LHS, RHS, + ReductionKind::MaxReduction); + } else if (m_OrdFMax(m_Value(LHS), m_Value(RHS)).match(Select) || + m_UnordFMax(m_Value(LHS), m_Value(RHS)).match(Select)) { + return OperationData( + Instruction::FCmp, LHS, RHS, ReductionKind::MaxReduction, + cast(Select->getCondition())->hasNoNaNs()); + } + } return OperationData(V); } @@ -4781,8 +4940,9 @@ if (VectorizedTree) { Builder.SetCurrentDebugLocation(Loc); OperationData VectReductionData(ReductionData.getOpcode(), - VectorizedTree, ReducedSubTree); - VectorizedTree = VectReductionData.createOp(Builder, "bin.rdx"); + VectorizedTree, ReducedSubTree, + ReductionData.getKind()); + VectorizedTree = VectReductionData.createOp(Builder, "op.rdx"); propagateIRFlags(VectorizedTree, ReductionOps); } else VectorizedTree = ReducedSubTree; @@ -4796,7 +4956,8 @@ auto *I = cast(ReducedVals[i]); Builder.SetCurrentDebugLocation(I->getDebugLoc()); OperationData VectReductionData(ReductionData.getOpcode(), - VectorizedTree, I); + VectorizedTree, I, + ReductionData.getKind()); VectorizedTree = VectReductionData.createOp(Builder); 
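For illustration (an assumed input, not from the patch's tests), a scalar signed-max chain of the shape the horizontal-reduction matcher can now walk when the root is a select; createOp later re-emits the same compare-plus-select form under the new "op.rdx"/"op.extra" names:

define i32 @smax_chain(i32 %a, i32 %b, i32 %c) {
  %cmp1 = icmp sgt i32 %a, %b
  %max1 = select i1 %cmp1, i32 %a, i32 %b
  %cmp2 = icmp sgt i32 %max1, %c
  %max2 = select i1 %cmp2, i32 %max1, i32 %c
  ret i32 %max2
}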
propagateIRFlags(VectorizedTree, ReductionOps); } @@ -4807,8 +4968,9 @@ for (auto *I : Pair.second) { Builder.SetCurrentDebugLocation(I->getDebugLoc()); OperationData VectReductionData(ReductionData.getOpcode(), - VectorizedTree, Pair.first); - VectorizedTree = VectReductionData.createOp(Builder, "bin.extra"); + VectorizedTree, Pair.first, + ReductionData.getKind()); + VectorizedTree = VectReductionData.createOp(Builder, "op.extra"); propagateIRFlags(VectorizedTree, I); } } @@ -4829,19 +4991,54 @@ Type *ScalarTy = FirstReducedVal->getType(); Type *VecTy = VectorType::get(ScalarTy, ReduxWidth); - int PairwiseRdxCost = - TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy, - /*IsPairwiseForm=*/true); - int SplittingRdxCost = - TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy, - /*IsPairwiseForm=*/false); + int PairwiseRdxCost; + int SplittingRdxCost; + switch (ReductionData.getKind()) { + case ReductionKind::ArithmeticReduction: + PairwiseRdxCost = + TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy, + /*IsPairwiseForm=*/true); + SplittingRdxCost = + TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy, + /*IsPairwiseForm=*/false); + break; + case ReductionKind::MinReduction: + case ReductionKind::MaxReduction: + case ReductionKind::UMinReduction: + case ReductionKind::UMaxReduction: { + Type *VecCondTy = CmpInst::makeCmpResultType(VecTy); + PairwiseRdxCost = TTI->getMinMaxReductionCost(VecTy, VecCondTy, + /*IsPairwiseForm=*/true); + SplittingRdxCost = TTI->getMinMaxReductionCost(VecTy, VecCondTy, + /*IsPairwiseForm=*/false); + break; + } + case ReductionKind::NotReduction: + llvm_unreachable("Expected arithmetic or min/max reduction operation"); + } IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost; int VecReduxCost = IsPairwiseReduction ? 
PairwiseRdxCost : SplittingRdxCost; - int ScalarReduxCost = - (ReduxWidth - 1) * - TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy); + int ScalarReduxCost; + switch (ReductionData.getKind()) { + case ReductionKind::ArithmeticReduction: + ScalarReduxCost = + TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy); + break; + case ReductionKind::MinReduction: + case ReductionKind::MaxReduction: + case ReductionKind::UMinReduction: + case ReductionKind::UMaxReduction: + ScalarReduxCost = + TTI->getCmpSelInstrCost(ReductionData.getOpcode(), ScalarTy) + + TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, + CmpInst::makeCmpResultType(ScalarTy)); + break; + case ReductionKind::NotReduction: + llvm_unreachable("Expected arithmetic or min/max reduction operation"); + } + ScalarReduxCost *= (ReduxWidth - 1); DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost << " for reduction that starts with " << *FirstReducedVal @@ -4863,7 +5060,7 @@ if (!IsPairwiseReduction) return createSimpleTargetReduction( Builder, TTI, ReductionData.getOpcode(), VectorizedValue, - TargetTransformInfo::ReductionFlags(), RedOps); + ReductionData.getFlags(), RedOps); Value *TmpVec = VectorizedValue; for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) { @@ -4878,8 +5075,8 @@ TmpVec, UndefValue::get(TmpVec->getType()), (RightMask), "rdx.shuf.r"); OperationData VectReductionData(ReductionData.getOpcode(), LeftShuf, - RightShuf); - TmpVec = VectReductionData.createOp(Builder, "bin.rdx"); + RightShuf, ReductionData.getKind()); + TmpVec = VectReductionData.createOp(Builder, "op.rdx"); propagateIRFlags(TmpVec, RedOps); } @@ -4894,39 +5091,30 @@ /// %rb = insertelement <4 x float> %ra, float %s1, i32 1 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3 +/// starting from the last insertelement instruction. /// /// Returns true if it matches /// -static bool findBuildVector(InsertElementInst *FirstInsertElem, +static bool findBuildVector(InsertElementInst *LastInsertElem, SmallVectorImpl &BuildVector, SmallVectorImpl &BuildVectorOpds) { - if (!isa(FirstInsertElem->getOperand(0))) - return false; - - InsertElementInst *IE = FirstInsertElem; - while (true) { - BuildVector.push_back(IE); - BuildVectorOpds.push_back(IE->getOperand(1)); - - if (IE->use_empty()) - return false; - - InsertElementInst *NextUse = dyn_cast(IE->user_back()); - if (!NextUse) - return true; - - // If this isn't the final use, make sure the next insertelement is the only - // use. It's OK if the final constructed vector is used multiple times - if (!IE->hasOneUse()) + Value *V = nullptr; + do { + BuildVector.push_back(LastInsertElem); + BuildVectorOpds.push_back(LastInsertElem->getOperand(1)); + V = LastInsertElem->getOperand(0); + if (isa(V)) + break; + LastInsertElem = dyn_cast(V); + if (!LastInsertElem || !LastInsertElem->hasOneUse()) return false; - - IE = NextUse; - } - - return false; + } while (true); + std::reverse(BuildVector.begin(), BuildVector.end()); + std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); + return true; } -/// \brief Like findBuildVector, but looks backwards for construction of aggregate. +/// \brief Like findBuildVector, but looks for construction of aggregate. /// /// \return true if it matches. 
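As an illustrative example (not part of the patch), the insertvalue chain below is the kind of aggregate construction findBuildAggregate matches; with vectorizeInsertValueInst such a chain no longer has to feed a store to be considered. The struct type and names are made up:

%pair = type { float, float }

define %pair @build_aggregate(float %x, float %y) {
  %a = insertvalue %pair undef, float %x, 0
  %b = insertvalue %pair %a, float %y, 1
  ret %pair %b
}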
static bool findBuildAggregate(InsertValueInst *IV, @@ -5048,9 +5236,11 @@ auto *Inst = dyn_cast(V); if (!Inst) continue; - if (auto *BI = dyn_cast(Inst)) { + auto *BI = dyn_cast(Inst); + auto *SI = dyn_cast(Inst); + if (BI || SI) { HorizontalReduction HorRdx; - if (HorRdx.matchAssociativeReduction(P, BI)) { + if (HorRdx.matchAssociativeReduction(P, Inst)) { if (HorRdx.tryToReduce(R, TTI)) { Res = true; // Set P to nullptr to avoid re-analysis of phi node in @@ -5059,7 +5249,7 @@ continue; } } - if (P) { + if (P && BI) { Inst = dyn_cast(BI->getOperand(0)); if (Inst == P) Inst = dyn_cast(BI->getOperand(1)); @@ -5111,6 +5301,64 @@ ExtraVectorization); } +bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, + BasicBlock *BB, BoUpSLP &R) { + const DataLayout &DL = BB->getModule()->getDataLayout(); + if (!R.canMapToVector(IVI->getType(), DL)) + return false; + + SmallVector BuildVector; + SmallVector BuildVectorOpds; + if (!findBuildAggregate(IVI, BuildVector, BuildVectorOpds)) + return false; + + DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); + return tryToVectorizeList(BuildVectorOpds, R, BuildVector, false); +} + +bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, + BasicBlock *BB, BoUpSLP &R) { + SmallVector BuildVector; + SmallVector BuildVectorOpds; + if (!findBuildVector(IEI, BuildVector, BuildVectorOpds)) + return false; + + // Vectorize starting with the build vector operands ignoring the BuildVector + // instructions for the purpose of scheduling and user extraction. + return tryToVectorizeList(BuildVectorOpds, R, BuildVector); +} + +bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB, + BoUpSLP &R) { + if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) + return true; + + bool OpsChanged = false; + for (int Idx = 0; Idx < 2; ++Idx) { + OpsChanged |= + vectorizeRootInstruction(nullptr, CI->getOperand(Idx), BB, R, TTI); + } + return OpsChanged; +} + +bool SLPVectorizerPass::vectorizeSimpleInstructions( + SmallVectorImpl &Instructions, BasicBlock *BB, BoUpSLP &R) { + bool OpsChanged = false; + for (auto &VH : reverse(Instructions)) { + auto *I = dyn_cast_or_null(VH); + if (!I) + continue; + if (auto *LastInsertValue = dyn_cast(I)) + OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); + else if (auto *LastInsertElem = dyn_cast(I)) + OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R); + else if (auto *CI = dyn_cast(I)) + OpsChanged |= vectorizeCmpInst(CI, BB, R); + } + Instructions.clear(); + return OpsChanged; +} + bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { bool Changed = false; SmallVector Incoming; @@ -5170,10 +5418,21 @@ VisitedInstrs.clear(); + SmallVector PostProcessInstructions; + SmallDenseSet KeyNodes; for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) { // We may go through BB multiple times so skip the one we have checked. - if (!VisitedInstrs.insert(&*it).second) + if (!VisitedInstrs.insert(&*it).second) { + if (it->use_empty() && KeyNodes.count(&*it) > 0 && + vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) { + // We would like to start over since some instructions are deleted + // and the iterator may become invalid value. + Changed = true; + it = BB->begin(); + e = BB->end(); + } continue; + } if (isa(it)) continue; @@ -5195,97 +5454,39 @@ continue; } - if (ShouldStartVectorizeHorAtStore) { - if (StoreInst *SI = dyn_cast(it)) { - // Try to match and vectorize a horizontal reduction. 
- if (vectorizeRootInstruction(nullptr, SI->getValueOperand(), BB, R, - TTI)) { - Changed = true; - it = BB->begin(); - e = BB->end(); - continue; + // Ran into an instruction without users, like terminator, or function call + // with ignored return value, store. Ignore unused instructions (basing on + // instruction type, except for CallInst and InvokeInst). + if (it->use_empty() && (it->getType()->isVoidTy() || isa(it) || + isa(it))) { + KeyNodes.insert(&*it); + bool OpsChanged = false; + if (ShouldStartVectorizeHorAtStore || !isa(it)) { + for (auto *V : it->operand_values()) { + // Try to match and vectorize a horizontal reduction. + OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI); } } - } - - // Try to vectorize horizontal reductions feeding into a return. - if (ReturnInst *RI = dyn_cast(it)) { - if (RI->getNumOperands() != 0) { - // Try to match and vectorize a horizontal reduction. - if (vectorizeRootInstruction(nullptr, RI->getOperand(0), BB, R, TTI)) { - Changed = true; - it = BB->begin(); - e = BB->end(); - continue; - } - } - } - - // Try to vectorize trees that start at compare instructions. - if (CmpInst *CI = dyn_cast(it)) { - if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) { - Changed = true; + // Start vectorization of post-process list of instructions from the + // top-tree instructions to try to vectorize as many instructions as + // possible. + OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R); + if (OpsChanged) { // We would like to start over since some instructions are deleted // and the iterator may become invalid value. - it = BB->begin(); - e = BB->end(); - continue; - } - - for (int I = 0; I < 2; ++I) { - if (vectorizeRootInstruction(nullptr, CI->getOperand(I), BB, R, TTI)) { - Changed = true; - // We would like to start over since some instructions are deleted - // and the iterator may become invalid value. - it = BB->begin(); - e = BB->end(); - break; - } - } - continue; - } - - // Try to vectorize trees that start at insertelement instructions. - if (InsertElementInst *FirstInsertElem = dyn_cast(it)) { - SmallVector BuildVector; - SmallVector BuildVectorOpds; - if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds)) - continue; - - // Vectorize starting with the build vector operands ignoring the - // BuildVector instructions for the purpose of scheduling and user - // extraction. - if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) { Changed = true; it = BB->begin(); e = BB->end(); + continue; } - - continue; } - // Try to vectorize trees that start at insertvalue instructions feeding into - // a store. 
- if (StoreInst *SI = dyn_cast(it)) { - if (InsertValueInst *LastInsertValue = dyn_cast(SI->getValueOperand())) { - const DataLayout &DL = BB->getModule()->getDataLayout(); - if (R.canMapToVector(SI->getValueOperand()->getType(), DL)) { - SmallVector BuildVector; - SmallVector BuildVectorOpds; - if (!findBuildAggregate(LastInsertValue, BuildVector, BuildVectorOpds)) - continue; + if (isa(it) || isa(it) || + isa(it)) + PostProcessInstructions.push_back(&*it); - DEBUG(dbgs() << "SLP: store of array mappable to vector: " << *SI << "\n"); - if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false)) { - Changed = true; - it = BB->begin(); - e = BB->end(); - } - continue; - } - } - } } + assert(PostProcessInstructions.empty()); return Changed; } Index: test/Transforms/SLPVectorizer/AArch64/gather-root.ll =================================================================== --- test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -31,10 +31,8 @@ ; ; GATHER-LABEL: @PR28330( ; GATHER-NEXT: entry: -; GATHER-NEXT: [[TMP0:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1 -; GATHER-NEXT: [[TMP1:%.*]] = icmp eq i8 [[TMP0]], 0 -; GATHER-NEXT: [[TMP2:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2 -; GATHER-NEXT: [[TMP3:%.*]] = icmp eq i8 [[TMP2]], 0 +; GATHER-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1 +; GATHER-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer ; GATHER-NEXT: [[TMP4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 ; GATHER-NEXT: [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0 ; GATHER-NEXT: [[TMP6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 @@ -50,10 +48,11 @@ ; GATHER-NEXT: br label [[FOR_BODY:%.*]] ; GATHER: for.body: ; GATHER-NEXT: [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; GATHER-NEXT: [[TMP19:%.*]] = select i1 [[TMP1]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP20:%.*]] = add i32 [[TMP17]], [[TMP19]] -; GATHER-NEXT: [[TMP21:%.*]] = select i1 [[TMP3]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], [[TMP21]] +; GATHER-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> , <2 x i32> +; GATHER-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; GATHER-NEXT: [[TMP20:%.*]] = add i32 [[TMP17]], [[TMP3]] +; GATHER-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; GATHER-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], [[TMP4]] ; GATHER-NEXT: [[TMP23:%.*]] = select i1 [[TMP5]], i32 -720, i32 -80 ; GATHER-NEXT: [[TMP24:%.*]] = add i32 [[TMP22]], [[TMP23]] ; GATHER-NEXT: [[TMP25:%.*]] = select i1 [[TMP7]], i32 -720, i32 -80 @@ -65,16 +64,16 @@ ; GATHER-NEXT: [[TMP31:%.*]] = select i1 [[TMP13]], i32 -720, i32 -80 ; GATHER-NEXT: [[TMP32:%.*]] = add i32 [[TMP30]], [[TMP31]] ; GATHER-NEXT: [[TMP33:%.*]] = select i1 [[TMP15]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP19]], i32 0 -; GATHER-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 [[TMP21]], i32 1 -; GATHER-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP23]], i32 2 -; GATHER-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP25]], i32 3 -; GATHER-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP27]], i32 4 -; 
GATHER-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP29]], i32 5 -; GATHER-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP31]], i32 6 -; GATHER-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP33]], i32 7 -; GATHER-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP7]]) -; GATHER-NEXT: [[BIN_EXTRA]] = add i32 [[TMP8]], [[TMP17]] +; GATHER-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0 +; GATHER-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP4]], i32 1 +; GATHER-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP23]], i32 2 +; GATHER-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP25]], i32 3 +; GATHER-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[TMP27]], i32 4 +; GATHER-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP29]], i32 5 +; GATHER-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[TMP31]], i32 6 +; GATHER-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP33]], i32 7 +; GATHER-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP12]]) +; GATHER-NEXT: [[BIN_EXTRA]] = add i32 [[TMP13]], [[TMP17]] ; GATHER-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], [[TMP33]] ; GATHER-NEXT: br label [[FOR_BODY]] ; @@ -180,10 +179,8 @@ ; ; GATHER-LABEL: @PR32038( ; GATHER-NEXT: entry: -; GATHER-NEXT: [[TMP0:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1 -; GATHER-NEXT: [[TMP1:%.*]] = icmp eq i8 [[TMP0]], 0 -; GATHER-NEXT: [[TMP2:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2 -; GATHER-NEXT: [[TMP3:%.*]] = icmp eq i8 [[TMP2]], 0 +; GATHER-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1 +; GATHER-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer ; GATHER-NEXT: [[TMP4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 ; GATHER-NEXT: [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0 ; GATHER-NEXT: [[TMP6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 @@ -199,10 +196,11 @@ ; GATHER-NEXT: br label [[FOR_BODY:%.*]] ; GATHER: for.body: ; GATHER-NEXT: [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; GATHER-NEXT: [[TMP19:%.*]] = select i1 [[TMP1]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP20:%.*]] = add i32 -5, [[TMP19]] -; GATHER-NEXT: [[TMP21:%.*]] = select i1 [[TMP3]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], [[TMP21]] +; GATHER-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> , <2 x i32> +; GATHER-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; GATHER-NEXT: [[TMP20:%.*]] = add i32 -5, [[TMP3]] +; GATHER-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; GATHER-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], [[TMP4]] ; GATHER-NEXT: [[TMP23:%.*]] = select i1 [[TMP5]], i32 -720, i32 -80 ; GATHER-NEXT: [[TMP24:%.*]] = add i32 [[TMP22]], [[TMP23]] ; GATHER-NEXT: [[TMP25:%.*]] = select i1 [[TMP7]], i32 -720, i32 -80 @@ -214,29 +212,27 @@ ; GATHER-NEXT: [[TMP31:%.*]] = select i1 [[TMP13]], i32 -720, i32 -80 ; GATHER-NEXT: [[TMP32:%.*]] = add i32 [[TMP30]], [[TMP31]] ; GATHER-NEXT: [[TMP33:%.*]] = select i1 [[TMP15]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> 
undef, i32 [[TMP19]], i32 0 -; GATHER-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 [[TMP21]], i32 1 -; GATHER-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP23]], i32 2 -; GATHER-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP25]], i32 3 -; GATHER-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP27]], i32 4 -; GATHER-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP29]], i32 5 -; GATHER-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP31]], i32 6 -; GATHER-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP33]], i32 7 -; GATHER-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP7]]) -; GATHER-NEXT: [[BIN_EXTRA]] = add i32 [[TMP8]], -5 +; GATHER-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0 +; GATHER-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP4]], i32 1 +; GATHER-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP23]], i32 2 +; GATHER-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP25]], i32 3 +; GATHER-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[TMP27]], i32 4 +; GATHER-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP29]], i32 5 +; GATHER-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[TMP31]], i32 6 +; GATHER-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP33]], i32 7 +; GATHER-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP12]]) +; GATHER-NEXT: [[BIN_EXTRA]] = add i32 [[TMP13]], -5 ; GATHER-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], [[TMP33]] ; GATHER-NEXT: br label [[FOR_BODY]] ; ; MAX-COST-LABEL: @PR32038( ; MAX-COST-NEXT: entry: -; MAX-COST-NEXT: [[TMP0:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1 -; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq i8 [[TMP0]], 0 -; MAX-COST-NEXT: [[TMP2:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2 -; MAX-COST-NEXT: [[TMP3:%.*]] = icmp eq i8 [[TMP2]], 0 +; MAX-COST-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1 +; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer ; MAX-COST-NEXT: [[TMP4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 -; MAX-COST-NEXT: [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0 +; MAX-COST-NEXT: [[TMPP5:%.*]] = icmp eq i8 [[TMP4]], 0 ; MAX-COST-NEXT: [[TMP6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 -; MAX-COST-NEXT: [[TMP7:%.*]] = icmp eq i8 [[TMP6]], 0 +; MAX-COST-NEXT: [[TMPP7:%.*]] = icmp eq i8 [[TMP6]], 0 ; MAX-COST-NEXT: [[TMP8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 ; MAX-COST-NEXT: [[TMP9:%.*]] = icmp eq i8 [[TMP8]], 0 ; MAX-COST-NEXT: [[TMP10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 @@ -245,14 +241,16 @@ ; MAX-COST-NEXT: [[TMP13:%.*]] = icmp eq i8 [[TMP12]], 0 ; MAX-COST-NEXT: [[TMP14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8 ; MAX-COST-NEXT: [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0 -; MAX-COST-NEXT: [[TMP0:%.*]] = insertelement <4 x i1> undef, i1 [[TMP1]], i32 0 -; MAX-COST-NEXT: [[TMP1:%.*]] = insertelement <4 x i1> [[TMP0]], i1 [[TMP3]], i32 1 -; 
MAX-COST-NEXT: [[TMP2:%.*]] = insertelement <4 x i1> [[TMP1]], i1 [[TMP5]], i32 2 -; MAX-COST-NEXT: [[TMP3:%.*]] = insertelement <4 x i1> [[TMP2]], i1 [[TMP7]], i32 3 ; MAX-COST-NEXT: br label [[FOR_BODY:%.*]] ; MAX-COST: for.body: ; MAX-COST-NEXT: [[TMP17:%.*]] = phi i32 [ [[TMP34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; MAX-COST-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> , <4 x i32> +; MAX-COST-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 +; MAX-COST-NEXT: [[TMP3:%.*]] = insertelement <4 x i1> undef, i1 [[TMP2]], i32 0 +; MAX-COST-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 +; MAX-COST-NEXT: [[TMP5:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[TMP4]], i32 1 +; MAX-COST-NEXT: [[TMP6:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[TMPP5]], i32 2 +; MAX-COST-NEXT: [[TMP7:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[TMPP7]], i32 3 +; MAX-COST-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> , <4 x i32> ; MAX-COST-NEXT: [[TMP20:%.*]] = add i32 -5, undef ; MAX-COST-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], undef ; MAX-COST-NEXT: [[TMP24:%.*]] = add i32 [[TMP22]], undef @@ -260,10 +258,10 @@ ; MAX-COST-NEXT: [[TMP27:%.*]] = select i1 [[TMP9]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[TMP28:%.*]] = add i32 [[TMP26]], [[TMP27]] ; MAX-COST-NEXT: [[TMP29:%.*]] = select i1 [[TMP11]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP4]]) -; MAX-COST-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], [[TMP27]] -; MAX-COST-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], [[TMP29]] -; MAX-COST-NEXT: [[BIN_EXTRA:%.*]] = add i32 [[TMP7]], -5 +; MAX-COST-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP8]]) +; MAX-COST-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP27]] +; MAX-COST-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP29]] +; MAX-COST-NEXT: [[BIN_EXTRA:%.*]] = add i32 [[TMP11]], -5 ; MAX-COST-NEXT: [[TMP30:%.*]] = add i32 [[TMP28]], [[TMP29]] ; MAX-COST-NEXT: [[TMP31:%.*]] = select i1 [[TMP13]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[TMP32:%.*]] = add i32 [[BIN_EXTRA]], [[TMP31]] Index: test/Transforms/SLPVectorizer/X86/horizontal-list.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -117,11 +117,11 @@ ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]] -; CHECK-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV6]] +; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]] +; CHECK-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]] ; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float undef, [[ADD19_2]] -; CHECK-NEXT: store float [[BIN_EXTRA5]], float* @res, align 4 -; CHECK-NEXT: ret float [[BIN_EXTRA5]] +; CHECK-NEXT: store float [[OP_EXTRA5]], float* @res, align 4 +; CHECK-NEXT: ret float [[OP_EXTRA5]] ; ; THRESHOLD-LABEL: @bazz( ; THRESHOLD-NEXT: entry: @@ -148,11 +148,11 @@ ; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> ; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] ; THRESHOLD-NEXT: [[TMP4:%.*]] = 
extractelement <8 x float> [[BIN_RDX4]], i32 0 -; THRESHOLD-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]] -; THRESHOLD-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV6]] +; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]] +; THRESHOLD-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]] ; THRESHOLD-NEXT: [[ADD19_3:%.*]] = fadd fast float undef, [[ADD19_2]] -; THRESHOLD-NEXT: store float [[BIN_EXTRA5]], float* @res, align 4 -; THRESHOLD-NEXT: ret float [[BIN_EXTRA5]] +; THRESHOLD-NEXT: store float [[OP_EXTRA5]], float* @res, align 4 +; THRESHOLD-NEXT: ret float [[OP_EXTRA5]] ; entry: %0 = load i32, i32* @n, align 4 @@ -327,47 +327,53 @@ define float @bar() { ; CHECK-LABEL: @bar( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16 -; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 ; CHECK-NEXT: [[CMP4:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float [[TMP3]], float [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8 -; CHECK-NEXT: [[MUL3_1:%.*]] = fmul fast float [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], [[MUL3_1]] -; CHECK-NEXT: [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float [[MUL3_1]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4 -; CHECK-NEXT: [[MUL3_2:%.*]] = fmul fast float [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], [[MUL3_2]] -; CHECK-NEXT: [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float [[MUL3_2]] -; CHECK-NEXT: store float [[MAX_0_MUL3_2]], float* @res, align 4 -; CHECK-NEXT: ret float [[MAX_0_MUL3_2]] +; CHECK-NEXT: [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float undef, float undef +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; CHECK-NEXT: [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], [[TMP5]] +; CHECK-NEXT: [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float undef +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; CHECK-NEXT: [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], [[TMP6]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: 
[[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x float> [[TMP2]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP2]], <4 x float> [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0 +; CHECK-NEXT: [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float undef +; CHECK-NEXT: store float [[TMP7]], float* @res, align 4 +; CHECK-NEXT: ret float [[TMP7]] ; ; THRESHOLD-LABEL: @bar( ; THRESHOLD-NEXT: entry: -; THRESHOLD-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16 -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16 -; THRESHOLD-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]] -; THRESHOLD-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 -; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; THRESHOLD-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 +; THRESHOLD-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]] +; THRESHOLD-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 ; THRESHOLD-NEXT: [[CMP4:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]] -; THRESHOLD-NEXT: [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float [[TMP3]], float [[TMP4]] -; THRESHOLD-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8 -; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8 -; THRESHOLD-NEXT: [[MUL3_1:%.*]] = fmul fast float [[TMP6]], [[TMP5]] -; THRESHOLD-NEXT: [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], [[MUL3_1]] -; THRESHOLD-NEXT: [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float [[MUL3_1]] -; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4 -; THRESHOLD-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4 -; THRESHOLD-NEXT: [[MUL3_2:%.*]] = fmul fast float [[TMP8]], [[TMP7]] -; THRESHOLD-NEXT: [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], [[MUL3_2]] -; THRESHOLD-NEXT: [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float [[MUL3_2]] -; THRESHOLD-NEXT: store float [[MAX_0_MUL3_2]], float* @res, align 4 -; THRESHOLD-NEXT: ret float [[MAX_0_MUL3_2]] +; THRESHOLD-NEXT: [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float undef, float undef +; THRESHOLD-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; THRESHOLD-NEXT: [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], [[TMP5]] +; THRESHOLD-NEXT: [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float undef +; THRESHOLD-NEXT: [[TMP6:%.*]] = extractelement <4 
x float> [[TMP2]], i32 3 +; THRESHOLD-NEXT: [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], [[TMP6]] +; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> +; THRESHOLD-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x float> [[TMP2]], [[RDX_SHUF]] +; THRESHOLD-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP2]], <4 x float> [[RDX_SHUF]] +; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> undef, <4 x i32> +; THRESHOLD-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] +; THRESHOLD-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]] +; THRESHOLD-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0 +; THRESHOLD-NEXT: [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float undef +; THRESHOLD-NEXT: store float [[TMP7]], float* @res, align 4 +; THRESHOLD-NEXT: ret float [[TMP7]] ; entry: %0 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16 @@ -512,9 +518,9 @@ ; CHECK-NEXT: [[RDX_SHUF15:%.*]] = shufflevector <16 x float> [[BIN_RDX14]], <16 x float> undef, <16 x i32> ; CHECK-NEXT: [[BIN_RDX16:%.*]] = fadd fast <16 x float> [[BIN_RDX14]], [[RDX_SHUF15]] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x float> [[BIN_RDX16]], i32 0 -; CHECK-NEXT: [[BIN_RDX17:%.*]] = fadd fast float [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[ADD_47:%.*]] = fadd fast float undef, [[ADD_46]] -; CHECK-NEXT: ret float [[BIN_RDX17]] +; CHECK-NEXT: ret float [[OP_RDX]] ; ; THRESHOLD-LABEL: @f( ; THRESHOLD-NEXT: entry: @@ -635,9 +641,9 @@ ; THRESHOLD-NEXT: [[RDX_SHUF15:%.*]] = shufflevector <16 x float> [[BIN_RDX14]], <16 x float> undef, <16 x i32> ; THRESHOLD-NEXT: [[BIN_RDX16:%.*]] = fadd fast <16 x float> [[BIN_RDX14]], [[RDX_SHUF15]] ; THRESHOLD-NEXT: [[TMP5:%.*]] = extractelement <16 x float> [[BIN_RDX16]], i32 0 -; THRESHOLD-NEXT: [[BIN_RDX17:%.*]] = fadd fast float [[TMP4]], [[TMP5]] +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]] ; THRESHOLD-NEXT: [[ADD_47:%.*]] = fadd fast float undef, [[ADD_46]] -; THRESHOLD-NEXT: ret float [[BIN_RDX17]] +; THRESHOLD-NEXT: ret float [[OP_RDX]] ; entry: %0 = load float, float* %x, align 4 @@ -865,9 +871,9 @@ ; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> ; CHECK-NEXT: [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]] ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0 -; CHECK-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]] +; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]] ; CHECK-NEXT: [[ADD_31:%.*]] = fadd fast float undef, [[ADD_30]] -; CHECK-NEXT: ret float [[BIN_EXTRA]] +; CHECK-NEXT: ret float [[OP_EXTRA]] ; ; THRESHOLD-LABEL: @f1( ; THRESHOLD-NEXT: entry: @@ -948,9 +954,9 @@ ; THRESHOLD-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> ; THRESHOLD-NEXT: [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]] ; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0 -; THRESHOLD-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]] +; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]] ; 
THRESHOLD-NEXT: [[ADD_31:%.*]] = fadd fast float undef, [[ADD_30]] -; THRESHOLD-NEXT: ret float [[BIN_EXTRA]] +; THRESHOLD-NEXT: ret float [[OP_EXTRA]] ; entry: %rem = srem i32 %a, %b @@ -1138,14 +1144,14 @@ ; CHECK-NEXT: [[RDX_SHUF11:%.*]] = shufflevector <8 x float> [[BIN_RDX10]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: [[BIN_RDX12:%.*]] = fadd fast <8 x float> [[BIN_RDX10]], [[RDX_SHUF11]] ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[BIN_RDX12]], i32 0 -; CHECK-NEXT: [[BIN_RDX13:%.*]] = fadd fast float [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[RDX_SHUF14:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX15:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF14]] -; CHECK-NEXT: [[RDX_SHUF16:%.*]] = shufflevector <4 x float> [[BIN_RDX15]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX17:%.*]] = fadd fast <4 x float> [[BIN_RDX15]], [[RDX_SHUF16]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[BIN_RDX17]], i32 0 -; CHECK-NEXT: [[BIN_RDX18:%.*]] = fadd fast float [[BIN_RDX13]], [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[BIN_RDX18]], [[TMP1]] +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[RDX_SHUF13:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX14:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF13]] +; CHECK-NEXT: [[RDX_SHUF15:%.*]] = shufflevector <4 x float> [[BIN_RDX14]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX16:%.*]] = fadd fast <4 x float> [[BIN_RDX14]], [[RDX_SHUF15]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[BIN_RDX16]], i32 0 +; CHECK-NEXT: [[OP_RDX17:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[OP_RDX17]], [[TMP1]] ; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]] ; CHECK-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]] ; CHECK-NEXT: ret float [[TMP12]] @@ -1234,14 +1240,14 @@ ; THRESHOLD-NEXT: [[RDX_SHUF11:%.*]] = shufflevector <8 x float> [[BIN_RDX10]], <8 x float> undef, <8 x i32> ; THRESHOLD-NEXT: [[BIN_RDX12:%.*]] = fadd fast <8 x float> [[BIN_RDX10]], [[RDX_SHUF11]] ; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[BIN_RDX12]], i32 0 -; THRESHOLD-NEXT: [[BIN_RDX13:%.*]] = fadd fast float [[TMP8]], [[TMP9]] -; THRESHOLD-NEXT: [[RDX_SHUF14:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> -; THRESHOLD-NEXT: [[BIN_RDX15:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF14]] -; THRESHOLD-NEXT: [[RDX_SHUF16:%.*]] = shufflevector <4 x float> [[BIN_RDX15]], <4 x float> undef, <4 x i32> -; THRESHOLD-NEXT: [[BIN_RDX17:%.*]] = fadd fast <4 x float> [[BIN_RDX15]], [[RDX_SHUF16]] -; THRESHOLD-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[BIN_RDX17]], i32 0 -; THRESHOLD-NEXT: [[BIN_RDX18:%.*]] = fadd fast float [[BIN_RDX13]], [[TMP10]] -; THRESHOLD-NEXT: [[TMP11:%.*]] = fadd fast float [[BIN_RDX18]], [[TMP1]] +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]] +; THRESHOLD-NEXT: [[RDX_SHUF13:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> +; THRESHOLD-NEXT: [[BIN_RDX14:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF13]] +; THRESHOLD-NEXT: [[RDX_SHUF15:%.*]] = shufflevector <4 x float> [[BIN_RDX14]], <4 x float> undef, <4 x i32> +; THRESHOLD-NEXT: [[BIN_RDX16:%.*]] = fadd fast <4 x float> [[BIN_RDX14]], [[RDX_SHUF15]] +; THRESHOLD-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[BIN_RDX16]], i32 0 +; THRESHOLD-NEXT: 
[[OP_RDX17:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]] +; THRESHOLD-NEXT: [[TMP11:%.*]] = fadd fast float [[OP_RDX17]], [[TMP1]] ; THRESHOLD-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]] ; THRESHOLD-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]] ; THRESHOLD-NEXT: ret float [[TMP12]] @@ -1369,10 +1375,10 @@ ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] -; CHECK-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]] +; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] +; CHECK-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] ; CHECK-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]] -; CHECK-NEXT: ret float [[BIN_EXTRA5]] +; CHECK-NEXT: ret float [[OP_EXTRA5]] ; ; THRESHOLD-LABEL: @extra_args( ; THRESHOLD-NEXT: entry: @@ -1403,10 +1409,10 @@ ; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> ; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] ; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 -; THRESHOLD-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] -; THRESHOLD-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]] +; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] +; THRESHOLD-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] ; THRESHOLD-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]] -; THRESHOLD-NEXT: ret float [[BIN_EXTRA5]] +; THRESHOLD-NEXT: ret float [[OP_EXTRA5]] ; entry: %mul = mul nsw i32 %b, %a @@ -1471,12 +1477,12 @@ ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] -; CHECK-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], 5.000000e+00 -; CHECK-NEXT: [[BIN_EXTRA6:%.*]] = fadd fast float [[BIN_EXTRA5]], 5.000000e+00 -; CHECK-NEXT: [[BIN_EXTRA7:%.*]] = fadd fast float [[BIN_EXTRA6]], [[CONV]] +; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] +; CHECK-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], 5.000000e+00 +; CHECK-NEXT: [[OP_EXTRA6:%.*]] = fadd fast float [[OP_EXTRA5]], 5.000000e+00 +; CHECK-NEXT: [[OP_EXTRA7:%.*]] = fadd fast float [[OP_EXTRA6]], [[CONV]] ; CHECK-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]] -; CHECK-NEXT: ret float [[BIN_EXTRA7]] +; CHECK-NEXT: ret float [[OP_EXTRA7]] ; ; THRESHOLD-LABEL: @extra_args_same_several_times( ; THRESHOLD-NEXT: entry: @@ -1509,12 +1515,12 @@ ; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> ; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] ; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 -; THRESHOLD-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] -; THRESHOLD-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], 5.000000e+00 -; THRESHOLD-NEXT: [[BIN_EXTRA6:%.*]] = fadd fast float [[BIN_EXTRA5]], 5.000000e+00 -; THRESHOLD-NEXT: 
[[BIN_EXTRA7:%.*]] = fadd fast float [[BIN_EXTRA6]], [[CONV]] +; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] +; THRESHOLD-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], 5.000000e+00 +; THRESHOLD-NEXT: [[OP_EXTRA6:%.*]] = fadd fast float [[OP_EXTRA5]], 5.000000e+00 +; THRESHOLD-NEXT: [[OP_EXTRA7:%.*]] = fadd fast float [[OP_EXTRA6]], [[CONV]] ; THRESHOLD-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]] -; THRESHOLD-NEXT: ret float [[BIN_EXTRA7]] +; THRESHOLD-NEXT: ret float [[OP_EXTRA7]] ; entry: %mul = mul nsw i32 %b, %a @@ -1581,10 +1587,10 @@ ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] -; CHECK-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]] +; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] +; CHECK-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] ; CHECK-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]] -; CHECK-NEXT: ret float [[BIN_EXTRA5]] +; CHECK-NEXT: ret float [[OP_EXTRA5]] ; ; THRESHOLD-LABEL: @extra_args_no_replace( ; THRESHOLD-NEXT: entry: @@ -1617,10 +1623,10 @@ ; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> ; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] ; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 -; THRESHOLD-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] -; THRESHOLD-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]] +; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] +; THRESHOLD-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] ; THRESHOLD-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]] -; THRESHOLD-NEXT: ret float [[BIN_EXTRA5]] +; THRESHOLD-NEXT: ret float [[OP_EXTRA5]] ; entry: %mul = mul nsw i32 %b, %a @@ -1679,10 +1685,10 @@ ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 -; CHECK-NEXT: [[BIN_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]] -; CHECK-NEXT: [[BIN_EXTRA3:%.*]] = add nsw i32 [[BIN_EXTRA]], [[TMP9]] +; CHECK-NEXT: [[OP_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]] +; CHECK-NEXT: [[OP_EXTRA3:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP9]] ; CHECK-NEXT: [[R5:%.*]] = add nsw i32 [[R4]], undef -; CHECK-NEXT: ret i32 [[BIN_EXTRA3]] +; CHECK-NEXT: ret i32 [[OP_EXTRA3]] ; ; THRESHOLD-LABEL: @wobble( ; THRESHOLD-NEXT: bb: @@ -1707,10 +1713,10 @@ ; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> ; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] ; THRESHOLD-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 -; THRESHOLD-NEXT: [[BIN_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]] -; THRESHOLD-NEXT: [[BIN_EXTRA3:%.*]] = add nsw i32 [[BIN_EXTRA]], [[TMP9]] +; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]] +; THRESHOLD-NEXT: [[OP_EXTRA3:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP9]] ; THRESHOLD-NEXT: [[R5:%.*]] = add nsw i32 [[R4]], undef -; THRESHOLD-NEXT: ret i32 [[BIN_EXTRA3]] +; 
THRESHOLD-NEXT: ret i32 [[OP_EXTRA3]] ; bb: %x1 = xor i32 %arg, %bar Index: test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -34,79 +34,46 @@ ; CHECK-NEXT: ret i32 [[TMP23]] ; ; AVX-LABEL: @maxi8( -; AVX-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; AVX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8 -; AVX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]] -; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]] -; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4 -; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] -; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] -; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 -; AVX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] -; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 -; AVX-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -; AVX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]] -; AVX-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -; AVX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]] -; AVX-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; AVX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]] -; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]] -; AVX-NEXT: ret i32 [[TMP23]] +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16 +; AVX: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[TMP24:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[RDX_SHUF]] +; AVX-NEXT: [[BIN_RDX:%.*]] = select <8 x i1> [[TMP24]], <8 x i32> [[TMP2]], <8 x i32> [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[TMP25:%.*]] = icmp sgt <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[BIN_RDX2:%.*]] = select <8 x i1> [[TMP25]], <8 x i32> [[BIN_RDX]], <8 x i32> [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[TMP26:%.*]] = icmp sgt <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[BIN_RDX4:%.*]] = select <8 x i1> [[TMP26]], <8 x i32> [[BIN_RDX2]], <8 x i32> [[RDX_SHUF3]] +; AVX-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; AVX: ret i32 [[TMP27]] ; ; AVX2-LABEL: @maxi8( -; AVX2-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; AVX2-NEXT: 
[[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; AVX2-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8 -; AVX2-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]] -; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]] -; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4 -; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] -; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] -; AVX2-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 -; AVX2-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] -; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 -; AVX2-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -; AVX2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]] -; AVX2-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX2-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -; AVX2-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]] -; AVX2-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; AVX2-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]] -; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]] -; AVX2-NEXT: ret i32 [[TMP23]] +; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16 +; AVX2: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> +; AVX2-NEXT: [[TMP24:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[RDX_SHUF]] +; AVX2-NEXT: [[BIN_RDX:%.*]] = select <8 x i1> [[TMP24]], <8 x i32> [[TMP2]], <8 x i32> [[RDX_SHUF]] +; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; AVX2-NEXT: [[TMP25:%.*]] = icmp sgt <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX2-NEXT: [[BIN_RDX2:%.*]] = select <8 x i1> [[TMP25]], <8 x i32> [[BIN_RDX]], <8 x i32> [[RDX_SHUF1]] +; AVX2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; AVX2-NEXT: [[TMP26:%.*]] = icmp sgt <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX2-NEXT: [[BIN_RDX4:%.*]] = select <8 x i1> [[TMP26]], <8 x i32> [[BIN_RDX2]], <8 x i32> [[RDX_SHUF3]] +; AVX2-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; AVX2: ret i32 [[TMP27]] ; ; SKX-LABEL: @maxi8( -; SKX-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; SKX-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; SKX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; SKX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; SKX-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8 -; SKX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]] -; SKX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]] -; 
SKX-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4 -; SKX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] -; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] -; SKX-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 -; SKX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] -; SKX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; SKX-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 -; SKX-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -; SKX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]] -; SKX-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; SKX-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -; SKX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]] -; SKX-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; SKX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]] -; SKX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]] -; SKX-NEXT: ret i32 [[TMP23]] +; SKX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16 +; SKX: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> +; SKX-NEXT: [[TMP24:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[RDX_SHUF]] +; SKX-NEXT: [[BIN_RDX:%.*]] = select <8 x i1> [[TMP24]], <8 x i32> [[TMP2]], <8 x i32> [[RDX_SHUF]] +; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; SKX-NEXT: [[TMP25:%.*]] = icmp sgt <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; SKX-NEXT: [[BIN_RDX2:%.*]] = select <8 x i1> [[TMP25]], <8 x i32> [[BIN_RDX]], <8 x i32> [[RDX_SHUF1]] +; SKX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; SKX-NEXT: [[TMP26:%.*]] = icmp sgt <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; SKX-NEXT: [[BIN_RDX4:%.*]] = select <8 x i1> [[TMP26]], <8 x i32> [[BIN_RDX2]], <8 x i32> [[RDX_SHUF3]] +; SKX-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; SKX: ret i32 [[TMP27]] ; %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 @@ -184,151 +151,55 @@ ; CHECK-NEXT: ret i32 [[TMP47]] ; ; AVX-LABEL: @maxi16( -; AVX-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; AVX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8 -; AVX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]] -; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]] -; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4 -; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] -; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] -; AVX-NEXT: 
[[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 -; AVX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] -; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 -; AVX-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -; AVX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]] -; AVX-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -; AVX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]] -; AVX-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; AVX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]] -; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]] -; AVX-NEXT: [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16 -; AVX-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]] -; AVX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]] -; AVX-NEXT: [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4 -; AVX-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]] -; AVX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]] -; AVX-NEXT: [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8 -; AVX-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]] -; AVX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]] -; AVX-NEXT: [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4 -; AVX-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]] -; AVX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]] -; AVX-NEXT: [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16 -; AVX-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -; AVX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]] -; AVX-NEXT: [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4 -; AVX-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] -; AVX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]] -; AVX-NEXT: [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8 -; AVX-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] -; AVX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]] -; AVX-NEXT: [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4 -; AVX-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] -; AVX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]] -; AVX-NEXT: ret i32 [[TMP47]] +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr to <16 x i32>*), align 16 +; AVX: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> undef, <16 x i32> +; AVX-NEXT: [[TMP48:%.*]] = icmp sgt <16 x i32> [[TMP2]], [[RDX_SHUF]] +; AVX-NEXT: [[BIN_RDX:%.*]] = select <16 x i1> [[TMP48]], <16 x i32> [[TMP2]], <16 x i32> 
[[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[BIN_RDX]], <16 x i32> undef, <16 x i32> +; AVX-NEXT: [[TMP49:%.*]] = icmp sgt <16 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[BIN_RDX2:%.*]] = select <16 x i1> [[TMP49]], <16 x i32> [[BIN_RDX]], <16 x i32> [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x i32> [[BIN_RDX2]], <16 x i32> undef, <16 x i32> +; AVX-NEXT: [[TMP50:%.*]] = icmp sgt <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[BIN_RDX4:%.*]] = select <16 x i1> [[TMP50]], <16 x i32> [[BIN_RDX2]], <16 x i32> [[RDX_SHUF3]] +; AVX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x i32> [[BIN_RDX4]], <16 x i32> undef, <16 x i32> +; AVX-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]] +; AVX-NEXT: [[BIN_RDX6:%.*]] = select <16 x i1> [[TMP51]], <16 x i32> [[BIN_RDX4]], <16 x i32> [[RDX_SHUF5]] +; AVX-NEXT: [[TMP52:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0 +; AVX: ret i32 [[TMP52]] ; ; AVX2-LABEL: @maxi16( -; AVX2-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; AVX2-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8 -; AVX2-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]] -; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]] -; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4 -; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] -; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] -; AVX2-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 -; AVX2-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] -; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 -; AVX2-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -; AVX2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]] -; AVX2-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX2-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -; AVX2-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]] -; AVX2-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; AVX2-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]] -; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]] -; AVX2-NEXT: [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16 -; AVX2-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]] -; AVX2-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]] -; AVX2-NEXT: [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4 -; AVX2-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]] -; AVX2-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]] -; AVX2-NEXT: [[TMP30:%.*]] = load i32, i32* 
getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8 -; AVX2-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]] -; AVX2-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]] -; AVX2-NEXT: [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4 -; AVX2-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]] -; AVX2-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]] -; AVX2-NEXT: [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16 -; AVX2-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -; AVX2-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]] -; AVX2-NEXT: [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4 -; AVX2-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] -; AVX2-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]] -; AVX2-NEXT: [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8 -; AVX2-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] -; AVX2-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]] -; AVX2-NEXT: [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4 -; AVX2-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] -; AVX2-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]] -; AVX2-NEXT: ret i32 [[TMP47]] +; AVX2-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr to <16 x i32>*), align 16 +; AVX2: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> undef, <16 x i32> +; AVX2-NEXT: [[TMP48:%.*]] = icmp sgt <16 x i32> [[TMP2]], [[RDX_SHUF]] +; AVX2-NEXT: [[BIN_RDX:%.*]] = select <16 x i1> [[TMP48]], <16 x i32> [[TMP2]], <16 x i32> [[RDX_SHUF]] +; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[BIN_RDX]], <16 x i32> undef, <16 x i32> +; AVX2-NEXT: [[TMP49:%.*]] = icmp sgt <16 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX2-NEXT: [[BIN_RDX2:%.*]] = select <16 x i1> [[TMP49]], <16 x i32> [[BIN_RDX]], <16 x i32> [[RDX_SHUF1]] +; AVX2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x i32> [[BIN_RDX2]], <16 x i32> undef, <16 x i32> +; AVX2-NEXT: [[TMP50:%.*]] = icmp sgt <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX2-NEXT: [[BIN_RDX4:%.*]] = select <16 x i1> [[TMP50]], <16 x i32> [[BIN_RDX2]], <16 x i32> [[RDX_SHUF3]] +; AVX2-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x i32> [[BIN_RDX4]], <16 x i32> undef, <16 x i32> +; AVX2-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]] +; AVX2-NEXT: [[BIN_RDX6:%.*]] = select <16 x i1> [[TMP51]], <16 x i32> [[BIN_RDX4]], <16 x i32> [[RDX_SHUF5]] +; AVX2-NEXT: [[TMP52:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0 +; AVX2: ret i32 [[TMP52]] ; ; SKX-LABEL: @maxi16( -; SKX-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; SKX-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; SKX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; SKX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; SKX-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8 -; SKX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]] -; SKX-NEXT: [[TMP8:%.*]] = 
select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]] -; SKX-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4 -; SKX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] -; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] -; SKX-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 -; SKX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] -; SKX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; SKX-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 -; SKX-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -; SKX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]] -; SKX-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; SKX-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -; SKX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]] -; SKX-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; SKX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]] -; SKX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]] -; SKX-NEXT: [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16 -; SKX-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]] -; SKX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]] -; SKX-NEXT: [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4 -; SKX-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]] -; SKX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]] -; SKX-NEXT: [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8 -; SKX-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]] -; SKX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]] -; SKX-NEXT: [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4 -; SKX-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]] -; SKX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]] -; SKX-NEXT: [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16 -; SKX-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -; SKX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]] -; SKX-NEXT: [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4 -; SKX-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] -; SKX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]] -; SKX-NEXT: [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8 -; SKX-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] -; SKX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]] -; SKX-NEXT: [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4 -; SKX-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] -; SKX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]] -; SKX-NEXT: ret i32 [[TMP47]] +; SKX-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x 
i32>* bitcast ([32 x i32]* @arr to <16 x i32>*), align 16 +; SKX: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> undef, <16 x i32> +; SKX-NEXT: [[TMP48:%.*]] = icmp sgt <16 x i32> [[TMP2]], [[RDX_SHUF]] +; SKX-NEXT: [[BIN_RDX:%.*]] = select <16 x i1> [[TMP48]], <16 x i32> [[TMP2]], <16 x i32> [[RDX_SHUF]] +; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[BIN_RDX]], <16 x i32> undef, <16 x i32> +; SKX-NEXT: [[TMP49:%.*]] = icmp sgt <16 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; SKX-NEXT: [[BIN_RDX2:%.*]] = select <16 x i1> [[TMP49]], <16 x i32> [[BIN_RDX]], <16 x i32> [[RDX_SHUF1]] +; SKX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x i32> [[BIN_RDX2]], <16 x i32> undef, <16 x i32> +; SKX-NEXT: [[TMP50:%.*]] = icmp sgt <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; SKX-NEXT: [[BIN_RDX4:%.*]] = select <16 x i1> [[TMP50]], <16 x i32> [[BIN_RDX2]], <16 x i32> [[RDX_SHUF3]] +; SKX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x i32> [[BIN_RDX4]], <16 x i32> undef, <16 x i32> +; SKX-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]] +; SKX-NEXT: [[BIN_RDX6:%.*]] = select <16 x i1> [[TMP51]], <16 x i32> [[BIN_RDX4]], <16 x i32> [[RDX_SHUF5]] +; SKX-NEXT: [[TMP52:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0 +; SKX: ret i32 [[TMP52]] ; %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 @@ -381,392 +252,84 @@ define i32 @maxi32(i32) { ; CHECK-LABEL: @maxi32( -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8 -; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4 -; CHECK-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 -; CHECK-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 -; CHECK-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; CHECK-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]] -; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; CHECK-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]] -; 
CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16 -; CHECK-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]] -; CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4 -; CHECK-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]] -; CHECK-NEXT: [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8 -; CHECK-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]] -; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]] -; CHECK-NEXT: [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4 -; CHECK-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]] -; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]] -; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16 -; CHECK-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]] -; CHECK-NEXT: [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4 -; CHECK-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] -; CHECK-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]] -; CHECK-NEXT: [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8 -; CHECK-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] -; CHECK-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]] -; CHECK-NEXT: [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4 -; CHECK-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] -; CHECK-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]] -; CHECK-NEXT: [[TMP48:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 16), align 16 -; CHECK-NEXT: [[TMP49:%.*]] = icmp sgt i32 [[TMP47]], [[TMP48]] -; CHECK-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[TMP47]], i32 [[TMP48]] -; CHECK-NEXT: [[TMP51:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 17), align 4 -; CHECK-NEXT: [[TMP52:%.*]] = icmp sgt i32 [[TMP50]], [[TMP51]] -; CHECK-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], i32 [[TMP50]], i32 [[TMP51]] -; CHECK-NEXT: [[TMP54:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 18), align 8 -; CHECK-NEXT: [[TMP55:%.*]] = icmp sgt i32 [[TMP53]], [[TMP54]] -; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 [[TMP53]], i32 [[TMP54]] -; CHECK-NEXT: [[TMP57:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 19), align 4 -; CHECK-NEXT: [[TMP58:%.*]] = icmp sgt i32 [[TMP56]], [[TMP57]] -; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], i32 [[TMP56]], i32 [[TMP57]] -; CHECK-NEXT: [[TMP60:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 20), align 16 -; CHECK-NEXT: [[TMP61:%.*]] = icmp sgt i32 [[TMP59]], [[TMP60]] -; CHECK-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP59]], i32 [[TMP60]] -; CHECK-NEXT: [[TMP63:%.*]] = load i32, i32* 
getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 21), align 4 -; CHECK-NEXT: [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], [[TMP63]] -; CHECK-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], i32 [[TMP62]], i32 [[TMP63]] -; CHECK-NEXT: [[TMP66:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 22), align 8 -; CHECK-NEXT: [[TMP67:%.*]] = icmp sgt i32 [[TMP65]], [[TMP66]] -; CHECK-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], i32 [[TMP65]], i32 [[TMP66]] -; CHECK-NEXT: [[TMP69:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 23), align 4 -; CHECK-NEXT: [[TMP70:%.*]] = icmp sgt i32 [[TMP68]], [[TMP69]] -; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP68]], i32 [[TMP69]] -; CHECK-NEXT: [[TMP72:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 24), align 16 -; CHECK-NEXT: [[TMP73:%.*]] = icmp sgt i32 [[TMP71]], [[TMP72]] -; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], i32 [[TMP71]], i32 [[TMP72]] -; CHECK-NEXT: [[TMP75:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 25), align 4 -; CHECK-NEXT: [[TMP76:%.*]] = icmp sgt i32 [[TMP74]], [[TMP75]] -; CHECK-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], i32 [[TMP74]], i32 [[TMP75]] -; CHECK-NEXT: [[TMP78:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 26), align 8 -; CHECK-NEXT: [[TMP79:%.*]] = icmp sgt i32 [[TMP77]], [[TMP78]] -; CHECK-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], i32 [[TMP77]], i32 [[TMP78]] -; CHECK-NEXT: [[TMP81:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 27), align 4 -; CHECK-NEXT: [[TMP82:%.*]] = icmp sgt i32 [[TMP80]], [[TMP81]] -; CHECK-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], i32 [[TMP80]], i32 [[TMP81]] -; CHECK-NEXT: [[TMP84:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 28), align 16 -; CHECK-NEXT: [[TMP85:%.*]] = icmp sgt i32 [[TMP83]], [[TMP84]] -; CHECK-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], i32 [[TMP83]], i32 [[TMP84]] -; CHECK-NEXT: [[TMP87:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 29), align 4 -; CHECK-NEXT: [[TMP88:%.*]] = icmp sgt i32 [[TMP86]], [[TMP87]] -; CHECK-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], i32 [[TMP86]], i32 [[TMP87]] -; CHECK-NEXT: [[TMP90:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 30), align 8 -; CHECK-NEXT: [[TMP91:%.*]] = icmp sgt i32 [[TMP89]], [[TMP90]] -; CHECK-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], i32 [[TMP89]], i32 [[TMP90]] -; CHECK-NEXT: [[TMP93:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 31), align 4 -; CHECK-NEXT: [[TMP94:%.*]] = icmp sgt i32 [[TMP92]], [[TMP93]] -; CHECK-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], i32 [[TMP92]], i32 [[TMP93]] -; CHECK-NEXT: ret i32 [[TMP95]] +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16 +; CHECK: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32> +; CHECK-NEXT: [[TMP96:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]] +; CHECK-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[BIN_RDX]], <32 x i32> undef, <32 x i32> +; CHECK-NEXT: [[TMP97:%.*]] = icmp sgt <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; CHECK-NEXT: [[BIN_RDX2:%.*]] 
= select <32 x i1> [[TMP97]], <32 x i32> [[BIN_RDX]], <32 x i32> [[RDX_SHUF1]] +; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x i32> [[BIN_RDX2]], <32 x i32> undef, <32 x i32> +; CHECK-NEXT: [[TMP98:%.*]] = icmp sgt <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; CHECK-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x i32> [[BIN_RDX2]], <32 x i32> [[RDX_SHUF3]] +; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x i32> [[BIN_RDX4]], <32 x i32> undef, <32 x i32> +; CHECK-NEXT: [[TMP99:%.*]] = icmp sgt <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]] +; CHECK-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x i32> [[BIN_RDX4]], <32 x i32> [[RDX_SHUF5]] +; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[BIN_RDX6]], <32 x i32> undef, <32 x i32> +; CHECK-NEXT: [[TMP100:%.*]] = icmp sgt <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]] +; CHECK-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x i32> [[BIN_RDX6]], <32 x i32> [[RDX_SHUF7]] +; CHECK-NEXT: [[TMP101:%.*]] = extractelement <32 x i32> [[BIN_RDX8]], i32 0 +; CHECK: ret i32 [[TMP101]] ; ; AVX-LABEL: @maxi32( -; AVX-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; AVX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8 -; AVX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]] -; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]] -; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4 -; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] -; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] -; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 -; AVX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] -; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 -; AVX-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -; AVX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]] -; AVX-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -; AVX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]] -; AVX-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; AVX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]] -; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]] -; AVX-NEXT: [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16 -; AVX-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]] -; AVX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]] -; AVX-NEXT: [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4 -; AVX-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]] -; AVX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 
[[TMP27]] -; AVX-NEXT: [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8 -; AVX-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]] -; AVX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]] -; AVX-NEXT: [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4 -; AVX-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]] -; AVX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]] -; AVX-NEXT: [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16 -; AVX-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -; AVX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]] -; AVX-NEXT: [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4 -; AVX-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] -; AVX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]] -; AVX-NEXT: [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8 -; AVX-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] -; AVX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]] -; AVX-NEXT: [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4 -; AVX-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] -; AVX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]] -; AVX-NEXT: [[TMP48:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 16), align 16 -; AVX-NEXT: [[TMP49:%.*]] = icmp sgt i32 [[TMP47]], [[TMP48]] -; AVX-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[TMP47]], i32 [[TMP48]] -; AVX-NEXT: [[TMP51:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 17), align 4 -; AVX-NEXT: [[TMP52:%.*]] = icmp sgt i32 [[TMP50]], [[TMP51]] -; AVX-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], i32 [[TMP50]], i32 [[TMP51]] -; AVX-NEXT: [[TMP54:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 18), align 8 -; AVX-NEXT: [[TMP55:%.*]] = icmp sgt i32 [[TMP53]], [[TMP54]] -; AVX-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 [[TMP53]], i32 [[TMP54]] -; AVX-NEXT: [[TMP57:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 19), align 4 -; AVX-NEXT: [[TMP58:%.*]] = icmp sgt i32 [[TMP56]], [[TMP57]] -; AVX-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], i32 [[TMP56]], i32 [[TMP57]] -; AVX-NEXT: [[TMP60:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 20), align 16 -; AVX-NEXT: [[TMP61:%.*]] = icmp sgt i32 [[TMP59]], [[TMP60]] -; AVX-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP59]], i32 [[TMP60]] -; AVX-NEXT: [[TMP63:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 21), align 4 -; AVX-NEXT: [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], [[TMP63]] -; AVX-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], i32 [[TMP62]], i32 [[TMP63]] -; AVX-NEXT: [[TMP66:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 22), align 8 -; AVX-NEXT: [[TMP67:%.*]] = icmp sgt i32 [[TMP65]], [[TMP66]] -; AVX-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], i32 [[TMP65]], i32 [[TMP66]] -; AVX-NEXT: [[TMP69:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 
23), align 4 -; AVX-NEXT: [[TMP70:%.*]] = icmp sgt i32 [[TMP68]], [[TMP69]] -; AVX-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP68]], i32 [[TMP69]] -; AVX-NEXT: [[TMP72:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 24), align 16 -; AVX-NEXT: [[TMP73:%.*]] = icmp sgt i32 [[TMP71]], [[TMP72]] -; AVX-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], i32 [[TMP71]], i32 [[TMP72]] -; AVX-NEXT: [[TMP75:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 25), align 4 -; AVX-NEXT: [[TMP76:%.*]] = icmp sgt i32 [[TMP74]], [[TMP75]] -; AVX-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], i32 [[TMP74]], i32 [[TMP75]] -; AVX-NEXT: [[TMP78:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 26), align 8 -; AVX-NEXT: [[TMP79:%.*]] = icmp sgt i32 [[TMP77]], [[TMP78]] -; AVX-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], i32 [[TMP77]], i32 [[TMP78]] -; AVX-NEXT: [[TMP81:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 27), align 4 -; AVX-NEXT: [[TMP82:%.*]] = icmp sgt i32 [[TMP80]], [[TMP81]] -; AVX-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], i32 [[TMP80]], i32 [[TMP81]] -; AVX-NEXT: [[TMP84:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 28), align 16 -; AVX-NEXT: [[TMP85:%.*]] = icmp sgt i32 [[TMP83]], [[TMP84]] -; AVX-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], i32 [[TMP83]], i32 [[TMP84]] -; AVX-NEXT: [[TMP87:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 29), align 4 -; AVX-NEXT: [[TMP88:%.*]] = icmp sgt i32 [[TMP86]], [[TMP87]] -; AVX-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], i32 [[TMP86]], i32 [[TMP87]] -; AVX-NEXT: [[TMP90:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 30), align 8 -; AVX-NEXT: [[TMP91:%.*]] = icmp sgt i32 [[TMP89]], [[TMP90]] -; AVX-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], i32 [[TMP89]], i32 [[TMP90]] -; AVX-NEXT: [[TMP93:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 31), align 4 -; AVX-NEXT: [[TMP94:%.*]] = icmp sgt i32 [[TMP92]], [[TMP93]] -; AVX-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], i32 [[TMP92]], i32 [[TMP93]] -; AVX-NEXT: ret i32 [[TMP95]] +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16 +; AVX: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32> +; AVX-NEXT: [[TMP96:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]] +; AVX-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[BIN_RDX]], <32 x i32> undef, <32 x i32> +; AVX-NEXT: [[TMP97:%.*]] = icmp sgt <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[BIN_RDX2:%.*]] = select <32 x i1> [[TMP97]], <32 x i32> [[BIN_RDX]], <32 x i32> [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x i32> [[BIN_RDX2]], <32 x i32> undef, <32 x i32> +; AVX-NEXT: [[TMP98:%.*]] = icmp sgt <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x i32> [[BIN_RDX2]], <32 x i32> [[RDX_SHUF3]] +; AVX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x i32> [[BIN_RDX4]], <32 x i32> undef, <32 x i32> +; AVX-NEXT: [[TMP99:%.*]] = icmp sgt <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]] +; AVX-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x i32> [[BIN_RDX4]], <32 x i32> [[RDX_SHUF5]] +; AVX-NEXT: 
[[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[BIN_RDX6]], <32 x i32> undef, <32 x i32> +; AVX-NEXT: [[TMP100:%.*]] = icmp sgt <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]] +; AVX-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x i32> [[BIN_RDX6]], <32 x i32> [[RDX_SHUF7]] +; AVX-NEXT: [[TMP101:%.*]] = extractelement <32 x i32> [[BIN_RDX8]], i32 0 +; AVX: ret i32 [[TMP101]] ; ; AVX2-LABEL: @maxi32( -; AVX2-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; AVX2-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8 -; AVX2-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]] -; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]] -; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4 -; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] -; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] -; AVX2-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 -; AVX2-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] -; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 -; AVX2-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -; AVX2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]] -; AVX2-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX2-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -; AVX2-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]] -; AVX2-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; AVX2-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]] -; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]] -; AVX2-NEXT: [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16 -; AVX2-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]] -; AVX2-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]] -; AVX2-NEXT: [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4 -; AVX2-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]] -; AVX2-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]] -; AVX2-NEXT: [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8 -; AVX2-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]] -; AVX2-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]] -; AVX2-NEXT: [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4 -; AVX2-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]] -; AVX2-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]] -; AVX2-NEXT: [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), 
align 16 -; AVX2-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -; AVX2-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]] -; AVX2-NEXT: [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4 -; AVX2-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] -; AVX2-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]] -; AVX2-NEXT: [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8 -; AVX2-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] -; AVX2-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]] -; AVX2-NEXT: [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4 -; AVX2-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] -; AVX2-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]] -; AVX2-NEXT: [[TMP48:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 16), align 16 -; AVX2-NEXT: [[TMP49:%.*]] = icmp sgt i32 [[TMP47]], [[TMP48]] -; AVX2-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[TMP47]], i32 [[TMP48]] -; AVX2-NEXT: [[TMP51:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 17), align 4 -; AVX2-NEXT: [[TMP52:%.*]] = icmp sgt i32 [[TMP50]], [[TMP51]] -; AVX2-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], i32 [[TMP50]], i32 [[TMP51]] -; AVX2-NEXT: [[TMP54:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 18), align 8 -; AVX2-NEXT: [[TMP55:%.*]] = icmp sgt i32 [[TMP53]], [[TMP54]] -; AVX2-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 [[TMP53]], i32 [[TMP54]] -; AVX2-NEXT: [[TMP57:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 19), align 4 -; AVX2-NEXT: [[TMP58:%.*]] = icmp sgt i32 [[TMP56]], [[TMP57]] -; AVX2-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], i32 [[TMP56]], i32 [[TMP57]] -; AVX2-NEXT: [[TMP60:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 20), align 16 -; AVX2-NEXT: [[TMP61:%.*]] = icmp sgt i32 [[TMP59]], [[TMP60]] -; AVX2-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP59]], i32 [[TMP60]] -; AVX2-NEXT: [[TMP63:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 21), align 4 -; AVX2-NEXT: [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], [[TMP63]] -; AVX2-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], i32 [[TMP62]], i32 [[TMP63]] -; AVX2-NEXT: [[TMP66:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 22), align 8 -; AVX2-NEXT: [[TMP67:%.*]] = icmp sgt i32 [[TMP65]], [[TMP66]] -; AVX2-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], i32 [[TMP65]], i32 [[TMP66]] -; AVX2-NEXT: [[TMP69:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 23), align 4 -; AVX2-NEXT: [[TMP70:%.*]] = icmp sgt i32 [[TMP68]], [[TMP69]] -; AVX2-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP68]], i32 [[TMP69]] -; AVX2-NEXT: [[TMP72:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 24), align 16 -; AVX2-NEXT: [[TMP73:%.*]] = icmp sgt i32 [[TMP71]], [[TMP72]] -; AVX2-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], i32 [[TMP71]], i32 [[TMP72]] -; AVX2-NEXT: [[TMP75:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 25), align 4 -; AVX2-NEXT: [[TMP76:%.*]] = icmp sgt i32 [[TMP74]], [[TMP75]] -; 
AVX2-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], i32 [[TMP74]], i32 [[TMP75]] -; AVX2-NEXT: [[TMP78:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 26), align 8 -; AVX2-NEXT: [[TMP79:%.*]] = icmp sgt i32 [[TMP77]], [[TMP78]] -; AVX2-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], i32 [[TMP77]], i32 [[TMP78]] -; AVX2-NEXT: [[TMP81:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 27), align 4 -; AVX2-NEXT: [[TMP82:%.*]] = icmp sgt i32 [[TMP80]], [[TMP81]] -; AVX2-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], i32 [[TMP80]], i32 [[TMP81]] -; AVX2-NEXT: [[TMP84:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 28), align 16 -; AVX2-NEXT: [[TMP85:%.*]] = icmp sgt i32 [[TMP83]], [[TMP84]] -; AVX2-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], i32 [[TMP83]], i32 [[TMP84]] -; AVX2-NEXT: [[TMP87:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 29), align 4 -; AVX2-NEXT: [[TMP88:%.*]] = icmp sgt i32 [[TMP86]], [[TMP87]] -; AVX2-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], i32 [[TMP86]], i32 [[TMP87]] -; AVX2-NEXT: [[TMP90:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 30), align 8 -; AVX2-NEXT: [[TMP91:%.*]] = icmp sgt i32 [[TMP89]], [[TMP90]] -; AVX2-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], i32 [[TMP89]], i32 [[TMP90]] -; AVX2-NEXT: [[TMP93:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 31), align 4 -; AVX2-NEXT: [[TMP94:%.*]] = icmp sgt i32 [[TMP92]], [[TMP93]] -; AVX2-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], i32 [[TMP92]], i32 [[TMP93]] -; AVX2-NEXT: ret i32 [[TMP95]] +; AVX2-NEXT: [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16 +; AVX2: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32> +; AVX2-NEXT: [[TMP96:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]] +; AVX2-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]] +; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[BIN_RDX]], <32 x i32> undef, <32 x i32> +; AVX2-NEXT: [[TMP97:%.*]] = icmp sgt <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX2-NEXT: [[BIN_RDX2:%.*]] = select <32 x i1> [[TMP97]], <32 x i32> [[BIN_RDX]], <32 x i32> [[RDX_SHUF1]] +; AVX2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x i32> [[BIN_RDX2]], <32 x i32> undef, <32 x i32> +; AVX2-NEXT: [[TMP98:%.*]] = icmp sgt <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX2-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x i32> [[BIN_RDX2]], <32 x i32> [[RDX_SHUF3]] +; AVX2-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x i32> [[BIN_RDX4]], <32 x i32> undef, <32 x i32> +; AVX2-NEXT: [[TMP99:%.*]] = icmp sgt <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]] +; AVX2-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x i32> [[BIN_RDX4]], <32 x i32> [[RDX_SHUF5]] +; AVX2-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[BIN_RDX6]], <32 x i32> undef, <32 x i32> +; AVX2-NEXT: [[TMP100:%.*]] = icmp sgt <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]] +; AVX2-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x i32> [[BIN_RDX6]], <32 x i32> [[RDX_SHUF7]] +; AVX2-NEXT: [[TMP101:%.*]] = extractelement <32 x i32> [[BIN_RDX8]], i32 0 +; AVX2: ret i32 [[TMP101]] ; ; SKX-LABEL: @maxi32( -; SKX-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; SKX-NEXT: [[TMP3:%.*]] = load i32, i32* 
getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; SKX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; SKX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; SKX-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8 -; SKX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]] -; SKX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]] -; SKX-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4 -; SKX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] -; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] -; SKX-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 -; SKX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] -; SKX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; SKX-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 -; SKX-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -; SKX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]] -; SKX-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; SKX-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -; SKX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]] -; SKX-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; SKX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]] -; SKX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]] -; SKX-NEXT: [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16 -; SKX-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]] -; SKX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]] -; SKX-NEXT: [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4 -; SKX-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]] -; SKX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]] -; SKX-NEXT: [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8 -; SKX-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]] -; SKX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]] -; SKX-NEXT: [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4 -; SKX-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]] -; SKX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]] -; SKX-NEXT: [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16 -; SKX-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -; SKX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]] -; SKX-NEXT: [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4 -; SKX-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] -; SKX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]] -; SKX-NEXT: [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8 -; SKX-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] -; 
SKX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]] -; SKX-NEXT: [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4 -; SKX-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] -; SKX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]] -; SKX-NEXT: [[TMP48:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 16), align 16 -; SKX-NEXT: [[TMP49:%.*]] = icmp sgt i32 [[TMP47]], [[TMP48]] -; SKX-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[TMP47]], i32 [[TMP48]] -; SKX-NEXT: [[TMP51:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 17), align 4 -; SKX-NEXT: [[TMP52:%.*]] = icmp sgt i32 [[TMP50]], [[TMP51]] -; SKX-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], i32 [[TMP50]], i32 [[TMP51]] -; SKX-NEXT: [[TMP54:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 18), align 8 -; SKX-NEXT: [[TMP55:%.*]] = icmp sgt i32 [[TMP53]], [[TMP54]] -; SKX-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 [[TMP53]], i32 [[TMP54]] -; SKX-NEXT: [[TMP57:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 19), align 4 -; SKX-NEXT: [[TMP58:%.*]] = icmp sgt i32 [[TMP56]], [[TMP57]] -; SKX-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], i32 [[TMP56]], i32 [[TMP57]] -; SKX-NEXT: [[TMP60:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 20), align 16 -; SKX-NEXT: [[TMP61:%.*]] = icmp sgt i32 [[TMP59]], [[TMP60]] -; SKX-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP59]], i32 [[TMP60]] -; SKX-NEXT: [[TMP63:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 21), align 4 -; SKX-NEXT: [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], [[TMP63]] -; SKX-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], i32 [[TMP62]], i32 [[TMP63]] -; SKX-NEXT: [[TMP66:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 22), align 8 -; SKX-NEXT: [[TMP67:%.*]] = icmp sgt i32 [[TMP65]], [[TMP66]] -; SKX-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], i32 [[TMP65]], i32 [[TMP66]] -; SKX-NEXT: [[TMP69:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 23), align 4 -; SKX-NEXT: [[TMP70:%.*]] = icmp sgt i32 [[TMP68]], [[TMP69]] -; SKX-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP68]], i32 [[TMP69]] -; SKX-NEXT: [[TMP72:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 24), align 16 -; SKX-NEXT: [[TMP73:%.*]] = icmp sgt i32 [[TMP71]], [[TMP72]] -; SKX-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], i32 [[TMP71]], i32 [[TMP72]] -; SKX-NEXT: [[TMP75:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 25), align 4 -; SKX-NEXT: [[TMP76:%.*]] = icmp sgt i32 [[TMP74]], [[TMP75]] -; SKX-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], i32 [[TMP74]], i32 [[TMP75]] -; SKX-NEXT: [[TMP78:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 26), align 8 -; SKX-NEXT: [[TMP79:%.*]] = icmp sgt i32 [[TMP77]], [[TMP78]] -; SKX-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], i32 [[TMP77]], i32 [[TMP78]] -; SKX-NEXT: [[TMP81:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 27), align 4 -; SKX-NEXT: [[TMP82:%.*]] = icmp sgt i32 [[TMP80]], [[TMP81]] -; SKX-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], i32 [[TMP80]], i32 [[TMP81]] -; SKX-NEXT: [[TMP84:%.*]] = load i32, i32* 
getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 28), align 16 -; SKX-NEXT: [[TMP85:%.*]] = icmp sgt i32 [[TMP83]], [[TMP84]] -; SKX-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], i32 [[TMP83]], i32 [[TMP84]] -; SKX-NEXT: [[TMP87:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 29), align 4 -; SKX-NEXT: [[TMP88:%.*]] = icmp sgt i32 [[TMP86]], [[TMP87]] -; SKX-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], i32 [[TMP86]], i32 [[TMP87]] -; SKX-NEXT: [[TMP90:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 30), align 8 -; SKX-NEXT: [[TMP91:%.*]] = icmp sgt i32 [[TMP89]], [[TMP90]] -; SKX-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], i32 [[TMP89]], i32 [[TMP90]] -; SKX-NEXT: [[TMP93:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 31), align 4 -; SKX-NEXT: [[TMP94:%.*]] = icmp sgt i32 [[TMP92]], [[TMP93]] -; SKX-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], i32 [[TMP92]], i32 [[TMP93]] -; SKX-NEXT: ret i32 [[TMP95]] +; SKX-NEXT: [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16 +; SKX: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32> +; SKX-NEXT: [[TMP96:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]] +; SKX-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]] +; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[BIN_RDX]], <32 x i32> undef, <32 x i32> +; SKX-NEXT: [[TMP97:%.*]] = icmp sgt <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; SKX-NEXT: [[BIN_RDX2:%.*]] = select <32 x i1> [[TMP97]], <32 x i32> [[BIN_RDX]], <32 x i32> [[RDX_SHUF1]] +; SKX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x i32> [[BIN_RDX2]], <32 x i32> undef, <32 x i32> +; SKX-NEXT: [[TMP98:%.*]] = icmp sgt <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; SKX-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x i32> [[BIN_RDX2]], <32 x i32> [[RDX_SHUF3]] +; SKX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x i32> [[BIN_RDX4]], <32 x i32> undef, <32 x i32> +; SKX-NEXT: [[TMP99:%.*]] = icmp sgt <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]] +; SKX-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x i32> [[BIN_RDX4]], <32 x i32> [[RDX_SHUF5]] +; SKX-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[BIN_RDX6]], <32 x i32> undef, <32 x i32> +; SKX-NEXT: [[TMP100:%.*]] = icmp sgt <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]] +; SKX-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x i32> [[BIN_RDX6]], <32 x i32> [[RDX_SHUF7]] +; SKX-NEXT: [[TMP101:%.*]] = extractelement <32 x i32> [[BIN_RDX8]], i32 0 +; SKX: ret i32 [[TMP101]] ; %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 @@ -892,79 +455,46 @@ ; CHECK-NEXT: ret float [[TMP23]] ; ; AVX-LABEL: @maxf8( -; AVX-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 -; AVX-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 -; AVX-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] -; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 -; AVX-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] -; 
AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] -; AVX-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 -; AVX-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] -; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]] -; AVX-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 -; AVX-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] -; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]] -; AVX-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 -; AVX-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]] -; AVX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] -; AVX-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 -; AVX-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] -; AVX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] -; AVX-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 -; AVX-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] -; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] -; AVX-NEXT: ret float [[TMP23]] +; AVX-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr1 to <8 x float>*), align 16 +; AVX: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> undef, <8 x i32> +; AVX-NEXT: [[TMP24:%.*]] = fcmp fast ogt <8 x float> [[TMP2]], [[RDX_SHUF]] +; AVX-NEXT: [[BIN_RDX:%.*]] = select <8 x i1> [[TMP24]], <8 x float> [[TMP2]], <8 x float> [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> +; AVX-NEXT: [[TMP25:%.*]] = fcmp fast ogt <8 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[BIN_RDX2:%.*]] = select <8 x i1> [[TMP25]], <8 x float> [[BIN_RDX]], <8 x float> [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> +; AVX-NEXT: [[TMP26:%.*]] = fcmp fast ogt <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[BIN_RDX4:%.*]] = select <8 x i1> [[TMP26]], <8 x float> [[BIN_RDX2]], <8 x float> [[RDX_SHUF3]] +; AVX-NEXT: [[TMP27:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 +; AVX: ret float [[TMP27]] ; ; AVX2-LABEL: @maxf8( -; AVX2-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 -; AVX2-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 -; AVX2-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] -; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] -; AVX2-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 -; AVX2-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] -; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] -; AVX2-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 -; AVX2-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] -; AVX2-NEXT: [[TMP11:%.*]] = select i1 
[[TMP10]], float [[TMP8]], float [[TMP9]] -; AVX2-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 -; AVX2-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] -; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]] -; AVX2-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 -; AVX2-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]] -; AVX2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] -; AVX2-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 -; AVX2-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] -; AVX2-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] -; AVX2-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 -; AVX2-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] -; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] -; AVX2-NEXT: ret float [[TMP23]] +; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr1 to <8 x float>*), align 16 +; AVX2: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> undef, <8 x i32> +; AVX2-NEXT: [[TMP24:%.*]] = fcmp fast ogt <8 x float> [[TMP2]], [[RDX_SHUF]] +; AVX2-NEXT: [[BIN_RDX:%.*]] = select <8 x i1> [[TMP24]], <8 x float> [[TMP2]], <8 x float> [[RDX_SHUF]] +; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> +; AVX2-NEXT: [[TMP25:%.*]] = fcmp fast ogt <8 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX2-NEXT: [[BIN_RDX2:%.*]] = select <8 x i1> [[TMP25]], <8 x float> [[BIN_RDX]], <8 x float> [[RDX_SHUF1]] +; AVX2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> +; AVX2-NEXT: [[TMP26:%.*]] = fcmp fast ogt <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX2-NEXT: [[BIN_RDX4:%.*]] = select <8 x i1> [[TMP26]], <8 x float> [[BIN_RDX2]], <8 x float> [[RDX_SHUF3]] +; AVX2-NEXT: [[TMP27:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 +; AVX2: ret float [[TMP27]] ; ; SKX-LABEL: @maxf8( -; SKX-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 -; SKX-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 -; SKX-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] -; SKX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] -; SKX-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 -; SKX-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] -; SKX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] -; SKX-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 -; SKX-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] -; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]] -; SKX-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 -; SKX-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] -; SKX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float 
[[TMP11]], float [[TMP12]] -; SKX-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 -; SKX-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]] -; SKX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] -; SKX-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 -; SKX-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] -; SKX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] -; SKX-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 -; SKX-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] -; SKX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] -; SKX-NEXT: ret float [[TMP23]] +; SKX-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr1 to <8 x float>*), align 16 +; SKX: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> undef, <8 x i32> +; SKX-NEXT: [[TMP24:%.*]] = fcmp fast ogt <8 x float> [[TMP2]], [[RDX_SHUF]] +; SKX-NEXT: [[BIN_RDX:%.*]] = select <8 x i1> [[TMP24]], <8 x float> [[TMP2]], <8 x float> [[RDX_SHUF]] +; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> +; SKX-NEXT: [[TMP25:%.*]] = fcmp fast ogt <8 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; SKX-NEXT: [[BIN_RDX2:%.*]] = select <8 x i1> [[TMP25]], <8 x float> [[BIN_RDX]], <8 x float> [[RDX_SHUF1]] +; SKX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> +; SKX-NEXT: [[TMP26:%.*]] = fcmp fast ogt <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; SKX-NEXT: [[BIN_RDX4:%.*]] = select <8 x i1> [[TMP26]], <8 x float> [[BIN_RDX2]], <8 x float> [[RDX_SHUF3]] +; SKX-NEXT: [[TMP27:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 +; SKX: ret float [[TMP27]] ; %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 @@ -1042,151 +572,55 @@ ; CHECK-NEXT: ret float [[TMP47]] ; ; AVX-LABEL: @maxf16( -; AVX-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 -; AVX-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 -; AVX-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] -; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 -; AVX-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] -; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] -; AVX-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 -; AVX-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] -; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]] -; AVX-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 -; AVX-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] -; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]] -; AVX-NEXT: 
[[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 -; AVX-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]] -; AVX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] -; AVX-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 -; AVX-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] -; AVX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] -; AVX-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 -; AVX-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] -; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] -; AVX-NEXT: [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16 -; AVX-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]] -; AVX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]] -; AVX-NEXT: [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4 -; AVX-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]] -; AVX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]] -; AVX-NEXT: [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8 -; AVX-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]] -; AVX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]] -; AVX-NEXT: [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4 -; AVX-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]] -; AVX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]] -; AVX-NEXT: [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16 -; AVX-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]] -; AVX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]] -; AVX-NEXT: [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4 -; AVX-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]] -; AVX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]] -; AVX-NEXT: [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8 -; AVX-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]] -; AVX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]] -; AVX-NEXT: [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4 -; AVX-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]] -; AVX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]] -; AVX-NEXT: ret float [[TMP47]] +; AVX-NEXT: [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr1 to <16 x float>*), align 16 +; AVX: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> undef, <16 x i32> +; AVX-NEXT: [[TMP48:%.*]] = fcmp fast ogt <16 x float> [[TMP2]], [[RDX_SHUF]] +; AVX-NEXT: [[BIN_RDX:%.*]] = select <16 x i1> [[TMP48]], <16 x float> [[TMP2]], <16 x float> [[RDX_SHUF]] +; AVX-NEXT: 
[[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> +; AVX-NEXT: [[TMP49:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[BIN_RDX2:%.*]] = select <16 x i1> [[TMP49]], <16 x float> [[BIN_RDX]], <16 x float> [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32> +; AVX-NEXT: [[TMP50:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[BIN_RDX4:%.*]] = select <16 x i1> [[TMP50]], <16 x float> [[BIN_RDX2]], <16 x float> [[RDX_SHUF3]] +; AVX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32> +; AVX-NEXT: [[TMP51:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX4]], [[RDX_SHUF5]] +; AVX-NEXT: [[BIN_RDX6:%.*]] = select <16 x i1> [[TMP51]], <16 x float> [[BIN_RDX4]], <16 x float> [[RDX_SHUF5]] +; AVX-NEXT: [[TMP52:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0 +; AVX: ret float [[TMP52]] ; ; AVX2-LABEL: @maxf16( -; AVX2-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 -; AVX2-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 -; AVX2-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] -; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] -; AVX2-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 -; AVX2-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] -; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] -; AVX2-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 -; AVX2-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] -; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]] -; AVX2-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 -; AVX2-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] -; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]] -; AVX2-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 -; AVX2-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]] -; AVX2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] -; AVX2-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 -; AVX2-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] -; AVX2-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] -; AVX2-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 -; AVX2-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] -; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] -; AVX2-NEXT: [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16 -; AVX2-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]] -; AVX2-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]] -; AVX2-NEXT: [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4 
-; AVX2-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]] -; AVX2-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]] -; AVX2-NEXT: [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8 -; AVX2-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]] -; AVX2-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]] -; AVX2-NEXT: [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4 -; AVX2-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]] -; AVX2-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]] -; AVX2-NEXT: [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16 -; AVX2-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]] -; AVX2-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]] -; AVX2-NEXT: [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4 -; AVX2-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]] -; AVX2-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]] -; AVX2-NEXT: [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8 -; AVX2-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]] -; AVX2-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]] -; AVX2-NEXT: [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4 -; AVX2-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]] -; AVX2-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]] -; AVX2-NEXT: ret float [[TMP47]] +; AVX2-NEXT: [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr1 to <16 x float>*), align 16 +; AVX2: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> undef, <16 x i32> +; AVX2-NEXT: [[TMP48:%.*]] = fcmp fast ogt <16 x float> [[TMP2]], [[RDX_SHUF]] +; AVX2-NEXT: [[BIN_RDX:%.*]] = select <16 x i1> [[TMP48]], <16 x float> [[TMP2]], <16 x float> [[RDX_SHUF]] +; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> +; AVX2-NEXT: [[TMP49:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX2-NEXT: [[BIN_RDX2:%.*]] = select <16 x i1> [[TMP49]], <16 x float> [[BIN_RDX]], <16 x float> [[RDX_SHUF1]] +; AVX2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32> +; AVX2-NEXT: [[TMP50:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX2-NEXT: [[BIN_RDX4:%.*]] = select <16 x i1> [[TMP50]], <16 x float> [[BIN_RDX2]], <16 x float> [[RDX_SHUF3]] +; AVX2-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32> +; AVX2-NEXT: [[TMP51:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX4]], [[RDX_SHUF5]] +; AVX2-NEXT: [[BIN_RDX6:%.*]] = select <16 x i1> [[TMP51]], <16 x float> [[BIN_RDX4]], <16 x float> [[RDX_SHUF5]] +; AVX2-NEXT: [[TMP52:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0 +; AVX2: ret float [[TMP52]] ; ; SKX-LABEL: @maxf16( -; SKX-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 -; SKX-NEXT: [[TMP3:%.*]] = load float, float* getelementptr 
inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 -; SKX-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] -; SKX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] -; SKX-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 -; SKX-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] -; SKX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] -; SKX-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 -; SKX-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] -; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]] -; SKX-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 -; SKX-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] -; SKX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]] -; SKX-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 -; SKX-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]] -; SKX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] -; SKX-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 -; SKX-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] -; SKX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] -; SKX-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 -; SKX-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] -; SKX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] -; SKX-NEXT: [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16 -; SKX-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]] -; SKX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]] -; SKX-NEXT: [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4 -; SKX-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]] -; SKX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]] -; SKX-NEXT: [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8 -; SKX-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]] -; SKX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]] -; SKX-NEXT: [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4 -; SKX-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]] -; SKX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]] -; SKX-NEXT: [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16 -; SKX-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]] -; SKX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]] -; SKX-NEXT: [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4 -; SKX-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]] -; SKX-NEXT: [[TMP41:%.*]] = 
select i1 [[TMP40]], float [[TMP38]], float [[TMP39]] -; SKX-NEXT: [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8 -; SKX-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]] -; SKX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]] -; SKX-NEXT: [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4 -; SKX-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]] -; SKX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]] -; SKX-NEXT: ret float [[TMP47]] +; SKX-NEXT: [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr1 to <16 x float>*), align 16 +; SKX: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> undef, <16 x i32> +; SKX-NEXT: [[TMP48:%.*]] = fcmp fast ogt <16 x float> [[TMP2]], [[RDX_SHUF]] +; SKX-NEXT: [[BIN_RDX:%.*]] = select <16 x i1> [[TMP48]], <16 x float> [[TMP2]], <16 x float> [[RDX_SHUF]] +; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> +; SKX-NEXT: [[TMP49:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; SKX-NEXT: [[BIN_RDX2:%.*]] = select <16 x i1> [[TMP49]], <16 x float> [[BIN_RDX]], <16 x float> [[RDX_SHUF1]] +; SKX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32> +; SKX-NEXT: [[TMP50:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; SKX-NEXT: [[BIN_RDX4:%.*]] = select <16 x i1> [[TMP50]], <16 x float> [[BIN_RDX2]], <16 x float> [[RDX_SHUF3]] +; SKX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32> +; SKX-NEXT: [[TMP51:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX4]], [[RDX_SHUF5]] +; SKX-NEXT: [[BIN_RDX6:%.*]] = select <16 x i1> [[TMP51]], <16 x float> [[BIN_RDX4]], <16 x float> [[RDX_SHUF5]] +; SKX-NEXT: [[TMP52:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0 +; SKX: ret float [[TMP52]] ; %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 @@ -1336,295 +770,64 @@ ; CHECK-NEXT: ret float [[TMP95]] ; ; AVX-LABEL: @maxf32( -; AVX-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 -; AVX-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 -; AVX-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] -; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 -; AVX-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] -; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] -; AVX-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 -; AVX-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] -; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]] -; AVX-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 -; AVX-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] -; AVX-NEXT: [[TMP14:%.*]] = 
select i1 [[TMP13]], float [[TMP11]], float [[TMP12]] -; AVX-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 -; AVX-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]] -; AVX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] -; AVX-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 -; AVX-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] -; AVX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] -; AVX-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 -; AVX-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] -; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] -; AVX-NEXT: [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16 -; AVX-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]] -; AVX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]] -; AVX-NEXT: [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4 -; AVX-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]] -; AVX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]] -; AVX-NEXT: [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8 -; AVX-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]] -; AVX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]] -; AVX-NEXT: [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4 -; AVX-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]] -; AVX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]] -; AVX-NEXT: [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16 -; AVX-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]] -; AVX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]] -; AVX-NEXT: [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4 -; AVX-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]] -; AVX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]] -; AVX-NEXT: [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8 -; AVX-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]] -; AVX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]] -; AVX-NEXT: [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4 -; AVX-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]] -; AVX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]] -; AVX-NEXT: [[TMP48:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 16), align 16 -; AVX-NEXT: [[TMP49:%.*]] = fcmp fast ogt float [[TMP47]], [[TMP48]] -; AVX-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], float [[TMP47]], float [[TMP48]] -; AVX-NEXT: [[TMP51:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x 
float]* @arr1, i64 0, i64 17), align 4 -; AVX-NEXT: [[TMP52:%.*]] = fcmp fast ogt float [[TMP50]], [[TMP51]] -; AVX-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], float [[TMP50]], float [[TMP51]] -; AVX-NEXT: [[TMP54:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 18), align 8 -; AVX-NEXT: [[TMP55:%.*]] = fcmp fast ogt float [[TMP53]], [[TMP54]] -; AVX-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], float [[TMP53]], float [[TMP54]] -; AVX-NEXT: [[TMP57:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 19), align 4 -; AVX-NEXT: [[TMP58:%.*]] = fcmp fast ogt float [[TMP56]], [[TMP57]] -; AVX-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], float [[TMP56]], float [[TMP57]] -; AVX-NEXT: [[TMP60:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 20), align 16 -; AVX-NEXT: [[TMP61:%.*]] = fcmp fast ogt float [[TMP59]], [[TMP60]] -; AVX-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], float [[TMP59]], float [[TMP60]] -; AVX-NEXT: [[TMP63:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 21), align 4 -; AVX-NEXT: [[TMP64:%.*]] = fcmp fast ogt float [[TMP62]], [[TMP63]] -; AVX-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], float [[TMP62]], float [[TMP63]] -; AVX-NEXT: [[TMP66:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 22), align 8 -; AVX-NEXT: [[TMP67:%.*]] = fcmp fast ogt float [[TMP65]], [[TMP66]] -; AVX-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], float [[TMP65]], float [[TMP66]] -; AVX-NEXT: [[TMP69:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 23), align 4 -; AVX-NEXT: [[TMP70:%.*]] = fcmp fast ogt float [[TMP68]], [[TMP69]] -; AVX-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], float [[TMP68]], float [[TMP69]] -; AVX-NEXT: [[TMP72:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 24), align 16 -; AVX-NEXT: [[TMP73:%.*]] = fcmp fast ogt float [[TMP71]], [[TMP72]] -; AVX-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], float [[TMP71]], float [[TMP72]] -; AVX-NEXT: [[TMP75:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 25), align 4 -; AVX-NEXT: [[TMP76:%.*]] = fcmp fast ogt float [[TMP74]], [[TMP75]] -; AVX-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], float [[TMP74]], float [[TMP75]] -; AVX-NEXT: [[TMP78:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 26), align 8 -; AVX-NEXT: [[TMP79:%.*]] = fcmp fast ogt float [[TMP77]], [[TMP78]] -; AVX-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], float [[TMP77]], float [[TMP78]] -; AVX-NEXT: [[TMP81:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 27), align 4 -; AVX-NEXT: [[TMP82:%.*]] = fcmp fast ogt float [[TMP80]], [[TMP81]] -; AVX-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], float [[TMP80]], float [[TMP81]] -; AVX-NEXT: [[TMP84:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 28), align 16 -; AVX-NEXT: [[TMP85:%.*]] = fcmp fast ogt float [[TMP83]], [[TMP84]] -; AVX-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], float [[TMP83]], float [[TMP84]] -; AVX-NEXT: [[TMP87:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 29), align 4 -; AVX-NEXT: [[TMP88:%.*]] = fcmp fast ogt float [[TMP86]], [[TMP87]] -; AVX-NEXT: [[TMP89:%.*]] = 
select i1 [[TMP88]], float [[TMP86]], float [[TMP87]] -; AVX-NEXT: [[TMP90:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 30), align 8 -; AVX-NEXT: [[TMP91:%.*]] = fcmp fast ogt float [[TMP89]], [[TMP90]] -; AVX-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], float [[TMP89]], float [[TMP90]] -; AVX-NEXT: [[TMP93:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 31), align 4 -; AVX-NEXT: [[TMP94:%.*]] = fcmp fast ogt float [[TMP92]], [[TMP93]] -; AVX-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], float [[TMP92]], float [[TMP93]] -; AVX-NEXT: ret float [[TMP95]] +; AVX-NEXT: [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16 +; AVX: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP2]], <32 x float> undef, <32 x i32> +; AVX-NEXT: [[TMP96:%.*]] = fcmp fast ogt <32 x float> [[TMP2]], [[RDX_SHUF]] +; AVX-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x float> [[TMP2]], <32 x float> [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> +; AVX-NEXT: [[TMP97:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[BIN_RDX2:%.*]] = select <32 x i1> [[TMP97]], <32 x float> [[BIN_RDX]], <32 x float> [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> +; AVX-NEXT: [[TMP98:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x float> [[BIN_RDX2]], <32 x float> [[RDX_SHUF3]] +; AVX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> +; AVX-NEXT: [[TMP99:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]] +; AVX-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x float> [[BIN_RDX4]], <32 x float> [[RDX_SHUF5]] +; AVX-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> +; AVX-NEXT: [[TMP100:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]] +; AVX-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x float> [[BIN_RDX6]], <32 x float> [[RDX_SHUF7]] +; AVX-NEXT: [[TMP101:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0 +; AVX: ret float [[TMP101]] ; ; AVX2-LABEL: @maxf32( -; AVX2-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 -; AVX2-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 -; AVX2-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] -; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] -; AVX2-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 -; AVX2-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] -; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] -; AVX2-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 -; AVX2-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] -; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]] -; AVX2-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 -; AVX2-NEXT: [[TMP13:%.*]] = fcmp fast ogt float 
[[TMP11]], [[TMP12]] -; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]] -; AVX2-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 -; AVX2-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]] -; AVX2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] -; AVX2-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 -; AVX2-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] -; AVX2-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] -; AVX2-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 -; AVX2-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] -; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] -; AVX2-NEXT: [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16 -; AVX2-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]] -; AVX2-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]] -; AVX2-NEXT: [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4 -; AVX2-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]] -; AVX2-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]] -; AVX2-NEXT: [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8 -; AVX2-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]] -; AVX2-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]] -; AVX2-NEXT: [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4 -; AVX2-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]] -; AVX2-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]] -; AVX2-NEXT: [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16 -; AVX2-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]] -; AVX2-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]] -; AVX2-NEXT: [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4 -; AVX2-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]] -; AVX2-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]] -; AVX2-NEXT: [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8 -; AVX2-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]] -; AVX2-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]] -; AVX2-NEXT: [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4 -; AVX2-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]] -; AVX2-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]] -; AVX2-NEXT: [[TMP48:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 16), align 16 -; AVX2-NEXT: [[TMP49:%.*]] = fcmp fast ogt float [[TMP47]], [[TMP48]] -; AVX2-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], float [[TMP47]], float [[TMP48]] -; 
AVX2-NEXT: [[TMP51:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 17), align 4 -; AVX2-NEXT: [[TMP52:%.*]] = fcmp fast ogt float [[TMP50]], [[TMP51]] -; AVX2-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], float [[TMP50]], float [[TMP51]] -; AVX2-NEXT: [[TMP54:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 18), align 8 -; AVX2-NEXT: [[TMP55:%.*]] = fcmp fast ogt float [[TMP53]], [[TMP54]] -; AVX2-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], float [[TMP53]], float [[TMP54]] -; AVX2-NEXT: [[TMP57:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 19), align 4 -; AVX2-NEXT: [[TMP58:%.*]] = fcmp fast ogt float [[TMP56]], [[TMP57]] -; AVX2-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], float [[TMP56]], float [[TMP57]] -; AVX2-NEXT: [[TMP60:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 20), align 16 -; AVX2-NEXT: [[TMP61:%.*]] = fcmp fast ogt float [[TMP59]], [[TMP60]] -; AVX2-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], float [[TMP59]], float [[TMP60]] -; AVX2-NEXT: [[TMP63:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 21), align 4 -; AVX2-NEXT: [[TMP64:%.*]] = fcmp fast ogt float [[TMP62]], [[TMP63]] -; AVX2-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], float [[TMP62]], float [[TMP63]] -; AVX2-NEXT: [[TMP66:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 22), align 8 -; AVX2-NEXT: [[TMP67:%.*]] = fcmp fast ogt float [[TMP65]], [[TMP66]] -; AVX2-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], float [[TMP65]], float [[TMP66]] -; AVX2-NEXT: [[TMP69:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 23), align 4 -; AVX2-NEXT: [[TMP70:%.*]] = fcmp fast ogt float [[TMP68]], [[TMP69]] -; AVX2-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], float [[TMP68]], float [[TMP69]] -; AVX2-NEXT: [[TMP72:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 24), align 16 -; AVX2-NEXT: [[TMP73:%.*]] = fcmp fast ogt float [[TMP71]], [[TMP72]] -; AVX2-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], float [[TMP71]], float [[TMP72]] -; AVX2-NEXT: [[TMP75:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 25), align 4 -; AVX2-NEXT: [[TMP76:%.*]] = fcmp fast ogt float [[TMP74]], [[TMP75]] -; AVX2-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], float [[TMP74]], float [[TMP75]] -; AVX2-NEXT: [[TMP78:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 26), align 8 -; AVX2-NEXT: [[TMP79:%.*]] = fcmp fast ogt float [[TMP77]], [[TMP78]] -; AVX2-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], float [[TMP77]], float [[TMP78]] -; AVX2-NEXT: [[TMP81:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 27), align 4 -; AVX2-NEXT: [[TMP82:%.*]] = fcmp fast ogt float [[TMP80]], [[TMP81]] -; AVX2-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], float [[TMP80]], float [[TMP81]] -; AVX2-NEXT: [[TMP84:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 28), align 16 -; AVX2-NEXT: [[TMP85:%.*]] = fcmp fast ogt float [[TMP83]], [[TMP84]] -; AVX2-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], float [[TMP83]], float [[TMP84]] -; AVX2-NEXT: [[TMP87:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, 
i64 0, i64 29), align 4 -; AVX2-NEXT: [[TMP88:%.*]] = fcmp fast ogt float [[TMP86]], [[TMP87]] -; AVX2-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], float [[TMP86]], float [[TMP87]] -; AVX2-NEXT: [[TMP90:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 30), align 8 -; AVX2-NEXT: [[TMP91:%.*]] = fcmp fast ogt float [[TMP89]], [[TMP90]] -; AVX2-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], float [[TMP89]], float [[TMP90]] -; AVX2-NEXT: [[TMP93:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 31), align 4 -; AVX2-NEXT: [[TMP94:%.*]] = fcmp fast ogt float [[TMP92]], [[TMP93]] -; AVX2-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], float [[TMP92]], float [[TMP93]] -; AVX2-NEXT: ret float [[TMP95]] +; AVX2-NEXT: [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16 +; AVX2: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP2]], <32 x float> undef, <32 x i32> +; AVX2-NEXT: [[TMP96:%.*]] = fcmp fast ogt <32 x float> [[TMP2]], [[RDX_SHUF]] +; AVX2-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x float> [[TMP2]], <32 x float> [[RDX_SHUF]] +; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> +; AVX2-NEXT: [[TMP97:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX2-NEXT: [[BIN_RDX2:%.*]] = select <32 x i1> [[TMP97]], <32 x float> [[BIN_RDX]], <32 x float> [[RDX_SHUF1]] +; AVX2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> +; AVX2-NEXT: [[TMP98:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX2-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x float> [[BIN_RDX2]], <32 x float> [[RDX_SHUF3]] +; AVX2-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> +; AVX2-NEXT: [[TMP99:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]] +; AVX2-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x float> [[BIN_RDX4]], <32 x float> [[RDX_SHUF5]] +; AVX2-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> +; AVX2-NEXT: [[TMP100:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]] +; AVX2-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x float> [[BIN_RDX6]], <32 x float> [[RDX_SHUF7]] +; AVX2-NEXT: [[TMP101:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0 +; AVX2: ret float [[TMP101]] ; ; SKX-LABEL: @maxf32( -; SKX-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 -; SKX-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 -; SKX-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] -; SKX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] -; SKX-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 -; SKX-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] -; SKX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] -; SKX-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 -; SKX-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] -; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]] -; SKX-NEXT: [[TMP12:%.*]] = load float, float* 
getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 -; SKX-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] -; SKX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]] -; SKX-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 -; SKX-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]] -; SKX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] -; SKX-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 -; SKX-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] -; SKX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] -; SKX-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 -; SKX-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] -; SKX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] -; SKX-NEXT: [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16 -; SKX-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]] -; SKX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]] -; SKX-NEXT: [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4 -; SKX-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]] -; SKX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]] -; SKX-NEXT: [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8 -; SKX-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]] -; SKX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]] -; SKX-NEXT: [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4 -; SKX-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]] -; SKX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]] -; SKX-NEXT: [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16 -; SKX-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]] -; SKX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]] -; SKX-NEXT: [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4 -; SKX-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]] -; SKX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]] -; SKX-NEXT: [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8 -; SKX-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]] -; SKX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]] -; SKX-NEXT: [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4 -; SKX-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]] -; SKX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]] -; SKX-NEXT: [[TMP48:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 16), align 16 -; SKX-NEXT: [[TMP49:%.*]] = fcmp fast ogt float [[TMP47]], 
[[TMP48]] -; SKX-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], float [[TMP47]], float [[TMP48]] -; SKX-NEXT: [[TMP51:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 17), align 4 -; SKX-NEXT: [[TMP52:%.*]] = fcmp fast ogt float [[TMP50]], [[TMP51]] -; SKX-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], float [[TMP50]], float [[TMP51]] -; SKX-NEXT: [[TMP54:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 18), align 8 -; SKX-NEXT: [[TMP55:%.*]] = fcmp fast ogt float [[TMP53]], [[TMP54]] -; SKX-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], float [[TMP53]], float [[TMP54]] -; SKX-NEXT: [[TMP57:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 19), align 4 -; SKX-NEXT: [[TMP58:%.*]] = fcmp fast ogt float [[TMP56]], [[TMP57]] -; SKX-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], float [[TMP56]], float [[TMP57]] -; SKX-NEXT: [[TMP60:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 20), align 16 -; SKX-NEXT: [[TMP61:%.*]] = fcmp fast ogt float [[TMP59]], [[TMP60]] -; SKX-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], float [[TMP59]], float [[TMP60]] -; SKX-NEXT: [[TMP63:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 21), align 4 -; SKX-NEXT: [[TMP64:%.*]] = fcmp fast ogt float [[TMP62]], [[TMP63]] -; SKX-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], float [[TMP62]], float [[TMP63]] -; SKX-NEXT: [[TMP66:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 22), align 8 -; SKX-NEXT: [[TMP67:%.*]] = fcmp fast ogt float [[TMP65]], [[TMP66]] -; SKX-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], float [[TMP65]], float [[TMP66]] -; SKX-NEXT: [[TMP69:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 23), align 4 -; SKX-NEXT: [[TMP70:%.*]] = fcmp fast ogt float [[TMP68]], [[TMP69]] -; SKX-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], float [[TMP68]], float [[TMP69]] -; SKX-NEXT: [[TMP72:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 24), align 16 -; SKX-NEXT: [[TMP73:%.*]] = fcmp fast ogt float [[TMP71]], [[TMP72]] -; SKX-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], float [[TMP71]], float [[TMP72]] -; SKX-NEXT: [[TMP75:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 25), align 4 -; SKX-NEXT: [[TMP76:%.*]] = fcmp fast ogt float [[TMP74]], [[TMP75]] -; SKX-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], float [[TMP74]], float [[TMP75]] -; SKX-NEXT: [[TMP78:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 26), align 8 -; SKX-NEXT: [[TMP79:%.*]] = fcmp fast ogt float [[TMP77]], [[TMP78]] -; SKX-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], float [[TMP77]], float [[TMP78]] -; SKX-NEXT: [[TMP81:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 27), align 4 -; SKX-NEXT: [[TMP82:%.*]] = fcmp fast ogt float [[TMP80]], [[TMP81]] -; SKX-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], float [[TMP80]], float [[TMP81]] -; SKX-NEXT: [[TMP84:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 28), align 16 -; SKX-NEXT: [[TMP85:%.*]] = fcmp fast ogt float [[TMP83]], [[TMP84]] -; SKX-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], float [[TMP83]], float [[TMP84]] -; SKX-NEXT: [[TMP87:%.*]] = load float, float* 
getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 29), align 4 -; SKX-NEXT: [[TMP88:%.*]] = fcmp fast ogt float [[TMP86]], [[TMP87]] -; SKX-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], float [[TMP86]], float [[TMP87]] -; SKX-NEXT: [[TMP90:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 30), align 8 -; SKX-NEXT: [[TMP91:%.*]] = fcmp fast ogt float [[TMP89]], [[TMP90]] -; SKX-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], float [[TMP89]], float [[TMP90]] -; SKX-NEXT: [[TMP93:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 31), align 4 -; SKX-NEXT: [[TMP94:%.*]] = fcmp fast ogt float [[TMP92]], [[TMP93]] -; SKX-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], float [[TMP92]], float [[TMP93]] -; SKX-NEXT: ret float [[TMP95]] +; SKX-NEXT: [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16 +; SKX: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP2]], <32 x float> undef, <32 x i32> +; SKX-NEXT: [[TMP96:%.*]] = fcmp fast ogt <32 x float> [[TMP2]], [[RDX_SHUF]] +; SKX-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x float> [[TMP2]], <32 x float> [[RDX_SHUF]] +; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> +; SKX-NEXT: [[TMP97:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; SKX-NEXT: [[BIN_RDX2:%.*]] = select <32 x i1> [[TMP97]], <32 x float> [[BIN_RDX]], <32 x float> [[RDX_SHUF1]] +; SKX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> +; SKX-NEXT: [[TMP98:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; SKX-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x float> [[BIN_RDX2]], <32 x float> [[RDX_SHUF3]] +; SKX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> +; SKX-NEXT: [[TMP99:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]] +; SKX-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x float> [[BIN_RDX4]], <32 x float> [[RDX_SHUF5]] +; SKX-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> +; SKX-NEXT: [[TMP100:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]] +; SKX-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x float> [[BIN_RDX6]], <32 x float> [[RDX_SHUF7]] +; SKX-NEXT: [[TMP101:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0 +; SKX: ret float [[TMP101]] ; %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 Index: test/Transforms/SLPVectorizer/X86/horizontal.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/horizontal.ll +++ test/Transforms/SLPVectorizer/X86/horizontal.ll @@ -817,22 +817,22 @@ define void @i32_red_call(i32 %val) { ; CHECK-LABEL: @i32_red_call( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8 -; 
CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]] -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4 -; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16 -; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP4]], [[ADD_2]] -; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4 -; CHECK-NEXT: [[ADD_4:%.*]] = add nsw i32 [[TMP5]], [[ADD_3]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8 -; CHECK-NEXT: [[ADD_5:%.*]] = add nsw i32 [[TMP6]], [[ADD_4]] -; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4 -; CHECK-NEXT: [[ADD_6:%.*]] = add nsw i32 [[TMP7]], [[ADD_5]] -; CHECK-NEXT: [[RES:%.*]] = call i32 @foobar(i32 [[ADD_6]]) +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 undef, undef +; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 undef, [[ADD]] +; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 undef, [[ADD_1]] +; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 undef, [[ADD_2]] +; CHECK-NEXT: [[ADD_4:%.*]] = add nsw i32 undef, [[ADD_3]] +; CHECK-NEXT: [[ADD_5:%.*]] = add nsw i32 undef, [[ADD_4]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = add nsw <8 x i32> [[TMP0]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = add nsw <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[BIN_RDX4:%.*]] = add nsw <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; CHECK-NEXT: [[ADD_6:%.*]] = add nsw i32 undef, [[ADD_5]] +; CHECK-NEXT: [[RES:%.*]] = call i32 @foobar(i32 [[TMP1]]) ; CHECK-NEXT: ret void ; entry: @@ -858,22 +858,22 @@ define void @i32_red_invoke(i32 %val) personality i32 (...)* @__gxx_personality_v0 { ; CHECK-LABEL: @i32_red_invoke( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8 -; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]] -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4 -; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16 -; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP4]], [[ADD_2]] -; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4 -; CHECK-NEXT: [[ADD_4:%.*]] = add nsw i32 [[TMP5]], [[ADD_3]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, 
i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8 -; CHECK-NEXT: [[ADD_5:%.*]] = add nsw i32 [[TMP6]], [[ADD_4]] -; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4 -; CHECK-NEXT: [[ADD_6:%.*]] = add nsw i32 [[TMP7]], [[ADD_5]] -; CHECK-NEXT: [[RES:%.*]] = invoke i32 @foobar(i32 [[ADD_6]]) +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 undef, undef +; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 undef, [[ADD]] +; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 undef, [[ADD_1]] +; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 undef, [[ADD_2]] +; CHECK-NEXT: [[ADD_4:%.*]] = add nsw i32 undef, [[ADD_3]] +; CHECK-NEXT: [[ADD_5:%.*]] = add nsw i32 undef, [[ADD_4]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = add nsw <8 x i32> [[TMP0]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = add nsw <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[BIN_RDX4:%.*]] = add nsw <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; CHECK-NEXT: [[ADD_6:%.*]] = add nsw i32 undef, [[ADD_5]] +; CHECK-NEXT: [[RES:%.*]] = invoke i32 @foobar(i32 [[TMP1]]) ; CHECK-NEXT: to label [[NORMAL:%.*]] unwind label [[EXCEPTION:%.*]] ; CHECK: exception: ; CHECK-NEXT: [[CLEANUP:%.*]] = landingpad i8 Index: test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -303,24 +303,30 @@ ; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> %b, i32 1 ; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x float> %b, i32 2 ; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x float> %b, i32 3 -; CHECK-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0 -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C3]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]] -; CHECK-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A3]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B3]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP5]], <2 x float> [[TMP7]] -; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[S0]], i32 0 -; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 -; CHECK-NEXT: 
[[RC:%.*]] = insertelement <4 x float> undef, float [[TMP9]], i32 2 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 -; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP10]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> undef, float [[A0]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> undef, float [[B0]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 +; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 +; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP18]], i32 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i32 0 +; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP19]], i32 2 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i32 1 +; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP20]], i32 3 ; CHECK-NEXT: ret <4 x float> [[RD]] ; ; ZEROTHRESH-LABEL: @simple_select_no_users(