diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -33,25 +33,31 @@ /// These are the kinds of recurrences that we support. enum class RecurKind { - None, ///< Not a recurrence. - Add, ///< Sum of integers. - Mul, ///< Product of integers. - Or, ///< Bitwise or logical OR of integers. - And, ///< Bitwise or logical AND of integers. - Xor, ///< Bitwise or logical XOR of integers. - SMin, ///< Signed integer min implemented in terms of select(cmp()). - SMax, ///< Signed integer max implemented in terms of select(cmp()). - UMin, ///< Unisgned integer min implemented in terms of select(cmp()). - UMax, ///< Unsigned integer max implemented in terms of select(cmp()). - FAdd, ///< Sum of floats. - FMul, ///< Product of floats. - FMin, ///< FP min implemented in terms of select(cmp()). - FMax, ///< FP max implemented in terms of select(cmp()). - FMulAdd, ///< Fused multiply-add of floats (a * b + c). - SelectICmp, ///< Integer select(icmp(),x,y) where one of (x,y) is loop - ///< invariant - SelectFCmp ///< Integer select(fcmp(),x,y) where one of (x,y) is loop - ///< invariant + None, ///< Not a recurrence. + Add, ///< Sum of integers. + Mul, ///< Product of integers. + Or, ///< Bitwise or logical OR of integers. + And, ///< Bitwise or logical AND of integers. + Xor, ///< Bitwise or logical XOR of integers. + SMin, ///< Signed integer min implemented in terms of select(cmp()). + SMax, ///< Signed integer max implemented in terms of select(cmp()). + UMin, ///< Unisgned integer min implemented in terms of select(cmp()). + UMax, ///< Unsigned integer max implemented in terms of select(cmp()). + FAdd, ///< Sum of floats. + FMul, ///< Product of floats. + FMin, ///< FP min implemented in terms of select(cmp()). + FMax, ///< FP max implemented in terms of select(cmp()). + FMulAdd, ///< Fused multiply-add of floats (a * b + c). 
+ SelectICmp, ///< Integer select(icmp(),x,y) where one of (x,y) is loop + ///< invariant + SelectFCmp, ///< Integer select(fcmp(),x,y) where one of (x,y) is loop + ///< invariant + SelectIVICmp, ///< Integer select(icmp(),x,y) where one of (x,y) is increasing + ///< loop induction PHI + SelectIVFCmp, ///< Integer select(fcmp(),x,y) where one of (x,y) is increasing + ///< loop induction PHI + MinMaxFirstIdx, ///< Min/Max with first index + MinMaxLastIdx ///< Min/Max with last index }; /// The RecurrenceDescriptor is used to identify recurrences variables in a @@ -74,11 +80,13 @@ RecurKind K, FastMathFlags FMF, Instruction *ExactFP, Type *RT, bool Signed, bool Ordered, SmallPtrSetImpl<Instruction *> &CI, - unsigned MinWidthCastToRecurTy) + unsigned MinWidthCastToRecurTy, PHINode *UserRecurPhi, + RecurKind UserRecurKind) : IntermediateStore(Store), StartValue(Start), LoopExitInstr(Exit), Kind(K), FMF(FMF), ExactFPMathInst(ExactFP), RecurrenceType(RT), IsSigned(Signed), IsOrdered(Ordered), - MinWidthCastToRecurrenceType(MinWidthCastToRecurTy) { + MinWidthCastToRecurrenceType(MinWidthCastToRecurTy), + UserRecurPhi(UserRecurPhi), UserRecurKind(UserRecurKind) { CastInsts.insert(CI.begin(), CI.end()); } @@ -93,6 +101,12 @@ : IsRecurrence(true), PatternLastInst(I), RecKind(K), ExactFPMathInst(ExactFP) {} + InstDesc(bool IsRecur, Instruction *I, PHINode *CandUserRecurPhi, + RecurKind CandUserRecurKind, Instruction *ExactFP = nullptr) + : IsRecurrence(IsRecur), PatternLastInst(I), RecKind(RecurKind::None), + ExactFPMathInst(ExactFP), CandUserRecurPhi(CandUserRecurPhi), + CandUserRecurKind(CandUserRecurKind) {} + bool isRecurrence() const { return IsRecurrence; } bool needsExactFPMath() const { return ExactFPMathInst != nullptr; } @@ -103,6 +117,14 @@ Instruction *getPatternInst() const { return PatternLastInst; } + PHINode *getCandUserRecurPhi() const { return CandUserRecurPhi; } + + RecurKind getCandUserRecurKind() const { return CandUserRecurKind; } + + bool 
isCandidateUser() const { + return getCandUserRecurPhi() && getCandUserRecurKind() != RecurKind::None; + } + private: // Is this instruction a recurrence candidate. bool IsRecurrence; @@ -113,6 +135,11 @@ RecurKind RecKind; // Recurrence does not allow floating-point reassociation. Instruction *ExactFPMathInst; + // This instruction may be the operation of another recurrence. + // Record potential recurrence phi. + PHINode *CandUserRecurPhi = nullptr; + // And expected recurrence kind. + RecurKind CandUserRecurKind = RecurKind::None; }; /// Returns a struct describing if the instruction 'I' can be a recurrence @@ -123,7 +150,7 @@ /// the returned struct. static InstDesc isRecurrenceInstr(Loop *L, PHINode *Phi, Instruction *I, RecurKind Kind, InstDesc &Prev, - FastMathFlags FuncFMF); + FastMathFlags FuncFMF, ScalarEvolution *SE); /// Returns true if instruction I has multiple uses in Insts static bool hasMultipleUsesOf(Instruction *I, @@ -139,7 +166,18 @@ /// Kind. \p Prev specifies the description of an already processed select /// instruction, so its corresponding cmp can be matched to it. static InstDesc isMinMaxPattern(Instruction *I, RecurKind Kind, - const InstDesc &Prev); + const InstDesc &Prev, Loop *Loop, + PHINode *OrigPhi, ScalarEvolution *SE); + + /// Returns RecurKind describing which min/max recurrence kind the instruction + /// \p I belongs to. Return RecurKind::None if instruction \p I is not matched + /// any of min/max recurrence kind. Unlike isMinMaxPattern, this function does + /// not limit exactly one use of cmp value. + static RecurKind isMinMaxOperation(Instruction *I); + + static InstDesc isMinMaxIdxPattern(Loop *Loop, Instruction *I, + PHINode *MinMaxPhi, RecurKind MinMaxKind, + ScalarEvolution *SE); /// Returns a struct describing whether the instruction is either a /// Select(ICmp(A, B), X, Y), or @@ -148,7 +186,8 @@ /// value. 
\p Prev specifies the description of an already processed select /// instruction, so its corresponding cmp can be matched to it. static InstDesc isSelectCmpPattern(Loop *Loop, PHINode *OrigPhi, - Instruction *I, InstDesc &Prev); + Instruction *I, InstDesc &Prev, + ScalarEvolution *SE); /// Returns a struct describing if the instruction is a /// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern. @@ -234,10 +273,23 @@ return isIntMinMaxRecurrenceKind(Kind) || isFPMinMaxRecurrenceKind(Kind); } + /// Returns true if the recurrence kind is a max kind. + static bool isMaxRecurrenceKind(RecurKind Kind) { + return Kind == RecurKind::UMax || Kind == RecurKind::SMax || + Kind == RecurKind::FMax; + } + + static bool isMinMaxIdxRecurrenceKind(RecurKind Kind) { + return Kind == RecurKind::MinMaxFirstIdx || + Kind == RecurKind::MinMaxLastIdx; + } + /// Returns true if the recurrence kind is of the form /// select(cmp(),x,y) where one of (x,y) is loop invariant. static bool isSelectCmpRecurrenceKind(RecurKind Kind) { - return Kind == RecurKind::SelectICmp || Kind == RecurKind::SelectFCmp; + return Kind == RecurKind::SelectICmp || Kind == RecurKind::SelectFCmp || + Kind == RecurKind::SelectIVICmp || Kind == RecurKind::SelectIVFCmp || + isMinMaxIdxRecurrenceKind(Kind); } /// Returns the type of the recurrence. This type can be narrower than the @@ -248,6 +300,29 @@ /// recurrence. 
const SmallPtrSet<Instruction *, 8> &getCastInsts() const { return CastInsts; } + PHINode *getUserRecurPhi() const { return UserRecurPhi; } + + void setRecurKind(RecurKind K) { + assert((K != RecurKind::None) && "Unexpected recurrence kind."); + Kind = K; + } + + void setDependMinMaxRecDes(RecurrenceDescriptor *MMRD) { + assert(isMinMaxRecurrenceKind(MMRD->getRecurrenceKind()) && + "DependMinMaxRecDes must be a min/max recurrence."); + DependMinMaxRecDes = MMRD; + } + + RecurrenceDescriptor *getDependMinMaxRecDes() const { + return DependMinMaxRecDes; + } + + bool hasUserRecurrence() const { + return UserRecurPhi && UserRecurKind != RecurKind::None; + } + + bool fixUserRecurrence(RecurrenceDescriptor &UserRedDes); + /// Returns the minimum width used by the recurrence in bits. unsigned getMinWidthCastToRecurrenceTypeInBits() const { return MinWidthCastToRecurrenceType; @@ -300,6 +375,12 @@ SmallPtrSet<Instruction *, 8> CastInsts; // The minimum width used by the recurrence. unsigned MinWidthCastToRecurrenceType; + + PHINode *UserRecurPhi = nullptr; + + RecurKind UserRecurKind = RecurKind::None; + + RecurrenceDescriptor *DependMinMaxRecDes = nullptr; }; /// A struct for saving information about induction variables. diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -359,7 +359,7 @@ /// to select between \p Left and \p Right. Any lane value in \p Left that /// matches 2) will be merged into \p Right. Value *createSelectCmpOp(IRBuilderBase &Builder, Value *StartVal, RecurKind RK, - Value *Left, Value *Right); + Value *Left, Value *Right, Value *SrcCmp = nullptr); /// Returns a Min/Max operation corresponding to MinMaxRecurrenceKind. /// The Builder's fast-math-flags must be set to propagate the expected values. 
@@ -388,11 +388,25 @@ /// Create a target reduction of the given vector \p Src for a reduction of the /// kind RecurKind::SelectICmp or RecurKind::SelectFCmp. The reduction operation /// is described by \p Desc. -Value *createSelectCmpTargetReduction(IRBuilderBase &B, - const TargetTransformInfo *TTI, - Value *Src, - const RecurrenceDescriptor &Desc, - PHINode *OrigPhi); +Value *createInvariantSelectCmpTargetReduction(IRBuilderBase &B, + const TargetTransformInfo *TTI, + Value *Src, + const RecurrenceDescriptor &Desc, + PHINode *OrigPhi); + +Value *createMMISelectCmpTargetReduction(IRBuilderBase &Builder, + const TargetTransformInfo *TTI, + Value *Src, + const RecurrenceDescriptor &Desc, + PHINode *OrigPhi, Value *SrcMask); + +/// Create a target reduction of the given vector \p Src for a reduction of the +/// kind RecurKind::SelectICmp or RecurKind::SelectFCmp. The reduction operation +/// is described by \p Desc. +Value * +createSelectCmpTargetReduction(IRBuilderBase &B, const TargetTransformInfo *TTI, + Value *Src, const RecurrenceDescriptor &Desc, + PHINode *OrigPhi, Value *SrcMask = nullptr); /// Create a generic target reduction using a recurrence descriptor \p Desc /// The target is queried to determine if intrinsics or shuffle sequences are @@ -400,7 +414,8 @@ /// Fast-math-flags are propagated using the RecurrenceDescriptor. Value *createTargetReduction(IRBuilderBase &B, const TargetTransformInfo *TTI, const RecurrenceDescriptor &Desc, Value *Src, - PHINode *OrigPhi = nullptr); + PHINode *OrigPhi = nullptr, + Value *SrcMask = nullptr); /// Create an ordered reduction intrinsic using the given recurrence /// descriptor \p Desc. 
@@ -408,6 +423,11 @@ const RecurrenceDescriptor &Desc, Value *Src, Value *Start); +Value *createSentinelValueHandling(IRBuilderBase &Builder, + const TargetTransformInfo *TTI, + const RecurrenceDescriptor &Desc, + Value *Rdx); + /// Get the intersection (logical and) of all of the potential IR flags /// of each scalar operation (VL) that will be converted into a vector (I). /// If OpValue is non-null, we only consider operations similar to OpValue diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -54,6 +54,10 @@ case RecurKind::UMin: case RecurKind::SelectICmp: case RecurKind::SelectFCmp: + case RecurKind::SelectIVICmp: + case RecurKind::SelectIVFCmp: + case RecurKind::MinMaxFirstIdx: + case RecurKind::MinMaxLastIdx: return true; } return false; @@ -63,6 +67,21 @@ return (Kind != RecurKind::None) && !isIntegerRecurrenceKind(Kind); } +bool RecurrenceDescriptor::fixUserRecurrence(RecurrenceDescriptor &UserRedDes) { + RecurKind UserCurrKind = UserRedDes.getRecurrenceKind(); + assert((UserCurrKind != RecurKind::None) && "Unexpected recurrence kind."); + + if (isMinMaxRecurrenceKind(Kind)) + if (UserCurrKind == RecurKind::SelectIVICmp || + UserCurrKind == RecurKind::SelectIVFCmp) { + UserRedDes.setRecurKind(UserRecurKind); + UserRedDes.setDependMinMaxRecDes(this); + return true; + } + + return false; +} + /// Determines if Phi may have been type-promoted. If Phi has a single user /// that ANDs the Phi with a type mask, return the user. RT is updated to /// account for the narrower bit width represented by the mask, and the AND @@ -247,6 +266,9 @@ // must include the original PHI. 
bool FoundStartPHI = false; + PHINode *UserRecurPHI = nullptr; + RecurKind UserRecurKind = RecurKind::None; + // To recognize min/max patterns formed by a icmp select sequence, we store // the number of instruction we saw from the recognized min/max pattern, // to make sure we only see exactly the two instructions. @@ -375,12 +397,32 @@ // type-promoted). if (Cur != Start) { ReduxDesc = - isRecurrenceInstr(TheLoop, Phi, Cur, Kind, ReduxDesc, FuncFMF); + isRecurrenceInstr(TheLoop, Phi, Cur, Kind, ReduxDesc, FuncFMF, SE); ExactFPMathInst = ExactFPMathInst == nullptr ? ReduxDesc.getExactFPMathInst() : ExactFPMathInst; - if (!ReduxDesc.isRecurrence()) - return false; + if (!ReduxDesc.isRecurrence()) { + if (!ReduxDesc.isCandidateUser()) + return false; + + // TODO: Only allow one user recurrence now. + if (UserRecurPHI) + return false; + + UserRecurPHI = ReduxDesc.getCandUserRecurPhi(); + UserRecurKind = ReduxDesc.getCandUserRecurKind(); + // TODO: Call AddReductionVar here? + + // Fix NumCmpSelectPatternInst for the one-use cmp of this select. + if (auto *SI = dyn_cast<SelectInst>(Cur)) { + auto *CI = dyn_cast<CmpInst>(SI->getCondition()); + if (CI && CI->hasOneUse()) + --NumCmpSelectPatternInst; + } + // Stop visiting the users of current instruction if it contains user + // recurrence. + continue; + } // FIXME: FMF is allowed on phi, but propagation is not handled correctly. 
if (isa<FPMathOperator>(ReduxDesc.getPatternInst()) && !IsAPhi) { FastMathFlags CurFMF = ReduxDesc.getPatternInst()->getFastMathFlags(); @@ -419,10 +461,12 @@ if (IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts)) return false; - if ((isIntMinMaxRecurrenceKind(Kind) || Kind == RecurKind::SelectICmp) && + if ((isIntMinMaxRecurrenceKind(Kind) || Kind == RecurKind::SelectICmp || + Kind == RecurKind::SelectIVICmp) && (isa<ICmpInst>(Cur) || isa<SelectInst>(Cur))) ++NumCmpSelectPatternInst; - if ((isFPMinMaxRecurrenceKind(Kind) || Kind == RecurKind::SelectFCmp) && + if ((isFPMinMaxRecurrenceKind(Kind) || Kind == RecurKind::SelectFCmp || + Kind == RecurKind::SelectIVFCmp) && (isa<FCmpInst>(Cur) || isa<SelectInst>(Cur))) ++NumCmpSelectPatternInst; @@ -488,9 +532,10 @@ ((!isa<FCmpInst>(UI) && !isa<ICmpInst>(UI) && !isa<SelectInst>(UI)) || (!isConditionalRdxPattern(Kind, UI).isRecurrence() && - !isSelectCmpPattern(TheLoop, Phi, UI, IgnoredVal) + !isSelectCmpPattern(TheLoop, Phi, UI, IgnoredVal, SE) .isRecurrence() && - !isMinMaxPattern(UI, Kind, IgnoredVal).isRecurrence()))) + !isMinMaxPattern(UI, Kind, IgnoredVal, TheLoop, Phi, SE) + .isRecurrence()))) return false; // Remember that we completed the cycle. @@ -600,7 +645,8 @@ // Save the description of this reduction variable. RecurrenceDescriptor RD(RdxStart, ExitInstruction, IntermediateStore, Kind, FMF, ExactFPMathInst, RecurrenceType, IsSigned, - IsOrdered, CastInsts, MinWidthCastToRecurrenceType); + IsOrdered, CastInsts, MinWidthCastToRecurrenceType, + UserRecurPHI, UserRecurKind); RedDes = RD; return true; @@ -629,7 +675,8 @@ // value (3 in the example above). RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isSelectCmpPattern(Loop *Loop, PHINode *OrigPhi, - Instruction *I, InstDesc &Prev) { + Instruction *I, InstDesc &Prev, + ScalarEvolution *SE) { // We must handle the select(cmp(),x,y) as a single instruction. Advance to // the select. 
CmpInst::Predicate Pred; @@ -653,19 +700,43 @@ else return InstDesc(false, I); + auto isIncreasingLoopInduction = [&SE, &Loop](Value *V) { + if (!SE) + return false; + + auto *Phi = dyn_cast<PHINode>(V); + if (!Phi) + return false; + + auto LB = Loop::LoopBounds::getBounds(*Loop, *Phi, *SE); + if (!LB) + return false; + + auto Direction = LB->getDirection(); + return Direction == Loop::LoopBounds::Direction::Increasing; + }; + // We are looking for selects of the form: // select(cmp(), phi, loop_invariant) or // select(cmp(), loop_invariant, phi) - if (!Loop->isLoopInvariant(NonPhi)) - return InstDesc(false, I); + if (Loop->isLoopInvariant(NonPhi)) + return InstDesc(I, isa<ICmpInst>(I->getOperand(0)) ? RecurKind::SelectICmp + : RecurKind::SelectFCmp); + // or + // select(cmp(), phi, loop_induction) or + // select(cmp(), loop_induction, phi) + if (isIncreasingLoopInduction(NonPhi)) + return InstDesc(I, isa<ICmpInst>(I->getOperand(0)) + ? RecurKind::SelectIVICmp + : RecurKind::SelectIVFCmp); - return InstDesc(I, isa<ICmpInst>(I->getOperand(0)) ? RecurKind::SelectICmp - : RecurKind::SelectFCmp); + return InstDesc(false, I); } RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isMinMaxPattern(Instruction *I, RecurKind Kind, - const InstDesc &Prev) { + const InstDesc &Prev, Loop *Loop, + PHINode *OrigPhi, ScalarEvolution *SE) { assert((isa<CmpInst>(I) || isa<SelectInst>(I) || isa<CallInst>(I)) && "Expected a cmp or select or call instruction"); if (!isMinMaxRecurrenceKind(Kind)) @@ -685,29 +756,135 @@ m_Value()))) return InstDesc(false, I); + RecurKind MMRK = isMinMaxOperation(I); + if (MMRK != RecurKind::None) + return InstDesc(Kind == MMRK, I); + + if (isa<SelectInst>(I)) + return isMinMaxIdxPattern(Loop, I, OrigPhi, Kind, SE); + + return InstDesc(false, I); +} + +RecurKind RecurrenceDescriptor::isMinMaxOperation(Instruction *I) { // Look for a min/max pattern. 
if (match(I, m_UMin(m_Value(), m_Value()))) - return InstDesc(Kind == RecurKind::UMin, I); + return RecurKind::UMin; if (match(I, m_UMax(m_Value(), m_Value()))) - return InstDesc(Kind == RecurKind::UMax, I); + return RecurKind::UMax; if (match(I, m_SMax(m_Value(), m_Value()))) - return InstDesc(Kind == RecurKind::SMax, I); + return RecurKind::SMax; if (match(I, m_SMin(m_Value(), m_Value()))) - return InstDesc(Kind == RecurKind::SMin, I); + return RecurKind::SMin; if (match(I, m_OrdFMin(m_Value(), m_Value()))) - return InstDesc(Kind == RecurKind::FMin, I); + return RecurKind::FMin; if (match(I, m_OrdFMax(m_Value(), m_Value()))) - return InstDesc(Kind == RecurKind::FMax, I); + return RecurKind::FMax; if (match(I, m_UnordFMin(m_Value(), m_Value()))) - return InstDesc(Kind == RecurKind::FMin, I); + return RecurKind::FMin; if (match(I, m_UnordFMax(m_Value(), m_Value()))) - return InstDesc(Kind == RecurKind::FMax, I); + return RecurKind::FMax; if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value()))) - return InstDesc(Kind == RecurKind::FMin, I); + return RecurKind::FMin; if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value()))) - return InstDesc(Kind == RecurKind::FMax, I); + return RecurKind::FMax; - return InstDesc(false, I); + return RecurKind::None; +} + +RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isMinMaxIdxPattern( + Loop *Loop, Instruction *I, PHINode *MinMaxPhi, RecurKind MinMaxKind, + ScalarEvolution *SE) { + assert(isa<SelectInst>(I) && "Expected a select instruction"); + // TODO: FP MinMax + if (!isIntMinMaxRecurrenceKind(MinMaxKind)) + return InstDesc(false, I); + + // Requires SCEV to check the index part + if (!SE) { + LLVM_DEBUG(dbgs() << "MinMaxIdx patterns are not recognized without " + << "Scalar Evolution Analysis\n"); + return InstDesc(false, I); + } + + // Check the index select + auto *SI = cast<SelectInst>(I); + auto *CI = cast<CmpInst>(SI->getCondition()); + Value *LHS = CI->getOperand(0), *RHS = 
CI->getOperand(1); + + // %cmp = icmp pred, %mmphi, %0 + // %select = select %cmp, %update, %idxphi + // Check if cmp used min/max phi + bool IsLHSPhi; + if (MinMaxPhi == dyn_cast<PHINode>(LHS)) + IsLHSPhi = true; + else if (MinMaxPhi == dyn_cast<PHINode>(RHS)) + IsLHSPhi = false; + else + return InstDesc(false, I); + + // Normalize the predicate as if %mmphi were the LHS (swap the operands), and + // get which side of the select updates the index. + CmpInst::Predicate NormPred = + IsLHSPhi ? CI->getPredicate() : CI->getSwappedPredicate(); + bool UpdateSide; + RecurKind ExpectedIdxRK; + switch (NormPred) { + case CmpInst::ICMP_SLT: + case CmpInst::ICMP_ULT: + // %mmphi < %0 + UpdateSide = isMaxRecurrenceKind(MinMaxKind); + ExpectedIdxRK = isMaxRecurrenceKind(MinMaxKind) ? RecurKind::MinMaxFirstIdx + : RecurKind::MinMaxLastIdx; + break; + case CmpInst::ICMP_SLE: + case CmpInst::ICMP_ULE: + // %mmphi <= %0 + UpdateSide = isMaxRecurrenceKind(MinMaxKind); + ExpectedIdxRK = isMaxRecurrenceKind(MinMaxKind) ? RecurKind::MinMaxLastIdx + : RecurKind::MinMaxFirstIdx; + break; + case CmpInst::ICMP_SGT: + case CmpInst::ICMP_UGT: + // %mmphi > %0 + UpdateSide = !isMaxRecurrenceKind(MinMaxKind); + ExpectedIdxRK = isMaxRecurrenceKind(MinMaxKind) ? RecurKind::MinMaxLastIdx + : RecurKind::MinMaxFirstIdx; + break; + case CmpInst::ICMP_SGE: + case CmpInst::ICMP_UGE: + // %mmphi >= %0 + UpdateSide = !isMaxRecurrenceKind(MinMaxKind); + ExpectedIdxRK = isMaxRecurrenceKind(MinMaxKind) ? RecurKind::MinMaxFirstIdx + : RecurKind::MinMaxLastIdx; + break; + default: + return InstDesc(false, I); + } + + // Get the reduction phi of index select + Value *IdxUpdateV = UpdateSide ? SI->getTrueValue() : SI->getFalseValue(); + Value *IdxReduxV = UpdateSide ? SI->getFalseValue() : SI->getTrueValue(); + // Handle the operand of index select may have been casted. 
+ if (auto *Cast = dyn_cast<CastInst>(IdxUpdateV)) + IdxUpdateV = Cast->getOperand(0); + + auto *IdxUpdatePhi = dyn_cast<PHINode>(IdxUpdateV); + auto *IdxReduxPhi = dyn_cast<PHINode>(IdxReduxV); + if (!IdxUpdatePhi || !IdxReduxPhi) + return InstDesc(false, I); + + // Check update side is a loop induction variable + InductionDescriptor ID; + if (!InductionDescriptor::isInductionPHI(IdxUpdatePhi, Loop, SE, ID)) + return InstDesc(false, I); + + // The reduction phi of index select and reduction phi of min/max must not be + // the same + if (IdxReduxPhi == MinMaxPhi) + return InstDesc(false, I); + + return InstDesc(false, I, IdxReduxPhi, ExpectedIdxRK); } /// Returns true if the select instruction has users in the compare-and-add @@ -762,10 +939,9 @@ return InstDesc(true, SI); } -RecurrenceDescriptor::InstDesc -RecurrenceDescriptor::isRecurrenceInstr(Loop *L, PHINode *OrigPhi, - Instruction *I, RecurKind Kind, - InstDesc &Prev, FastMathFlags FuncFMF) { +RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr( + Loop *L, PHINode *OrigPhi, Instruction *I, RecurKind Kind, InstDesc &Prev, + FastMathFlags FuncFMF, ScalarEvolution *SE) { assert(Prev.getRecKind() == RecurKind::None || Prev.getRecKind() == Kind); switch (I->getOpcode()) { default: @@ -800,13 +976,13 @@ case Instruction::ICmp: case Instruction::Call: if (isSelectCmpRecurrenceKind(Kind)) - return isSelectCmpPattern(L, OrigPhi, I, Prev); + return isSelectCmpPattern(L, OrigPhi, I, Prev, SE); if (isIntMinMaxRecurrenceKind(Kind) || (((FuncFMF.noNaNs() && FuncFMF.noSignedZeros()) || (isa<FPMathOperator>(I) && I->hasNoNaNs() && I->hasNoSignedZeros())) && isFPMinMaxRecurrenceKind(Kind))) - return isMinMaxPattern(I, Kind, Prev); + return isMinMaxPattern(I, Kind, Prev, L, OrigPhi, SE); else if (isFMulAddIntrinsic(I)) return InstDesc(Kind == RecurKind::FMulAdd, I, I->hasAllowReassoc() ? 
nullptr : I); @@ -1128,6 +1304,12 @@ case RecurKind::SelectFCmp: return getRecurrenceStartValue(); break; + case RecurKind::SelectIVICmp: + case RecurKind::SelectIVFCmp: + case RecurKind::MinMaxFirstIdx: + case RecurKind::MinMaxLastIdx: + // FIXME: SMax or UMax, I'm not sure which one is correct. + return getRecurrenceIdentity(RecurKind::SMax, Tp, FMF); default: llvm_unreachable("Unknown recurrence kind"); } @@ -1155,10 +1337,15 @@ case RecurKind::UMax: case RecurKind::UMin: case RecurKind::SelectICmp: + case RecurKind::SelectIVICmp: + // TODO: maybe new FMinMaxFirstIdx/ FMinMaxLastIdx + case RecurKind::MinMaxFirstIdx: + case RecurKind::MinMaxLastIdx: return Instruction::ICmp; case RecurKind::FMax: case RecurKind::FMin: case RecurKind::SelectFCmp: + case RecurKind::SelectIVFCmp: return Instruction::FCmp; default: llvm_unreachable("Unknown recurrence operation"); diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -900,12 +900,37 @@ } Value *llvm::createSelectCmpOp(IRBuilderBase &Builder, Value *StartVal, - RecurKind RK, Value *Left, Value *Right) { - if (auto VTy = dyn_cast<VectorType>(Left->getType())) - StartVal = Builder.CreateVectorSplat(VTy->getElementCount(), StartVal); - Value *Cmp = - Builder.CreateCmp(CmpInst::ICMP_NE, Left, StartVal, "rdx.select.cmp"); - return Builder.CreateSelect(Cmp, Left, Right, "rdx.select"); + RecurKind RK, Value *Left, Value *Right, + Value *SrcCmp) { + switch (RK) { + case RecurKind::SelectICmp: + case RecurKind::SelectFCmp: { + if (auto VTy = dyn_cast<VectorType>(Left->getType())) + StartVal = Builder.CreateVectorSplat(VTy->getElementCount(), StartVal); + Value *Cmp = + Builder.CreateCmp(CmpInst::ICMP_NE, Left, StartVal, "rdx.select.cmp"); + return Builder.CreateSelect(Cmp, Left, Right, "rdx.select"); + } + case RecurKind::SelectIVICmp: + case RecurKind::SelectIVFCmp: + // TODO: SMax or UMax? 
+ return createMinMaxOp(Builder, RecurKind::SMax, Left, Right); + case RecurKind::MinMaxFirstIdx: { + assert((SrcCmp && isa<CmpInst>(SrcCmp)) && + "SrcCmp should not be nullptr when MinMaxFirstIdx recurrence"); + auto *SrcCI = cast<CmpInst>(SrcCmp); + CmpInst::Predicate Pred = SrcCI->getNonStrictPredicate(); + Value *Cmp = Builder.CreateCmp(Pred, SrcCI->getOperand(0), + SrcCI->getOperand(1), "rdx.select.cmp"); + return Builder.CreateSelect(Cmp, Left, Right, "rdx.select"); + } + case RecurKind::MinMaxLastIdx: + assert((SrcCmp && isa<CmpInst>(SrcCmp)) && + "SrcCmp should not be nullptr when MinMaxLastIdx recurrence"); + return Builder.CreateSelect(SrcCmp, Left, Right, "rdx.select"); + default: + llvm_unreachable("Unknown SelectCmp recurrence kind"); + } } Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, @@ -982,13 +1007,11 @@ return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); } -Value *llvm::createSelectCmpTargetReduction(IRBuilderBase &Builder, - const TargetTransformInfo *TTI, - Value *Src, - const RecurrenceDescriptor &Desc, - PHINode *OrigPhi) { - assert(RecurrenceDescriptor::isSelectCmpRecurrenceKind( - Desc.getRecurrenceKind()) && +Value *llvm::createInvariantSelectCmpTargetReduction( + IRBuilderBase &Builder, const TargetTransformInfo *TTI, Value *Src, + const RecurrenceDescriptor &Desc, PHINode *OrigPhi) { + assert((Desc.getRecurrenceKind() == RecurKind::SelectICmp || + Desc.getRecurrenceKind() == RecurKind::SelectFCmp) && "Unexpected reduction kind"); Value *InitVal = Desc.getRecurrenceStartValue(); Value *NewVal = nullptr; @@ -1022,6 +1045,58 @@ return Builder.CreateSelect(Cmp, NewVal, InitVal, "rdx.select"); } +Value *llvm::createMMISelectCmpTargetReduction( + IRBuilderBase &Builder, const TargetTransformInfo *TTI, Value *Src, + const RecurrenceDescriptor &Desc, PHINode *OrigPhi, Value *SrcMask) { + assert(RecurrenceDescriptor::isMinMaxIdxRecurrenceKind( + Desc.getRecurrenceKind()) && + "Unexpected reduction 
kind"); + RecurKind Kind = Desc.getRecurrenceKind(); + // FIXME: UMax/SMax or UMin/UMax? + RecurKind RdxExtractK = + Kind == RecurKind::MinMaxFirstIdx ? RecurKind::SMin : RecurKind::SMax; + + assert(SrcMask && "MinMaxIdx recurrence requests mask"); + // TODO: If vp reduction intrinsic is supported, there is no need to generate + // additional select here. + auto *SrcVecEltTy = cast<VectorType>(Src->getType())->getElementType(); + Value *RdxOpIden = Desc.getRecurrenceIdentity(RdxExtractK, SrcVecEltTy, + Desc.getFastMathFlags()); + ElementCount EC = cast<VectorType>(Src->getType())->getElementCount(); + RdxOpIden = Builder.CreateVectorSplat(EC, RdxOpIden); + Value *NewVal = Builder.CreateSelect(SrcMask, Src, RdxOpIden, "mask.select"); + + return createSimpleTargetReduction(Builder, TTI, NewVal, RdxExtractK); +} + +Value *llvm::createSelectCmpTargetReduction(IRBuilderBase &Builder, + const TargetTransformInfo *TTI, + Value *Src, + const RecurrenceDescriptor &Desc, + PHINode *OrigPhi, Value *SrcMask) { + assert(RecurrenceDescriptor::isSelectCmpRecurrenceKind( + Desc.getRecurrenceKind()) && + "Unexpected reduction kind"); + RecurKind RdxKind = Desc.getRecurrenceKind(); + switch (RdxKind) { + case RecurKind::SelectICmp: + case RecurKind::SelectFCmp: + return createInvariantSelectCmpTargetReduction(Builder, TTI, Src, Desc, + OrigPhi); + case RecurKind::SelectIVICmp: + case RecurKind::SelectIVFCmp: + // FIXME: SMax or UMax? 
+ // TODO: Decreasing induction need fix here + return Builder.CreateIntMaxReduce(Src, true); + case RecurKind::MinMaxFirstIdx: + case RecurKind::MinMaxLastIdx: + return createMMISelectCmpTargetReduction(Builder, TTI, Src, Desc, OrigPhi, + SrcMask); + default: + llvm_unreachable("Unknown SelectCmp recurrence kind"); + } +} + Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, const TargetTransformInfo *TTI, Value *Src, RecurKind RdxKind) { @@ -1063,7 +1138,7 @@ Value *llvm::createTargetReduction(IRBuilderBase &B, const TargetTransformInfo *TTI, const RecurrenceDescriptor &Desc, Value *Src, - PHINode *OrigPhi) { + PHINode *OrigPhi, Value *SrcMask) { // TODO: Support in-order reductions based on the recurrence descriptor. // All ops in the reduction inherit fast-math-flags from the recurrence // descriptor. @@ -1072,7 +1147,7 @@ RecurKind RK = Desc.getRecurrenceKind(); if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) - return createSelectCmpTargetReduction(B, TTI, Src, Desc, OrigPhi); + return createSelectCmpTargetReduction(B, TTI, Src, Desc, OrigPhi, SrcMask); return createSimpleTargetReduction(B, TTI, Src, RK); } @@ -1089,6 +1164,17 @@ return B.CreateFAddReduce(Start, Src); } +Value *llvm::createSentinelValueHandling(IRBuilderBase &Builder, + const TargetTransformInfo *TTI, + const RecurrenceDescriptor &Desc, + Value *Rdx) { + Value *InitVal = Desc.getRecurrenceStartValue(); + Value *Iden = Desc.getRecurrenceIdentity( + Desc.getRecurrenceKind(), Rdx->getType(), Desc.getFastMathFlags()); + Value *Cmp = Builder.CreateCmp(CmpInst::ICMP_NE, Rdx, Iden, "rdx.select.cmp"); + return Builder.CreateSelect(Cmp, Rdx, InitVal, "rdx.select"); +} + void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL, Value *OpValue, bool IncludeWrapFlags) { auto *VecOp = dyn_cast<Instruction>(I); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp --- 
a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -878,6 +878,21 @@ } // next instr. } + // Second pass: confirm the reductions that recorded a user recurrence. + for (auto &R : Reductions) { + RecurrenceDescriptor &RedDes = R.second; + if (!RedDes.hasUserRecurrence()) + continue; + + PHINode *UserPhi = RedDes.getUserRecurPhi(); + if (!isReductionVariable(UserPhi)) + return false; + + RecurrenceDescriptor &UserRedDes = Reductions.find(UserPhi)->second; + if (!RedDes.fixUserRecurrence(UserRedDes)) + return false; + } + if (!PrimaryInduction) { if (Inductions.empty()) { reportVectorizationFailure("Did not find one integer induction var", diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -535,6 +535,11 @@ // generated by fixReduction. PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc); + // Returns the recurrence mask (mask.cmp) for a recurrence as generated by + // fixReduction. + std::pair<Value *, VectorParts> + getDependRecurrenceMask(const RecurrenceDescriptor &RdxDesc); + /// Create a new phi node for the induction variable \p OrigPhi to resume /// iteration count in the scalar epilogue, from where the vectorized loop /// left off. In cases where the loop skeleton is more complicated (eg. @@ -744,6 +749,12 @@ // correct start value of reduction PHIs when vectorizing the epilogue. SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4> ReductionResumeValues; + + // Holds the masks for recurrences in the loops, be used for reduction when + // there is a reduction that depends on the recurrence. 
+ SmallMapVector<const RecurrenceDescriptor *, std::pair<Value *, VectorParts>, + 4> + DependRecurrenceMasks; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -1120,6 +1131,15 @@ return It->second; } +std::pair<Value *, InnerLoopVectorizer::VectorParts> +InnerLoopVectorizer::getDependRecurrenceMask( + const RecurrenceDescriptor &RdxDesc) { + auto It = DependRecurrenceMasks.find(&RdxDesc); + assert(It != DependRecurrenceMasks.end() && + "Expected to find a dependence mask for the recurrence."); + return It->second; +} + namespace llvm { // Loop vectorization cost-model hints how the scalar epilogue loop should be @@ -3745,10 +3765,24 @@ // the incoming edges. VPBasicBlock *Header = State.Plan->getVectorLoopRegion()->getEntryBasicBlock(); - for (VPRecipeBase &R : Header->phis()) { - if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) + // FIXME: Reconsider std::queue; re-queuing may spin if a mask never appears. + std::queue<VPRecipeBase *> Worklist; + for (VPRecipeBase &R : Header->phis()) + Worklist.push(&R); + + while (!Worklist.empty()) { + VPRecipeBase &R = *(Worklist.front()); + Worklist.pop(); + if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { + const RecurrenceDescriptor &RecDesc = + ReductionPhi->getRecurrenceDescriptor(); + RecurrenceDescriptor *DependRecDesc = RecDesc.getDependMinMaxRecDes(); + if (DependRecDesc && !DependRecurrenceMasks.count(DependRecDesc)) { + Worklist.push(&R); + continue; + } fixReduction(ReductionPhi, State); - else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) + } else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) fixFixedOrderRecurrence(FOR, State); } } @@ -3966,6 +4000,19 @@ Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); unsigned Op = RecurrenceDescriptor::getOpcode(RK); + // Get the reduction mask if the reduction depends on another one.
+ RecurrenceDescriptor *DependDesc = RdxDesc.getDependMinMaxRecDes(); + Value *DependRdxMask = nullptr; + VectorParts DependPartMasks; + if (DependDesc) { + Builder.SetInsertPoint(&*LoopMiddleBlock->getTerminator()); + std::tie(DependRdxMask, DependPartMasks) = + getDependRecurrenceMask(*DependDesc); + } + + Value *NewRdxMask = nullptr; + VectorParts NewPartMasks(UF); + // The middle block terminator has already been assigned a DebugLoc here (the // OrigLoop's single latch terminator). We want the whole middle block to // appear to execute on this line because: (a) it is all compiler generated, @@ -3982,30 +4029,61 @@ Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); for (unsigned Part = 1; Part < UF; ++Part) { Value *RdxPart = State.get(LoopExitInstDef, Part); + Value *PartMask = DependDesc ? DependPartMasks[Part] : nullptr; if (Op != Instruction::ICmp && Op != Instruction::FCmp) { ReducedPartRdx = Builder.CreateBinOp( (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, - ReducedPartRdx, RdxPart); - else + ReducedPartRdx, RdxPart, PartMask); + else { ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); + // Keep the part mask on demand. + if (RdxDesc.hasUserRecurrence()) { + // A non-select result is a bug: assert via cast<>, never null-deref. + auto *SI = cast<SelectInst>(ReducedPartRdx); + NewPartMasks[Part] = SI->getCondition(); + } + } } } // Create the reduction after the loop. Note that inloop reductions create the // target reduction in the loop using a Reduction recipe.
if (VF.isVector() && !PhiR->isInLoop()) { - ReducedPartRdx = - createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); + Value *ReducedPart = ReducedPartRdx; + ReducedPartRdx = createTargetReduction( + Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi, DependRdxMask); // If the reduction can be performed in a smaller type, we need to extend // the reduction to the wider type before we branch to the original loop. if (PhiTy != RdxDesc.getRecurrenceType()) ReducedPartRdx = RdxDesc.isSigned() ? Builder.CreateSExt(ReducedPartRdx, PhiTy) : Builder.CreateZExt(ReducedPartRdx, PhiTy); + + // Create the dependent recurrence mask on demand. + if (RdxDesc.hasUserRecurrence()) { + ElementCount EC = + cast<VectorType>(ReducedPart->getType())->getElementCount(); + Value *RdxSplat = Builder.CreateVectorSplat(EC, ReducedPartRdx); + // FIXME: Not sure whether FCMP_OEQ is the right predicate here. + CmpInst::Predicate MaskPred = + (ReducedPartRdx->getType()->isFloatingPointTy()) ? CmpInst::FCMP_OEQ + : CmpInst::ICMP_EQ; + NewRdxMask = + Builder.CreateCmp(MaskPred, RdxSplat, ReducedPart, "mask.cmp"); + } } + if (RecurrenceDescriptor::isMinMaxIdxRecurrenceKind(RK) || + (RK == RecurKind::SelectIVICmp) || (RK == RecurKind::SelectIVFCmp)) + ReducedPartRdx = + createSentinelValueHandling(Builder, TTI, RdxDesc, ReducedPartRdx); + + // Set the recurrence mask for this reduction on demand.
+ if (RdxDesc.hasUserRecurrence()) + DependRecurrenceMasks.insert({&RdxDesc, {NewRdxMask, NewPartMasks}}); + PHINode *ResumePhi = dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1217,7 +1217,7 @@ Value *Iden = nullptr; RecurKind RK = RdxDesc.getRecurrenceKind(); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) || - RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) { + (RK == RecurKind::SelectICmp || RK == RecurKind::SelectFCmp)) { // MinMax reduction have the start value as their identify. if (ScalarPHI) { Iden = StartV; @@ -1227,6 +1227,16 @@ StartV = Iden = Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident"); } + } else if (RecurrenceDescriptor::isMinMaxIdxRecurrenceKind(RK) || + (RK == RecurKind::SelectIVICmp || RK == RecurKind::SelectIVFCmp)) { + StartV = Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(), + RdxDesc.getFastMathFlags()); + + if (!ScalarPHI) { + IRBuilderBase::InsertPointGuard IPBuilder(Builder); + Builder.SetInsertPoint(VectorPH->getTerminator()); + StartV = Iden = Builder.CreateVectorSplat(State.VF, Iden); + } } else { Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(), RdxDesc.getFastMathFlags()); diff --git a/llvm/test/Transforms/LoopVectorize/select-min-index.ll b/llvm/test/Transforms/LoopVectorize/select-min-index.ll --- a/llvm/test/Transforms/LoopVectorize/select-min-index.ll +++ b/llvm/test/Transforms/LoopVectorize/select-min-index.ll @@ -1,6 +1,7 @@ -; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s -; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S %s | FileCheck %s -; RUN: opt -passes=loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -S %s | FileCheck %s +; NOTE: 
Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s --check-prefix=CHECK-VF4IC1 +; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S %s | FileCheck %s --check-prefix=CHECK-VF4IC2 +; RUN: opt -passes=loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -S %s | FileCheck %s --check-prefix=CHECK-VF1IC2 ; Test cases for selecting the index with the minimum value. @@ -30,8 +31,187 @@ } define i64 @test_vectorize_select_umin_idx_all_exit_inst(ptr %src, ptr %umin) { -; CHECK-LABEL: @test_vectorize_select_umin_idx_all_exit_inst( -; CHECK-NOT: vector.body: +; CHECK-VF4IC1-LABEL: @test_vectorize_select_umin_idx_all_exit_inst( +; CHECK-VF4IC1-NEXT: entry: +; CHECK-VF4IC1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4IC1: vector.ph: +; CHECK-VF4IC1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4IC1: vector.body: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[SRC:%.*]], i64 [[TMP0]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp ugt <4 x i64> [[VEC_PHI1]], [[WIDE_LOAD]] +; 
CHECK-VF4IC1-NEXT: [[TMP4]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[VEC_PHI1]], <4 x i64> [[WIDE_LOAD]]) +; CHECK-VF4IC1-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF4IC1: middle.block: +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> [[TMP4]]) +; CHECK-VF4IC1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP7]], i64 0 +; CHECK-VF4IC1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[MASK_CMP:%.*]] = icmp eq <4 x i64> [[DOTSPLAT]], [[TMP4]] +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, 0 +; CHECK-VF4IC1-NEXT: [[MASK_SELECT:%.*]] = select <4 x i1> [[MASK_CMP]], <4 x i64> [[TMP5]], <4 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807> +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[MASK_SELECT]]) +; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP8]], -9223372036854775808 +; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP8]], i64 0 +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4IC1: scalar.ph: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: br label 
[[LOOP:%.*]] +; CHECK-VF4IC1: loop: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF4IC1-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) +; CHECK-VF4IC1-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF4IC1: exit: +; CHECK-VF4IC1-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], [[LOOP]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: [[RES_UMIN:%.*]] = phi i64 [ [[MIN_VAL_NEXT]], [[LOOP]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: store i64 [[RES_UMIN]], ptr [[UMIN:%.*]], align 4 +; CHECK-VF4IC1-NEXT: ret i64 [[RES]] +; +; CHECK-VF4IC2-LABEL: @test_vectorize_select_umin_idx_all_exit_inst( +; CHECK-VF4IC2-NEXT: entry: +; CHECK-VF4IC2-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4IC2: vector.ph: +; CHECK-VF4IC2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4IC2: vector.body: +; CHECK-VF4IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC2-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC2-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ <i64 
-9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC2-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC2-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC2-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC2-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-VF4IC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-VF4IC2-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[SRC:%.*]], i64 [[TMP0]] +; CHECK-VF4IC2-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[TMP1]] +; CHECK-VF4IC2-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0 +; CHECK-VF4IC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 4 +; CHECK-VF4IC2-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP2]], i32 4 +; CHECK-VF4IC2-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i64>, ptr [[TMP5]], align 4 +; CHECK-VF4IC2-NEXT: [[TMP6:%.*]] = icmp ugt <4 x i64> [[VEC_PHI3]], [[WIDE_LOAD]] +; CHECK-VF4IC2-NEXT: [[TMP7:%.*]] = icmp ugt <4 x i64> [[VEC_PHI4]], [[WIDE_LOAD5]] +; CHECK-VF4IC2-NEXT: [[TMP8]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[VEC_PHI3]], <4 x i64> [[WIDE_LOAD]]) +; CHECK-VF4IC2-NEXT: [[TMP9]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[VEC_PHI4]], <4 x i64> [[WIDE_LOAD5]]) +; CHECK-VF4IC2-NEXT: [[TMP10]] = select <4 x i1> [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC2-NEXT: [[TMP11]] = select <4 x i1> [[TMP7]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI2]] +; CHECK-VF4IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; 
CHECK-VF4IC2-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-VF4IC2-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-VF4IC2-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF4IC2: middle.block: +; CHECK-VF4IC2-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <4 x i64> [[TMP8]], [[TMP9]] +; CHECK-VF4IC2-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i64> [[TMP8]], <4 x i64> [[TMP9]] +; CHECK-VF4IC2-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> [[RDX_MINMAX_SELECT]]) +; CHECK-VF4IC2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP13]], i64 0 +; CHECK-VF4IC2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC2-NEXT: [[MASK_CMP:%.*]] = icmp eq <4 x i64> [[DOTSPLAT]], [[RDX_MINMAX_SELECT]] +; CHECK-VF4IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, 0 +; CHECK-VF4IC2-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ule <4 x i64> [[TMP8]], [[TMP9]] +; CHECK-VF4IC2-NEXT: [[RDX_SELECT:%.*]] = select <4 x i1> [[RDX_SELECT_CMP]], <4 x i64> [[TMP10]], <4 x i64> [[TMP11]] +; CHECK-VF4IC2-NEXT: [[MASK_SELECT:%.*]] = select <4 x i1> [[MASK_CMP]], <4 x i64> [[RDX_SELECT]], <4 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807> +; CHECK-VF4IC2-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[MASK_SELECT]]) +; CHECK-VF4IC2-NEXT: [[RDX_SELECT_CMP6:%.*]] = icmp ne i64 [[TMP14]], -9223372036854775808 +; CHECK-VF4IC2-NEXT: [[RDX_SELECT7:%.*]] = select i1 [[RDX_SELECT_CMP6]], i64 [[TMP14]], i64 0 +; CHECK-VF4IC2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4IC2: scalar.ph: +; CHECK-VF4IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], 
[ [[TMP13]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC2-NEXT: [[BC_MERGE_RDX8:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[RDX_SELECT7]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC2-NEXT: br label [[LOOP:%.*]] +; CHECK-VF4IC2: loop: +; CHECK-VF4IC2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX8]], [[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF4IC2-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF4IC2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF4IC2-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) +; CHECK-VF4IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF4IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-VF4IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF4IC2: exit: +; CHECK-VF4IC2-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], [[LOOP]] ], [ [[RDX_SELECT7]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC2-NEXT: [[RES_UMIN:%.*]] = phi i64 [ [[MIN_VAL_NEXT]], [[LOOP]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC2-NEXT: store i64 [[RES_UMIN]], ptr [[UMIN:%.*]], align 4 +; CHECK-VF4IC2-NEXT: ret i64 [[RES]] +; +; CHECK-VF1IC2-LABEL: @test_vectorize_select_umin_idx_all_exit_inst( +; CHECK-VF1IC2-NEXT: entry: +; CHECK-VF1IC2-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF1IC2: vector.ph: +; CHECK-VF1IC2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF1IC2: vector.body: +; CHECK-VF1IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC2-NEXT: [[VEC_PHI:%.*]] 
= phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC2-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC2-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC2-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1IC2-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[SRC:%.*]], i64 [[TMP0]] +; CHECK-VF1IC2-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[TMP1]] +; CHECK-VF1IC2-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP2]], align 4 +; CHECK-VF1IC2-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 4 +; CHECK-VF1IC2-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[VEC_PHI2]], [[TMP4]] +; CHECK-VF1IC2-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[VEC_PHI3]], [[TMP5]] +; CHECK-VF1IC2-NEXT: [[TMP8]] = tail call i64 @llvm.umin.i64(i64 [[VEC_PHI2]], i64 [[TMP4]]) +; CHECK-VF1IC2-NEXT: [[TMP9]] = tail call i64 @llvm.umin.i64(i64 [[VEC_PHI3]], i64 [[TMP5]]) +; CHECK-VF1IC2-NEXT: [[TMP10]] = select i1 [[TMP6]], i64 [[TMP0]], i64 [[VEC_PHI]] +; CHECK-VF1IC2-NEXT: [[TMP11]] = select i1 [[TMP7]], i64 [[TMP1]], i64 [[VEC_PHI1]] +; CHECK-VF1IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-VF1IC2-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-VF1IC2-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF1IC2: middle.block: +; CHECK-VF1IC2-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult i64 [[TMP8]], [[TMP9]] +; CHECK-VF1IC2-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select i1 [[RDX_MINMAX_CMP]], i64 [[TMP8]], i64 [[TMP9]] +; CHECK-VF1IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, 0 +; CHECK-VF1IC2-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ule i64 [[TMP8]], [[TMP9]] +; CHECK-VF1IC2-NEXT: [[RDX_SELECT:%.*]] = select 
i1 [[RDX_SELECT_CMP]], i64 [[TMP10]], i64 [[TMP11]] +; CHECK-VF1IC2-NEXT: [[RDX_SELECT_CMP4:%.*]] = icmp ne i64 [[RDX_SELECT]], -9223372036854775808 +; CHECK-VF1IC2-NEXT: [[RDX_SELECT5:%.*]] = select i1 [[RDX_SELECT_CMP4]], i64 [[RDX_SELECT]], i64 0 +; CHECK-VF1IC2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF1IC2: scalar.ph: +; CHECK-VF1IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF1IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC2-NEXT: [[BC_MERGE_RDX6:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[RDX_SELECT5]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC2-NEXT: br label [[LOOP:%.*]] +; CHECK-VF1IC2: loop: +; CHECK-VF1IC2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX6]], [[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF1IC2-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF1IC2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF1IC2-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) +; CHECK-VF1IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF1IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-VF1IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF1IC2: exit: +; CHECK-VF1IC2-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], [[LOOP]] ], [ [[RDX_SELECT5]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC2-NEXT: [[RES_UMIN:%.*]] = phi i64 [ [[MIN_VAL_NEXT]], [[LOOP]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] 
] +; CHECK-VF1IC2-NEXT: store i64 [[RES_UMIN]], ptr [[UMIN:%.*]], align 4 +; CHECK-VF1IC2-NEXT: ret i64 [[RES]] ; entry: br label %loop @@ -82,8 +262,178 @@ } define i64 @test_not_vectorize_select_no_min_reduction(ptr %src) { -; CHECK-LABEL: @test_not_vectorize_select_no_min_reduction( -; CHECK-NOT: vector.body: +; CHECK-VF4IC1-LABEL: @test_not_vectorize_select_no_min_reduction( +; CHECK-VF4IC1-NEXT: entry: +; CHECK-VF4IC1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4IC1: vector.ph: +; CHECK-VF4IC1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4IC1: vector.body: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[SRC:%.*]], i64 [[TMP0]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP3]] = add <4 x i64> [[WIDE_LOAD]], <i64 1, i64 1, i64 1, i64 1> +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp ugt <4 x i64> [[TMP4]], [[WIDE_LOAD]] +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[WIDE_LOAD]]) +; CHECK-VF4IC1-NEXT: [[TMP7]] = select <4 x i1> 
[[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-VF4IC1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-VF4IC1: middle.block: +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP7]]) +; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP9]], -9223372036854775808 +; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP9]], i64 0 +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, 0 +; CHECK-VF4IC1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; CHECK-VF4IC1-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4IC1: scalar.ph: +; CHECK-VF4IC1-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: br label [[LOOP:%.*]] +; CHECK-VF4IC1: loop: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF4IC1-NEXT: 
[[CMP:%.*]] = icmp ugt i64 [[SCALAR_RECUR]], [[L]] +; CHECK-VF4IC1-NEXT: [[MIN_VAL_NEXT]] = add i64 [[L]], 1 +; CHECK-VF4IC1-NEXT: [[FOO:%.*]] = call i64 @llvm.umin.i64(i64 [[SCALAR_RECUR]], i64 [[L]]) +; CHECK-VF4IC1-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF4IC1: exit: +; CHECK-VF4IC1-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], [[LOOP]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: ret i64 [[RES]] +; +; CHECK-VF4IC2-LABEL: @test_not_vectorize_select_no_min_reduction( +; CHECK-VF4IC2-NEXT: entry: +; CHECK-VF4IC2-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4IC2: vector.ph: +; CHECK-VF4IC2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4IC2: vector.body: +; CHECK-VF4IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC2-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC2-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC2-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC2-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC2-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-VF4IC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; 
CHECK-VF4IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-VF4IC2-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[SRC:%.*]], i64 [[TMP0]] +; CHECK-VF4IC2-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[TMP1]] +; CHECK-VF4IC2-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0 +; CHECK-VF4IC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 4 +; CHECK-VF4IC2-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP2]], i32 4 +; CHECK-VF4IC2-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP5]], align 4 +; CHECK-VF4IC2-NEXT: [[TMP6:%.*]] = add <4 x i64> [[WIDE_LOAD]], <i64 1, i64 1, i64 1, i64 1> +; CHECK-VF4IC2-NEXT: [[TMP7]] = add <4 x i64> [[WIDE_LOAD3]], <i64 1, i64 1, i64 1, i64 1> +; CHECK-VF4IC2-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP6]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; CHECK-VF4IC2-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP7]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; CHECK-VF4IC2-NEXT: [[TMP10:%.*]] = icmp ugt <4 x i64> [[TMP8]], [[WIDE_LOAD]] +; CHECK-VF4IC2-NEXT: [[TMP11:%.*]] = icmp ugt <4 x i64> [[TMP9]], [[WIDE_LOAD3]] +; CHECK-VF4IC2-NEXT: [[TMP12:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD]]) +; CHECK-VF4IC2-NEXT: [[TMP13:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[TMP9]], <4 x i64> [[WIDE_LOAD3]]) +; CHECK-VF4IC2-NEXT: [[TMP14]] = select <4 x i1> [[TMP10]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC2-NEXT: [[TMP15]] = select <4 x i1> [[TMP11]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI2]] +; CHECK-VF4IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-VF4IC2-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-VF4IC2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-VF4IC2-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-VF4IC2: middle.block: +; 
CHECK-VF4IC2-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i64> [[TMP14]], [[TMP15]] +; CHECK-VF4IC2-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i64> [[TMP14]], <4 x i64> [[TMP15]] +; CHECK-VF4IC2-NEXT: [[TMP17:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX_SELECT]]) +; CHECK-VF4IC2-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP17]], -9223372036854775808 +; CHECK-VF4IC2-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP17]], i64 0 +; CHECK-VF4IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, 0 +; CHECK-VF4IC2-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 +; CHECK-VF4IC2-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 +; CHECK-VF4IC2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4IC2: scalar.ph: +; CHECK-VF4IC2-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-VF4IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC2-NEXT: br label [[LOOP:%.*]] +; CHECK-VF4IC2: loop: +; CHECK-VF4IC2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF4IC2-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF4IC2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[SCALAR_RECUR]], [[L]] +; CHECK-VF4IC2-NEXT: [[MIN_VAL_NEXT]] = add i64 [[L]], 1 +; CHECK-VF4IC2-NEXT: [[FOO:%.*]] = call i64 @llvm.umin.i64(i64 [[SCALAR_RECUR]], i64 [[L]]) +; 
CHECK-VF4IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF4IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-VF4IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF4IC2: exit: +; CHECK-VF4IC2-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], [[LOOP]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC2-NEXT: ret i64 [[RES]] +; +; CHECK-VF1IC2-LABEL: @test_not_vectorize_select_no_min_reduction( +; CHECK-VF1IC2-NEXT: entry: +; CHECK-VF1IC2-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF1IC2: vector.ph: +; CHECK-VF1IC2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF1IC2: vector.body: +; CHECK-VF1IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC2-NEXT: [[VEC_PHI:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC2-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC2-NEXT: [[VECTOR_RECUR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1IC2-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[SRC:%.*]], i64 [[TMP0]] +; CHECK-VF1IC2-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[TMP1]] +; CHECK-VF1IC2-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP2]], align 4 +; CHECK-VF1IC2-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 4 +; CHECK-VF1IC2-NEXT: [[TMP6:%.*]] = add i64 [[TMP4]], 1 +; CHECK-VF1IC2-NEXT: [[TMP7]] = add i64 [[TMP5]], 1 +; CHECK-VF1IC2-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[VECTOR_RECUR]], [[TMP4]] +; CHECK-VF1IC2-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP6]], [[TMP5]] +; CHECK-VF1IC2-NEXT: [[TMP10]] = select i1 [[TMP8]], i64 
[[TMP0]], i64 [[VEC_PHI]] +; CHECK-VF1IC2-NEXT: [[TMP11]] = select i1 [[TMP9]], i64 [[TMP1]], i64 [[VEC_PHI1]] +; CHECK-VF1IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-VF1IC2-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-VF1IC2-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-VF1IC2: middle.block: +; CHECK-VF1IC2-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]] +; CHECK-VF1IC2-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select i1 [[RDX_MINMAX_CMP]], i64 [[TMP10]], i64 [[TMP11]] +; CHECK-VF1IC2-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[RDX_MINMAX_SELECT]], -9223372036854775808 +; CHECK-VF1IC2-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX_SELECT]], i64 0 +; CHECK-VF1IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, 0 +; CHECK-VF1IC2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF1IC2: scalar.ph: +; CHECK-VF1IC2-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-VF1IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC2-NEXT: br label [[LOOP:%.*]] +; CHECK-VF1IC2: loop: +; CHECK-VF1IC2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF1IC2-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF1IC2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[SCALAR_RECUR]], [[L]] +; CHECK-VF1IC2-NEXT: [[MIN_VAL_NEXT]] = add i64 [[L]], 1 +; CHECK-VF1IC2-NEXT: 
[[FOO:%.*]] = call i64 @llvm.umin.i64(i64 [[SCALAR_RECUR]], i64 [[L]]) +; CHECK-VF1IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF1IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-VF1IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF1IC2: exit: +; CHECK-VF1IC2-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], [[LOOP]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC2-NEXT: ret i64 [[RES]] ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/smax-idx.ll b/llvm/test/Transforms/LoopVectorize/smax-idx.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/smax-idx.ll @@ -0,0 +1,1450 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -debug-only=loop-vectorize,iv-descriptors -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-VF4IC1 +; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=4 -debug-only=loop-vectorize,iv-descriptors -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-VF4IC4 +; RUN: opt -passes=loop-vectorize -force-vector-width=1 -force-vector-interleave=4 -debug-only=loop-vectorize,iv-descriptors -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-VF1IC4 + +define i64 @smax_idx(ptr nocapture readonly %a, i64 %mm, i64 %ii, ptr nocapture writeonly %res_max, i64 %n) { +; CHECK-LABEL: @smax_idx( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[MM:%.*]], i64 0 +; 
CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i64> [[MINMAX_IDENT_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, i64 [[II:%.*]], i32 0 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI]], <4 x i64> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i64> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP4]]) +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP8]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = 
shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[MASK_CMP:%.*]] = icmp eq <4 x i64> [[DOTSPLAT]], [[TMP4]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[MASK_SELECT:%.*]] = select <4 x i1> [[MASK_CMP]], <4 x i64> [[TMP6]], <4 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807> +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[MASK_SELECT]]) +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP9]], -9223372036854775808 +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP9]], i64 [[II]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP11:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP11]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP10]]) +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i64 [[MAX_09]], [[TMP10]] +; CHECK-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1]], i64 [[INDVARS_IV]], i64 [[IDX_011]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: 
[[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP11]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] +; +; CHECK-VF4IC1-LABEL: @smax_idx( +; CHECK-VF4IC1-NEXT: entry: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4IC1: vector.ph: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[MM:%.*]], i64 0 +; CHECK-VF4IC1-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i64> [[MINMAX_IDENT_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4IC1: vector.body: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 
[[TMP0]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP3]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI]], <4 x i64> [[WIDE_LOAD]]) +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp slt <4 x i64> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-VF4IC1-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI1]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF4IC1: middle.block: +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP3]]) +; CHECK-VF4IC1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP7]], i64 0 +; CHECK-VF4IC1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[MASK_CMP:%.*]] = icmp eq <4 x i64> [[DOTSPLAT]], [[TMP3]] +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: [[MASK_SELECT:%.*]] = select <4 x i1> [[MASK_CMP]], <4 x i64> [[TMP5]], <4 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807> +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[MASK_SELECT]]) +; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP8]], -9223372036854775808 +; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP8]], i64 [[II:%.*]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4IC1: scalar.ph: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 
[[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4IC1: for.body: +; CHECK-VF4IC1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP10:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP10]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP9]]) +; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp slt i64 [[MAX_09]], [[TMP9]] +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1]], i64 [[INDVARS_IV]], i64 [[IDX_011]] +; CHECK-VF4IC1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF4IC1: exit: +; CHECK-VF4IC1-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP10]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-VF4IC1-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] +; +; CHECK-VF4IC4-LABEL: @smax_idx( +; CHECK-VF4IC4-NEXT: entry: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16 +; CHECK-VF4IC4-NEXT: br i1 
[[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4IC4: vector.ph: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[MM:%.*]], i64 0 +; CHECK-VF4IC4-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i64> [[MINMAX_IDENT_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4IC4: vector.body: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI8:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI9:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; 
CHECK-VF4IC4-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-VF4IC4-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-VF4IC4-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP8]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 4 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i64>, ptr [[TMP9]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i64>, ptr [[TMP10]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 12 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i64>, ptr [[TMP11]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP12]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI]], <4 x i64> [[WIDE_LOAD]]) +; CHECK-VF4IC4-NEXT: [[TMP13]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI4]], <4 x i64> 
[[WIDE_LOAD11]]) +; CHECK-VF4IC4-NEXT: [[TMP14]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI5]], <4 x i64> [[WIDE_LOAD12]]) +; CHECK-VF4IC4-NEXT: [[TMP15]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI6]], <4 x i64> [[WIDE_LOAD13]]) +; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = icmp slt <4 x i64> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-VF4IC4-NEXT: [[TMP17:%.*]] = icmp slt <4 x i64> [[VEC_PHI4]], [[WIDE_LOAD11]] +; CHECK-VF4IC4-NEXT: [[TMP18:%.*]] = icmp slt <4 x i64> [[VEC_PHI5]], [[WIDE_LOAD12]] +; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = icmp slt <4 x i64> [[VEC_PHI6]], [[WIDE_LOAD13]] +; CHECK-VF4IC4-NEXT: [[TMP20]] = select <4 x i1> [[TMP16]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI7]] +; CHECK-VF4IC4-NEXT: [[TMP21]] = select <4 x i1> [[TMP17]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI8]] +; CHECK-VF4IC4-NEXT: [[TMP22]] = select <4 x i1> [[TMP18]], <4 x i64> [[STEP_ADD1]], <4 x i64> [[VEC_PHI9]] +; CHECK-VF4IC4-NEXT: [[TMP23]] = select <4 x i1> [[TMP19]], <4 x i64> [[STEP_ADD2]], <4 x i64> [[VEC_PHI10]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-VF4IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF4IC4: middle.block: +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i64> [[TMP12]], [[TMP13]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i64> [[TMP12]], <4 x i64> [[TMP13]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_CMP14:%.*]] = icmp sgt <4 x i64> [[RDX_MINMAX_SELECT]], [[TMP14]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_SELECT15:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP14]], <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> [[TMP14]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_CMP16:%.*]] = icmp sgt <4 x i64> [[RDX_MINMAX_SELECT15]], [[TMP15]] +; 
CHECK-VF4IC4-NEXT: [[RDX_MINMAX_SELECT17:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP16]], <4 x i64> [[RDX_MINMAX_SELECT15]], <4 x i64> [[TMP15]] +; CHECK-VF4IC4-NEXT: [[TMP25:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX_SELECT17]]) +; CHECK-VF4IC4-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP25]], i64 0 +; CHECK-VF4IC4-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[MASK_CMP:%.*]] = icmp eq <4 x i64> [[DOTSPLAT]], [[RDX_MINMAX_SELECT17]] +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp sge <4 x i64> [[TMP12]], [[TMP13]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select <4 x i1> [[RDX_SELECT_CMP]], <4 x i64> [[TMP20]], <4 x i64> [[TMP21]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP18:%.*]] = icmp sge <4 x i64> [[RDX_MINMAX_SELECT]], [[TMP14]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT19:%.*]] = select <4 x i1> [[RDX_SELECT_CMP18]], <4 x i64> [[RDX_SELECT]], <4 x i64> [[TMP22]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP20:%.*]] = icmp sge <4 x i64> [[RDX_MINMAX_SELECT15]], [[TMP15]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT21:%.*]] = select <4 x i1> [[RDX_SELECT_CMP20]], <4 x i64> [[RDX_SELECT19]], <4 x i64> [[TMP23]] +; CHECK-VF4IC4-NEXT: [[MASK_SELECT:%.*]] = select <4 x i1> [[MASK_CMP]], <4 x i64> [[RDX_SELECT21]], <4 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807> +; CHECK-VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[MASK_SELECT]]) +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP22:%.*]] = icmp ne i64 [[TMP26]], -9223372036854775808 +; CHECK-VF4IC4-NEXT: [[RDX_SELECT23:%.*]] = select i1 [[RDX_SELECT_CMP22]], i64 [[TMP26]], i64 [[II:%.*]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4IC4: scalar.ph: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 
[[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX24:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT23]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4IC4: for.body: +; CHECK-VF4IC4-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP28:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX24]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4IC4-NEXT: [[TMP27:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP28]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP27]]) +; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp slt i64 [[MAX_09]], [[TMP27]] +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1]], i64 [[INDVARS_IV]], i64 [[IDX_011]] +; CHECK-VF4IC4-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF4IC4: exit: +; CHECK-VF4IC4-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP28]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT23]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-VF4IC4-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] +; +; CHECK-VF1IC4-LABEL: @smax_idx( +; CHECK-VF1IC4-NEXT: entry: +; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-VF1IC4-NEXT: br i1 
[[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF1IC4: vector.ph: +; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF1IC4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF1IC4: vector.body: +; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i64 [ [[MM:%.*]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ [[MM]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ [[MM]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ [[MM]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI4:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI5:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI6:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI7:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = load i64, 
ptr [[TMP4]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP12]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI]], i64 [[TMP8]]) +; CHECK-VF1IC4-NEXT: [[TMP13]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI1]], i64 [[TMP9]]) +; CHECK-VF1IC4-NEXT: [[TMP14]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI2]], i64 [[TMP10]]) +; CHECK-VF1IC4-NEXT: [[TMP15]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI3]], i64 [[TMP11]]) +; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = icmp slt i64 [[VEC_PHI]], [[TMP8]] +; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = icmp slt i64 [[VEC_PHI1]], [[TMP9]] +; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = icmp slt i64 [[VEC_PHI2]], [[TMP10]] +; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = icmp slt i64 [[VEC_PHI3]], [[TMP11]] +; CHECK-VF1IC4-NEXT: [[TMP20]] = select i1 [[TMP16]], i64 [[TMP0]], i64 [[VEC_PHI4]] +; CHECK-VF1IC4-NEXT: [[TMP21]] = select i1 [[TMP17]], i64 [[TMP1]], i64 [[VEC_PHI5]] +; CHECK-VF1IC4-NEXT: [[TMP22]] = select i1 [[TMP18]], i64 [[TMP2]], i64 [[VEC_PHI6]] +; CHECK-VF1IC4-NEXT: [[TMP23]] = select i1 [[TMP19]], i64 [[TMP3]], i64 [[VEC_PHI7]] +; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF1IC4: middle.block: +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt i64 [[TMP12]], [[TMP13]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select i1 [[RDX_MINMAX_CMP]], i64 [[TMP12]], i64 [[TMP13]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_CMP8:%.*]] = icmp sgt i64 [[RDX_MINMAX_SELECT]], [[TMP14]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_SELECT9:%.*]] = select i1 [[RDX_MINMAX_CMP8]], i64 [[RDX_MINMAX_SELECT]], i64 [[TMP14]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_CMP10:%.*]] = 
icmp sgt i64 [[RDX_MINMAX_SELECT9]], [[TMP15]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_SELECT11:%.*]] = select i1 [[RDX_MINMAX_CMP10]], i64 [[RDX_MINMAX_SELECT9]], i64 [[TMP15]] +; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp sge i64 [[TMP12]], [[TMP13]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP20]], i64 [[TMP21]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP12:%.*]] = icmp sge i64 [[RDX_MINMAX_SELECT]], [[TMP14]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT13:%.*]] = select i1 [[RDX_SELECT_CMP12]], i64 [[RDX_SELECT]], i64 [[TMP22]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP14:%.*]] = icmp sge i64 [[RDX_MINMAX_SELECT9]], [[TMP15]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT15:%.*]] = select i1 [[RDX_SELECT_CMP14]], i64 [[RDX_SELECT13]], i64 [[TMP23]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP16:%.*]] = icmp ne i64 [[RDX_SELECT15]], -9223372036854775808 +; CHECK-VF1IC4-NEXT: [[RDX_SELECT17:%.*]] = select i1 [[RDX_SELECT_CMP16]], i64 [[RDX_SELECT15]], i64 [[II:%.*]] +; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF1IC4: scalar.ph: +; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[RDX_MINMAX_SELECT11]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX18:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT17]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF1IC4: for.body: +; CHECK-VF1IC4-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP26:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX18]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = 
getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF1IC4-NEXT: [[TMP25:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP26]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP25]]) +; CHECK-VF1IC4-NEXT: [[CMP1:%.*]] = icmp slt i64 [[MAX_09]], [[TMP25]] +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1]], i64 [[INDVARS_IV]], i64 [[IDX_011]] +; CHECK-VF1IC4-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF1IC4: exit: +; CHECK-VF1IC4-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP26]], [[FOR_BODY]] ], [ [[RDX_MINMAX_SELECT11]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT17]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-VF1IC4-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %max.09 = phi i64 [ %mm, %entry ], [ %1, %for.body ] + %idx.011 = phi i64 [ %ii, %entry ], [ %spec.select7, %for.body ] + %arrayidx = getelementptr inbounds i64, ptr %a, i64 %indvars.iv + %0 = load i64, ptr %arrayidx + %1 = tail call i64 @llvm.smax.i64(i64 %max.09, i64 %0) + %cmp1 = icmp slt i64 %max.09, %0 + %spec.select7 = select i1 %cmp1, i64 %indvars.iv, i64 %idx.011 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: + store i64 %1, ptr %res_max + ret i64 %spec.select7 +} + +; +; Check the different order of reduction phis. 
+; +define i64 @smax_idx_inverted_phi(ptr nocapture readonly %a, i64 %mm, i64 %ii, ptr nocapture writeonly %res_max, i64 %n) { +; CHECK-LABEL: @smax_idx_inverted_phi( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, i64 [[II:%.*]], i32 0 +; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[MM:%.*]], i64 0 +; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i64> [[MINMAX_IDENT_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI1]], <4 x i64> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i64> [[VEC_PHI1]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i64> [[VEC_IND]], <4 x 
i64> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP4]]) +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP8]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[MASK_CMP:%.*]] = icmp eq <4 x i64> [[DOTSPLAT]], [[TMP4]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[MASK_SELECT:%.*]] = select <4 x i1> [[MASK_CMP]], <4 x i64> [[TMP6]], <4 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807> +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[MASK_SELECT]]) +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP9]], -9223372036854775808 +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP9]], i64 [[II]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: 
[[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP11:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP11]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP10]]) +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i64 [[MAX_09]], [[TMP10]] +; CHECK-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1]], i64 [[INDVARS_IV]], i64 [[IDX_011]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP11]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] +; +; CHECK-VF4IC1-LABEL: @smax_idx_inverted_phi( +; CHECK-VF4IC1-NEXT: entry: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4IC1: vector.ph: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[MM:%.*]], i64 0 +; CHECK-VF4IC1-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i64> [[MINMAX_IDENT_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4IC1: vector.body: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = 
phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP3]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI1]], <4 x i64> [[WIDE_LOAD]]) +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp slt <4 x i64> [[VEC_PHI1]], [[WIDE_LOAD]] +; CHECK-VF4IC1-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-VF4IC1: middle.block: +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP3]]) +; CHECK-VF4IC1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP7]], i64 0 +; CHECK-VF4IC1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[MASK_CMP:%.*]] = icmp eq <4 x i64> [[DOTSPLAT]], [[TMP3]] +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: [[MASK_SELECT:%.*]] = select <4 x i1> 
[[MASK_CMP]], <4 x i64> [[TMP5]], <4 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807> +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[MASK_SELECT]]) +; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP8]], -9223372036854775808 +; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP8]], i64 [[II:%.*]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4IC1: scalar.ph: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4IC1: for.body: +; CHECK-VF4IC1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP10:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP10]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP9]]) +; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp slt i64 [[MAX_09]], [[TMP9]] +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1]], i64 [[INDVARS_IV]], i64 [[IDX_011]] +; CHECK-VF4IC1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label 
[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF4IC1: exit: +; CHECK-VF4IC1-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP10]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-VF4IC1-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] +; +; CHECK-VF4IC4-LABEL: @smax_idx_inverted_phi( +; CHECK-VF4IC4-NEXT: entry: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4IC4: vector.ph: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[MM:%.*]], i64 0 +; CHECK-VF4IC4-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i64> [[MINMAX_IDENT_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4IC4: vector.body: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, 
i64 -9223372036854775808>, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI8:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI9:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-VF4IC4-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-VF4IC4-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP8]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 4 +; CHECK-VF4IC4-NEXT: 
[[WIDE_LOAD11:%.*]] = load <4 x i64>, ptr [[TMP9]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i64>, ptr [[TMP10]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 12 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i64>, ptr [[TMP11]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP12]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI7]], <4 x i64> [[WIDE_LOAD]]) +; CHECK-VF4IC4-NEXT: [[TMP13]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI8]], <4 x i64> [[WIDE_LOAD11]]) +; CHECK-VF4IC4-NEXT: [[TMP14]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI9]], <4 x i64> [[WIDE_LOAD12]]) +; CHECK-VF4IC4-NEXT: [[TMP15]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI10]], <4 x i64> [[WIDE_LOAD13]]) +; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = icmp slt <4 x i64> [[VEC_PHI7]], [[WIDE_LOAD]] +; CHECK-VF4IC4-NEXT: [[TMP17:%.*]] = icmp slt <4 x i64> [[VEC_PHI8]], [[WIDE_LOAD11]] +; CHECK-VF4IC4-NEXT: [[TMP18:%.*]] = icmp slt <4 x i64> [[VEC_PHI9]], [[WIDE_LOAD12]] +; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = icmp slt <4 x i64> [[VEC_PHI10]], [[WIDE_LOAD13]] +; CHECK-VF4IC4-NEXT: [[TMP20]] = select <4 x i1> [[TMP16]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[TMP21]] = select <4 x i1> [[TMP17]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI4]] +; CHECK-VF4IC4-NEXT: [[TMP22]] = select <4 x i1> [[TMP18]], <4 x i64> [[STEP_ADD1]], <4 x i64> [[VEC_PHI5]] +; CHECK-VF4IC4-NEXT: [[TMP23]] = select <4 x i1> [[TMP19]], <4 x i64> [[STEP_ADD2]], <4 x i64> [[VEC_PHI6]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-VF4IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP4:![0-9]+]] +; CHECK-VF4IC4: middle.block: +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i64> [[TMP12]], [[TMP13]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i64> [[TMP12]], <4 x i64> [[TMP13]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_CMP14:%.*]] = icmp sgt <4 x i64> [[RDX_MINMAX_SELECT]], [[TMP14]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_SELECT15:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP14]], <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> [[TMP14]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_CMP16:%.*]] = icmp sgt <4 x i64> [[RDX_MINMAX_SELECT15]], [[TMP15]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_SELECT17:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP16]], <4 x i64> [[RDX_MINMAX_SELECT15]], <4 x i64> [[TMP15]] +; CHECK-VF4IC4-NEXT: [[TMP25:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX_SELECT17]]) +; CHECK-VF4IC4-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP25]], i64 0 +; CHECK-VF4IC4-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[MASK_CMP:%.*]] = icmp eq <4 x i64> [[DOTSPLAT]], [[RDX_MINMAX_SELECT17]] +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp sge <4 x i64> [[TMP12]], [[TMP13]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select <4 x i1> [[RDX_SELECT_CMP]], <4 x i64> [[TMP20]], <4 x i64> [[TMP21]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP18:%.*]] = icmp sge <4 x i64> [[RDX_MINMAX_SELECT]], [[TMP14]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT19:%.*]] = select <4 x i1> [[RDX_SELECT_CMP18]], <4 x i64> [[RDX_SELECT]], <4 x i64> [[TMP22]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP20:%.*]] = icmp sge <4 x i64> [[RDX_MINMAX_SELECT15]], [[TMP15]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT21:%.*]] = select <4 x i1> [[RDX_SELECT_CMP20]], <4 x i64> [[RDX_SELECT19]], <4 x i64> [[TMP23]] +; CHECK-VF4IC4-NEXT: [[MASK_SELECT:%.*]] = select <4 x i1> 
[[MASK_CMP]], <4 x i64> [[RDX_SELECT21]], <4 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807> +; CHECK-VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[MASK_SELECT]]) +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP22:%.*]] = icmp ne i64 [[TMP26]], -9223372036854775808 +; CHECK-VF4IC4-NEXT: [[RDX_SELECT23:%.*]] = select i1 [[RDX_SELECT_CMP22]], i64 [[TMP26]], i64 [[II:%.*]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4IC4: scalar.ph: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX24:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT23]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4IC4: for.body: +; CHECK-VF4IC4-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX24]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP28:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4IC4-NEXT: [[TMP27:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP28]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP27]]) +; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp slt i64 [[MAX_09]], [[TMP27]] +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1]], i64 [[INDVARS_IV]], i64 [[IDX_011]] +; CHECK-VF4IC4-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label 
[[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF4IC4: exit: +; CHECK-VF4IC4-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP28]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT23]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-VF4IC4-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] +; +; CHECK-VF1IC4-LABEL: @smax_idx_inverted_phi( +; CHECK-VF1IC4-NEXT: entry: +; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF1IC4: vector.ph: +; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF1IC4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF1IC4: vector.body: +; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI4:%.*]] = phi i64 [ [[MM:%.*]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI5:%.*]] = phi i64 [ [[MM]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI6:%.*]] = phi i64 [ [[MM]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI7:%.*]] = phi i64 [ [[MM]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; 
CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP4]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP12]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI4]], i64 [[TMP8]]) +; CHECK-VF1IC4-NEXT: [[TMP13]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI5]], i64 [[TMP9]]) +; CHECK-VF1IC4-NEXT: [[TMP14]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI6]], i64 [[TMP10]]) +; CHECK-VF1IC4-NEXT: [[TMP15]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI7]], i64 [[TMP11]]) +; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = icmp slt i64 [[VEC_PHI4]], [[TMP8]] +; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = icmp slt i64 [[VEC_PHI5]], [[TMP9]] +; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = icmp slt i64 [[VEC_PHI6]], [[TMP10]] +; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = icmp slt i64 [[VEC_PHI7]], [[TMP11]] +; CHECK-VF1IC4-NEXT: [[TMP20]] = select i1 [[TMP16]], i64 [[TMP0]], i64 [[VEC_PHI]] +; CHECK-VF1IC4-NEXT: [[TMP21]] = select i1 [[TMP17]], i64 [[TMP1]], i64 [[VEC_PHI1]] +; CHECK-VF1IC4-NEXT: [[TMP22]] = select i1 [[TMP18]], i64 [[TMP2]], i64 [[VEC_PHI2]] +; CHECK-VF1IC4-NEXT: [[TMP23]] = select i1 [[TMP19]], i64 [[TMP3]], i64 [[VEC_PHI3]] +; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp eq 
i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-VF1IC4: middle.block: +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt i64 [[TMP12]], [[TMP13]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select i1 [[RDX_MINMAX_CMP]], i64 [[TMP12]], i64 [[TMP13]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_CMP8:%.*]] = icmp sgt i64 [[RDX_MINMAX_SELECT]], [[TMP14]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_SELECT9:%.*]] = select i1 [[RDX_MINMAX_CMP8]], i64 [[RDX_MINMAX_SELECT]], i64 [[TMP14]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_CMP10:%.*]] = icmp sgt i64 [[RDX_MINMAX_SELECT9]], [[TMP15]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_SELECT11:%.*]] = select i1 [[RDX_MINMAX_CMP10]], i64 [[RDX_MINMAX_SELECT9]], i64 [[TMP15]] +; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp sge i64 [[TMP12]], [[TMP13]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP20]], i64 [[TMP21]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP12:%.*]] = icmp sge i64 [[RDX_MINMAX_SELECT]], [[TMP14]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT13:%.*]] = select i1 [[RDX_SELECT_CMP12]], i64 [[RDX_SELECT]], i64 [[TMP22]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP14:%.*]] = icmp sge i64 [[RDX_MINMAX_SELECT9]], [[TMP15]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT15:%.*]] = select i1 [[RDX_SELECT_CMP14]], i64 [[RDX_SELECT13]], i64 [[TMP23]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP16:%.*]] = icmp ne i64 [[RDX_SELECT15]], -9223372036854775808 +; CHECK-VF1IC4-NEXT: [[RDX_SELECT17:%.*]] = select i1 [[RDX_SELECT_CMP16]], i64 [[RDX_SELECT15]], i64 [[II:%.*]] +; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF1IC4: scalar.ph: +; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ 
[[RDX_MINMAX_SELECT11]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX18:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT17]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF1IC4: for.body: +; CHECK-VF1IC4-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX18]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP26:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF1IC4-NEXT: [[TMP25:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP26]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP25]]) +; CHECK-VF1IC4-NEXT: [[CMP1:%.*]] = icmp slt i64 [[MAX_09]], [[TMP25]] +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1]], i64 [[INDVARS_IV]], i64 [[IDX_011]] +; CHECK-VF1IC4-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF1IC4: exit: +; CHECK-VF1IC4-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP26]], [[FOR_BODY]] ], [ [[RDX_MINMAX_SELECT11]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT17]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-VF1IC4-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %idx.011 = phi i64 [ %ii, %entry ], [ %spec.select7, %for.body ] ;; + %max.09 = phi i64 [ %mm, %entry ], [ %1, %for.body ] ;; + %arrayidx = getelementptr 
inbounds i64, ptr %a, i64 %indvars.iv + %0 = load i64, ptr %arrayidx + %1 = tail call i64 @llvm.smax.i64(i64 %max.09, i64 %0) + %cmp1 = icmp slt i64 %max.09, %0 + %spec.select7 = select i1 %cmp1, i64 %indvars.iv, i64 %idx.011 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: + store i64 %1, ptr %res_max + ret i64 %spec.select7 +} + +; Negative test: the loop must stay scalar when the smax result is not used +; outside the loop (only the selected index is returned). +; +; The MMI (presumably min/max-with-index) recognition currently ends by +; requiring the smax recurrence to have its own loop-exit instruction. +; In principle the exit instruction of the SelectICmp recurrence could be +; used as the exit instruction of the whole MMI pattern instead. +; +define i64 @smax_idx_max_no_exit_user(ptr nocapture readonly %a, i64 %mm, i64 %ii, i64 %n) { +; CHECK-LABEL: @smax_idx_max_no_exit_user( +; CHECK-NOT: vector.body: +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %max.09 = phi i64 [ %mm, %entry ], [ %1, %for.body ] + %idx.011 = phi i64 [ %ii, %entry ], [ %spec.select7, %for.body ] + %arrayidx = getelementptr inbounds i64, ptr %a, i64 %indvars.iv + %0 = load i64, ptr %arrayidx + %1 = tail call i64 @llvm.smax.i64(i64 %max.09, i64 %0) + %cmp1 = icmp slt i64 %max.09, %0 + %spec.select7 = select i1 %cmp1, i64 %indvars.iv, i64 %idx.011 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ; %1 has no external users + ret i64 %spec.select7 +} + +; Negative test: smax implemented in terms of select(cmp()) instead of the +; llvm.smax intrinsic; the loop must stay scalar. +; +; Currently SelectICmp does not support an icmp with multiple users. +; It may be possible to reuse some of the methods in the Combination pass +; (NOTE(review): presumably InstCombine - confirm) to check whether the icmp +; can be copied. 
+; +define i64 @smax_idx_select_cmp(ptr nocapture readonly %a, i64 %mm, i64 %ii, ptr nocapture writeonly %res_max, i64 %n) { +; CHECK-LABEL: @smax_idx_select_cmp( +; CHECK-NOT: vector.body: +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %max.09 = phi i64 [ %mm, %entry ], [ %spec.select, %for.body ] + %idx.011 = phi i64 [ %ii, %entry ], [ %spec.select7, %for.body ] + %arrayidx = getelementptr inbounds i64, ptr %a, i64 %indvars.iv + %0 = load i64, ptr %arrayidx + %cmp1 = icmp slt i64 %max.09, %0 ;; shared by both selects below + %spec.select = select i1 %cmp1, i64 %0, i64 %max.09 ;; smax spelled as select(cmp()) + %spec.select7 = select i1 %cmp1, i64 %indvars.iv, i64 %idx.011 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: + store i64 %spec.select, ptr %res_max + ret i64 %spec.select7 +} + +; +; Check sge case. +; +define i64 @smax_idx_inverted_pred(ptr nocapture readonly %a, i64 %mm, i64 %ii, ptr nocapture writeonly %res_max, i64 %n) { +; CHECK-LABEL: @smax_idx_inverted_pred( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[MM:%.*]], i64 0 +; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i64> [[MINMAX_IDENT_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, i64 [[II:%.*]], i32 0 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI]], <4 x i64> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP5:%.*]] = icmp sge <4 x i64> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP4]]) +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP8]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[MASK_CMP:%.*]] = icmp eq <4 x i64> [[DOTSPLAT]], [[TMP4]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[MASK_SELECT:%.*]] = select <4 x i1> [[MASK_CMP]], <4 x i64> [[TMP6]], <4 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807> +; CHECK-NEXT: [[TMP9:%.*]] = 
call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[MASK_SELECT]]) +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP9]], -9223372036854775808 +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP9]], i64 [[II]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP11:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP11]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP10]]) +; CHECK-NEXT: [[CMP1:%.*]] = icmp sge i64 [[TMP10]], [[MAX_09]] +; CHECK-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1]], i64 [[INDVARS_IV]], i64 [[IDX_011]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP11]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: store i64 [[DOTLCSSA]], ptr 
[[RES_MAX:%.*]], align 4 +; CHECK-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] +; +; CHECK-VF4IC1-LABEL: @smax_idx_inverted_pred( +; CHECK-VF4IC1-NEXT: entry: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4IC1: vector.ph: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[MM:%.*]], i64 0 +; CHECK-VF4IC1-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i64> [[MINMAX_IDENT_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4IC1: vector.body: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP3]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI]], <4 x i64> [[WIDE_LOAD]]) +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp sge <4 x i64> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i64> 
[[VEC_IND]], <4 x i64> [[VEC_PHI1]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-VF4IC1: middle.block: +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP3]]) +; CHECK-VF4IC1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP7]], i64 0 +; CHECK-VF4IC1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[MASK_CMP:%.*]] = icmp eq <4 x i64> [[DOTSPLAT]], [[TMP3]] +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: [[MASK_SELECT:%.*]] = select <4 x i1> [[MASK_CMP]], <4 x i64> [[TMP5]], <4 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807> +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[MASK_SELECT]]) +; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP8]], -9223372036854775808 +; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP8]], i64 [[II:%.*]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4IC1: scalar.ph: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4IC1: for.body: +; CHECK-VF4IC1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ 
[[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP10:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP10]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP9]]) +; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp sge i64 [[TMP9]], [[MAX_09]] +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1]], i64 [[INDVARS_IV]], i64 [[IDX_011]] +; CHECK-VF4IC1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-VF4IC1: exit: +; CHECK-VF4IC1-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP10]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-VF4IC1-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] +; +; CHECK-VF4IC4-LABEL: @smax_idx_inverted_pred( +; CHECK-VF4IC4-NEXT: entry: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4IC4: vector.ph: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[MM:%.*]], i64 0 +; CHECK-VF4IC4-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i64> 
[[MINMAX_IDENT_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4IC4: vector.body: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI8:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI9:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-VF4IC4-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], <i64 4, i64 4, i64 4, i64 4> +; 
CHECK-VF4IC4-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP8]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 4 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i64>, ptr [[TMP9]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i64>, ptr [[TMP10]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 12 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i64>, ptr [[TMP11]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP12]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI]], <4 x i64> [[WIDE_LOAD]]) +; CHECK-VF4IC4-NEXT: [[TMP13]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI4]], <4 x i64> [[WIDE_LOAD11]]) +; CHECK-VF4IC4-NEXT: [[TMP14]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI5]], <4 x i64> [[WIDE_LOAD12]]) +; CHECK-VF4IC4-NEXT: [[TMP15]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI6]], <4 x i64> [[WIDE_LOAD13]]) +; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = icmp sge <4 x i64> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[TMP17:%.*]] = icmp sge <4 x i64> [[WIDE_LOAD11]], [[VEC_PHI4]] +; 
CHECK-VF4IC4-NEXT: [[TMP18:%.*]] = icmp sge <4 x i64> [[WIDE_LOAD12]], [[VEC_PHI5]] +; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = icmp sge <4 x i64> [[WIDE_LOAD13]], [[VEC_PHI6]] +; CHECK-VF4IC4-NEXT: [[TMP20]] = select <4 x i1> [[TMP16]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI7]] +; CHECK-VF4IC4-NEXT: [[TMP21]] = select <4 x i1> [[TMP17]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI8]] +; CHECK-VF4IC4-NEXT: [[TMP22]] = select <4 x i1> [[TMP18]], <4 x i64> [[STEP_ADD1]], <4 x i64> [[VEC_PHI9]] +; CHECK-VF4IC4-NEXT: [[TMP23]] = select <4 x i1> [[TMP19]], <4 x i64> [[STEP_ADD2]], <4 x i64> [[VEC_PHI10]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-VF4IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-VF4IC4: middle.block: +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i64> [[TMP12]], [[TMP13]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i64> [[TMP12]], <4 x i64> [[TMP13]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_CMP14:%.*]] = icmp sgt <4 x i64> [[RDX_MINMAX_SELECT]], [[TMP14]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_SELECT15:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP14]], <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> [[TMP14]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_CMP16:%.*]] = icmp sgt <4 x i64> [[RDX_MINMAX_SELECT15]], [[TMP15]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_SELECT17:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP16]], <4 x i64> [[RDX_MINMAX_SELECT15]], <4 x i64> [[TMP15]] +; CHECK-VF4IC4-NEXT: [[TMP25:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX_SELECT17]]) +; CHECK-VF4IC4-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP25]], i64 0 +; CHECK-VF4IC4-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], 
<4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[MASK_CMP:%.*]] = icmp eq <4 x i64> [[DOTSPLAT]], [[RDX_MINMAX_SELECT17]] +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp sge <4 x i64> [[TMP12]], [[TMP13]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select <4 x i1> [[RDX_SELECT_CMP]], <4 x i64> [[TMP20]], <4 x i64> [[TMP21]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP18:%.*]] = icmp sge <4 x i64> [[RDX_MINMAX_SELECT]], [[TMP14]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT19:%.*]] = select <4 x i1> [[RDX_SELECT_CMP18]], <4 x i64> [[RDX_SELECT]], <4 x i64> [[TMP22]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP20:%.*]] = icmp sge <4 x i64> [[RDX_MINMAX_SELECT15]], [[TMP15]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT21:%.*]] = select <4 x i1> [[RDX_SELECT_CMP20]], <4 x i64> [[RDX_SELECT19]], <4 x i64> [[TMP23]] +; CHECK-VF4IC4-NEXT: [[MASK_SELECT:%.*]] = select <4 x i1> [[MASK_CMP]], <4 x i64> [[RDX_SELECT21]], <4 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807> +; CHECK-VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[MASK_SELECT]]) +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP22:%.*]] = icmp ne i64 [[TMP26]], -9223372036854775808 +; CHECK-VF4IC4-NEXT: [[RDX_SELECT23:%.*]] = select i1 [[RDX_SELECT_CMP22]], i64 [[TMP26]], i64 [[II:%.*]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4IC4: scalar.ph: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX24:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT23]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4IC4: for.body: +; CHECK-VF4IC4-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] 
], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP28:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX24]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4IC4-NEXT: [[TMP27:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP28]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP27]]) +; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sge i64 [[TMP27]], [[MAX_09]] +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1]], i64 [[INDVARS_IV]], i64 [[IDX_011]] +; CHECK-VF4IC4-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-VF4IC4: exit: +; CHECK-VF4IC4-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP28]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT23]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-VF4IC4-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] +; +; CHECK-VF1IC4-LABEL: @smax_idx_inverted_pred( +; CHECK-VF1IC4-NEXT: entry: +; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF1IC4: vector.ph: +; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF1IC4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF1IC4: vector.body: +; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; 
CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i64 [ [[MM:%.*]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ [[MM]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ [[MM]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ [[MM]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI4:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI5:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI6:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI7:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP4]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP12]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI]], i64 [[TMP8]]) +; CHECK-VF1IC4-NEXT: [[TMP13]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI1]], i64 
[[TMP9]]) +; CHECK-VF1IC4-NEXT: [[TMP14]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI2]], i64 [[TMP10]]) +; CHECK-VF1IC4-NEXT: [[TMP15]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI3]], i64 [[TMP11]]) +; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = icmp sge i64 [[TMP8]], [[VEC_PHI]] +; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = icmp sge i64 [[TMP9]], [[VEC_PHI1]] +; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = icmp sge i64 [[TMP10]], [[VEC_PHI2]] +; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = icmp sge i64 [[TMP11]], [[VEC_PHI3]] +; CHECK-VF1IC4-NEXT: [[TMP20]] = select i1 [[TMP16]], i64 [[TMP0]], i64 [[VEC_PHI4]] +; CHECK-VF1IC4-NEXT: [[TMP21]] = select i1 [[TMP17]], i64 [[TMP1]], i64 [[VEC_PHI5]] +; CHECK-VF1IC4-NEXT: [[TMP22]] = select i1 [[TMP18]], i64 [[TMP2]], i64 [[VEC_PHI6]] +; CHECK-VF1IC4-NEXT: [[TMP23]] = select i1 [[TMP19]], i64 [[TMP3]], i64 [[VEC_PHI7]] +; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-VF1IC4: middle.block: +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt i64 [[TMP12]], [[TMP13]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select i1 [[RDX_MINMAX_CMP]], i64 [[TMP12]], i64 [[TMP13]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_CMP8:%.*]] = icmp sgt i64 [[RDX_MINMAX_SELECT]], [[TMP14]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_SELECT9:%.*]] = select i1 [[RDX_MINMAX_CMP8]], i64 [[RDX_MINMAX_SELECT]], i64 [[TMP14]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_CMP10:%.*]] = icmp sgt i64 [[RDX_MINMAX_SELECT9]], [[TMP15]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_SELECT11:%.*]] = select i1 [[RDX_MINMAX_CMP10]], i64 [[RDX_MINMAX_SELECT9]], i64 [[TMP15]] +; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp sge i64 [[TMP12]], [[TMP13]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP20]], 
i64 [[TMP21]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP12:%.*]] = icmp sge i64 [[RDX_MINMAX_SELECT]], [[TMP14]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT13:%.*]] = select i1 [[RDX_SELECT_CMP12]], i64 [[RDX_SELECT]], i64 [[TMP22]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP14:%.*]] = icmp sge i64 [[RDX_MINMAX_SELECT9]], [[TMP15]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT15:%.*]] = select i1 [[RDX_SELECT_CMP14]], i64 [[RDX_SELECT13]], i64 [[TMP23]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP16:%.*]] = icmp ne i64 [[RDX_SELECT15]], -9223372036854775808 +; CHECK-VF1IC4-NEXT: [[RDX_SELECT17:%.*]] = select i1 [[RDX_SELECT_CMP16]], i64 [[RDX_SELECT15]], i64 [[II:%.*]] +; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF1IC4: scalar.ph: +; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[RDX_MINMAX_SELECT11]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX18:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT17]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF1IC4: for.body: +; CHECK-VF1IC4-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP26:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX18]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF1IC4-NEXT: [[TMP25:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP26]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP25]]) +; CHECK-VF1IC4-NEXT: [[CMP1:%.*]] = icmp sge i64 [[TMP25]], [[MAX_09]] +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1]], i64 [[INDVARS_IV]], i64 [[IDX_011]] +; 
CHECK-VF1IC4-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-VF1IC4: exit: +; CHECK-VF1IC4-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP26]], [[FOR_BODY]] ], [ [[RDX_MINMAX_SELECT11]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT17]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-VF1IC4-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %max.09 = phi i64 [ %mm, %entry ], [ %1, %for.body ] + %idx.011 = phi i64 [ %ii, %entry ], [ %spec.select7, %for.body ] + %arrayidx = getelementptr inbounds i64, ptr %a, i64 %indvars.iv + %0 = load i64, ptr %arrayidx + %1 = tail call i64 @llvm.smax.i64(i64 %max.09, i64 %0) + %cmp1 = icmp sge i64 %0, %max.09 ;; + %spec.select7 = select i1 %cmp1, i64 %indvars.iv, i64 %idx.011 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: + store i64 %1, ptr %res_max + ret i64 %spec.select7 +} + +; +; In such cases, the last index should be extracted. 
+; +define i64 @smax_idx_extract_last(ptr nocapture readonly %a, i64 %mm, i64 %ii, ptr nocapture writeonly %res_max, i64 %n) { +; CHECK-LABEL: @smax_idx_extract_last( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[MM:%.*]], i64 0 +; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i64> [[MINMAX_IDENT_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, i64 [[II:%.*]], i32 0 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI]], <4 x i64> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i64> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i64> [[VEC_PHI1]], <4 x 
i64> [[VEC_IND]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP4]]) +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP8]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[MASK_CMP:%.*]] = icmp eq <4 x i64> [[DOTSPLAT]], [[TMP4]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[MASK_SELECT:%.*]] = select <4 x i1> [[MASK_CMP]], <4 x i64> [[TMP6]], <4 x i64> <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808> +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[MASK_SELECT]]) +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP9]], -9223372036854775808 +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP9]], i64 [[II]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP11:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: 
[[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP11]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP10]]) +; CHECK-NEXT: [[CMP1_NOT:%.*]] = icmp sgt i64 [[MAX_09]], [[TMP10]] +; CHECK-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1_NOT]], i64 [[IDX_011]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP11]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] +; +; CHECK-VF4IC1-LABEL: @smax_idx_extract_last( +; CHECK-VF4IC1-NEXT: entry: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4IC1: vector.ph: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[MM:%.*]], i64 0 +; CHECK-VF4IC1-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i64> [[MINMAX_IDENT_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4IC1: vector.body: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: 
[[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP3]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI]], <4 x i64> [[WIDE_LOAD]]) +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i64> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-VF4IC1-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i64> [[VEC_PHI1]], <4 x i64> [[VEC_IND]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-VF4IC1: middle.block: +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP3]]) +; CHECK-VF4IC1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP7]], i64 0 +; CHECK-VF4IC1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[MASK_CMP:%.*]] = icmp eq <4 x i64> [[DOTSPLAT]], [[TMP3]] +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: [[MASK_SELECT:%.*]] = select <4 x 
i1> [[MASK_CMP]], <4 x i64> [[TMP5]], <4 x i64> <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808> +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[MASK_SELECT]]) +; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP8]], -9223372036854775808 +; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP8]], i64 [[II:%.*]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4IC1: scalar.ph: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4IC1: for.body: +; CHECK-VF4IC1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP10:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP10]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP9]]) +; CHECK-VF4IC1-NEXT: [[CMP1_NOT:%.*]] = icmp sgt i64 [[MAX_09]], [[TMP9]] +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1_NOT]], i64 [[IDX_011]], i64 [[INDVARS_IV]] +; CHECK-VF4IC1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], 
label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-VF4IC1: exit: +; CHECK-VF4IC1-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP10]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-VF4IC1-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] +; +; CHECK-VF4IC4-LABEL: @smax_idx_extract_last( +; CHECK-VF4IC4-NEXT: entry: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4IC4: vector.ph: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[MM:%.*]], i64 0 +; CHECK-VF4IC4-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i64> [[MINMAX_IDENT_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4IC4: vector.body: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: 
[[VEC_PHI7:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI8:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI9:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-VF4IC4-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-VF4IC4-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP8]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 4 +; CHECK-VF4IC4-NEXT: 
[[WIDE_LOAD11:%.*]] = load <4 x i64>, ptr [[TMP9]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i64>, ptr [[TMP10]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 12 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i64>, ptr [[TMP11]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP12]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI]], <4 x i64> [[WIDE_LOAD]]) +; CHECK-VF4IC4-NEXT: [[TMP13]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI4]], <4 x i64> [[WIDE_LOAD11]]) +; CHECK-VF4IC4-NEXT: [[TMP14]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI5]], <4 x i64> [[WIDE_LOAD12]]) +; CHECK-VF4IC4-NEXT: [[TMP15]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI6]], <4 x i64> [[WIDE_LOAD13]]) +; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i64> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-VF4IC4-NEXT: [[TMP17:%.*]] = icmp sgt <4 x i64> [[VEC_PHI4]], [[WIDE_LOAD11]] +; CHECK-VF4IC4-NEXT: [[TMP18:%.*]] = icmp sgt <4 x i64> [[VEC_PHI5]], [[WIDE_LOAD12]] +; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = icmp sgt <4 x i64> [[VEC_PHI6]], [[WIDE_LOAD13]] +; CHECK-VF4IC4-NEXT: [[TMP20]] = select <4 x i1> [[TMP16]], <4 x i64> [[VEC_PHI7]], <4 x i64> [[VEC_IND]] +; CHECK-VF4IC4-NEXT: [[TMP21]] = select <4 x i1> [[TMP17]], <4 x i64> [[VEC_PHI8]], <4 x i64> [[STEP_ADD]] +; CHECK-VF4IC4-NEXT: [[TMP22]] = select <4 x i1> [[TMP18]], <4 x i64> [[VEC_PHI9]], <4 x i64> [[STEP_ADD1]] +; CHECK-VF4IC4-NEXT: [[TMP23]] = select <4 x i1> [[TMP19]], <4 x i64> [[VEC_PHI10]], <4 x i64> [[STEP_ADD2]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-VF4IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP8:![0-9]+]] +; CHECK-VF4IC4: middle.block: +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i64> [[TMP12]], [[TMP13]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i64> [[TMP12]], <4 x i64> [[TMP13]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_CMP14:%.*]] = icmp sgt <4 x i64> [[RDX_MINMAX_SELECT]], [[TMP14]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_SELECT15:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP14]], <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> [[TMP14]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_CMP16:%.*]] = icmp sgt <4 x i64> [[RDX_MINMAX_SELECT15]], [[TMP15]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_SELECT17:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP16]], <4 x i64> [[RDX_MINMAX_SELECT15]], <4 x i64> [[TMP15]] +; CHECK-VF4IC4-NEXT: [[TMP25:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX_SELECT17]]) +; CHECK-VF4IC4-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP25]], i64 0 +; CHECK-VF4IC4-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[MASK_CMP:%.*]] = icmp eq <4 x i64> [[DOTSPLAT]], [[RDX_MINMAX_SELECT17]] +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i64> [[TMP20]], <4 x i64> [[TMP21]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT18:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP14]], <4 x i64> [[RDX_SELECT]], <4 x i64> [[TMP22]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT19:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP16]], <4 x i64> [[RDX_SELECT18]], <4 x i64> [[TMP23]] +; CHECK-VF4IC4-NEXT: [[MASK_SELECT:%.*]] = select <4 x i1> [[MASK_CMP]], <4 x i64> [[RDX_SELECT19]], <4 x i64> <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808> +; CHECK-VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[MASK_SELECT]]) +; CHECK-VF4IC4-NEXT: 
[[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP26]], -9223372036854775808 +; CHECK-VF4IC4-NEXT: [[RDX_SELECT20:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP26]], i64 [[II:%.*]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4IC4: scalar.ph: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX21:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT20]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4IC4: for.body: +; CHECK-VF4IC4-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP28:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX21]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4IC4-NEXT: [[TMP27:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP28]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP27]]) +; CHECK-VF4IC4-NEXT: [[CMP1_NOT:%.*]] = icmp sgt i64 [[MAX_09]], [[TMP27]] +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1_NOT]], i64 [[IDX_011]], i64 [[INDVARS_IV]] +; CHECK-VF4IC4-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-VF4IC4: exit: +; CHECK-VF4IC4-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP28]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], 
[[FOR_BODY]] ], [ [[RDX_SELECT20]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-VF4IC4-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] +; +; CHECK-VF1IC4-LABEL: @smax_idx_extract_last( +; CHECK-VF1IC4-NEXT: entry: +; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF1IC4: vector.ph: +; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF1IC4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF1IC4: vector.body: +; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i64 [ [[MM:%.*]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ [[MM]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ [[MM]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ [[MM]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI4:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI5:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI6:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI7:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = 
getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP4]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP12]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI]], i64 [[TMP8]]) +; CHECK-VF1IC4-NEXT: [[TMP13]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI1]], i64 [[TMP9]]) +; CHECK-VF1IC4-NEXT: [[TMP14]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI2]], i64 [[TMP10]]) +; CHECK-VF1IC4-NEXT: [[TMP15]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI3]], i64 [[TMP11]]) +; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = icmp sgt i64 [[VEC_PHI]], [[TMP8]] +; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = icmp sgt i64 [[VEC_PHI1]], [[TMP9]] +; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = icmp sgt i64 [[VEC_PHI2]], [[TMP10]] +; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = icmp sgt i64 [[VEC_PHI3]], [[TMP11]] +; CHECK-VF1IC4-NEXT: [[TMP20]] = select i1 [[TMP16]], i64 [[VEC_PHI4]], i64 [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP21]] = select i1 [[TMP17]], i64 [[VEC_PHI5]], i64 [[TMP1]] +; CHECK-VF1IC4-NEXT: [[TMP22]] = select i1 [[TMP18]], i64 [[VEC_PHI6]], i64 [[TMP2]] +; CHECK-VF1IC4-NEXT: [[TMP23]] = select i1 [[TMP19]], i64 [[VEC_PHI7]], i64 [[TMP3]] +; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-VF1IC4: middle.block: +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt i64 [[TMP12]], [[TMP13]] +; 
CHECK-VF1IC4-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select i1 [[RDX_MINMAX_CMP]], i64 [[TMP12]], i64 [[TMP13]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_CMP8:%.*]] = icmp sgt i64 [[RDX_MINMAX_SELECT]], [[TMP14]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_SELECT9:%.*]] = select i1 [[RDX_MINMAX_CMP8]], i64 [[RDX_MINMAX_SELECT]], i64 [[TMP14]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_CMP10:%.*]] = icmp sgt i64 [[RDX_MINMAX_SELECT9]], [[TMP15]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_SELECT11:%.*]] = select i1 [[RDX_MINMAX_CMP10]], i64 [[RDX_MINMAX_SELECT9]], i64 [[TMP15]] +; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_MINMAX_CMP]], i64 [[TMP20]], i64 [[TMP21]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT12:%.*]] = select i1 [[RDX_MINMAX_CMP8]], i64 [[RDX_SELECT]], i64 [[TMP22]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT13:%.*]] = select i1 [[RDX_MINMAX_CMP10]], i64 [[RDX_SELECT12]], i64 [[TMP23]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[RDX_SELECT13]], -9223372036854775808 +; CHECK-VF1IC4-NEXT: [[RDX_SELECT14:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_SELECT13]], i64 [[II:%.*]] +; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF1IC4: scalar.ph: +; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[RDX_MINMAX_SELECT11]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX15:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT14]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF1IC4: for.body: +; CHECK-VF1IC4-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP26:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX15]], 
[[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF1IC4-NEXT: [[TMP25:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP26]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP25]]) +; CHECK-VF1IC4-NEXT: [[CMP1_NOT:%.*]] = icmp sgt i64 [[MAX_09]], [[TMP25]] +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1_NOT]], i64 [[IDX_011]], i64 [[INDVARS_IV]] +; CHECK-VF1IC4-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-VF1IC4: exit: +; CHECK-VF1IC4-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP26]], [[FOR_BODY]] ], [ [[RDX_MINMAX_SELECT11]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT14]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-VF1IC4-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %max.09 = phi i64 [ %mm, %entry ], [ %1, %for.body ] + %idx.011 = phi i64 [ %ii, %entry ], [ %spec.select7, %for.body ] + %arrayidx = getelementptr inbounds i64, ptr %a, i64 %indvars.iv + %0 = load i64, ptr %arrayidx + %1 = tail call i64 @llvm.smax.i64(i64 %max.09, i64 %0) + %cmp1.not = icmp sgt i64 %max.09, %0 + %spec.select7 = select i1 %cmp1.not, i64 %idx.011, i64 %indvars.iv + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: + store i64 %1, ptr %res_max + ret i64 %spec.select7 +} + +; +; The operands of smax intrinsic and icmp are not the same to be recognized as MMI. 
+;
+; FIXME: this case should not be vectorized. We have to check the operands of intrinsic and icmp.
+define i64 @smax_idx_not_vec_1(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %mm, i64 %ii, ptr nocapture writeonly %res_max, i64 %n) {
+; CHECK-LABEL: @smax_idx_not_vec_1(
+; CHECK-NOT: vector.body:
+;
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.09 = phi i64 [ %mm, %entry ], [ %2, %for.body ] ; running maximum, seeded with %mm
+  %idx.011 = phi i64 [ %ii, %entry ], [ %spec.select7, %for.body ] ; running index, seeded with %ii
+  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %indvars.iv
+  %0 = load i64, ptr %arrayidx ; a[i]: the value fed to the smax recurrence
+  %arrayidx.01 = getelementptr inbounds i64, ptr %b, i64 %indvars.iv
+  %1 = load i64, ptr %arrayidx.01 ; b[i]: the value the index-selecting compare uses
+  %2 = tail call i64 @llvm.smax.i64(i64 %max.09, i64 %0)
+  %cmp1 = icmp slt i64 %max.09, %1 ; compares against b[i], not the a[i] operand of the smax above
+  %spec.select7 = select i1 %cmp1, i64 %indvars.iv, i64 %idx.011
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  store i64 %2, ptr %res_max
+  ret i64 %spec.select7
+}
+
+;
+; It cannot be recognized as MMI when the operand of index select is not an induction variable.
+;
+define i64 @smax_idx_not_vec_2(ptr nocapture readonly %a, i64 %mm, i64 %ii, ptr nocapture writeonly %res_max, i64 %n) {
+; CHECK-LABEL: @smax_idx_not_vec_2(
+; CHECK-NOT: vector.body:
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %max.cur = phi i64 [ %mm, %entry ], [ %max.next, %for.body ] ; running maximum, seeded with %mm
+  %idx.cur = phi i64 [ %ii, %entry ], [ %idx.next, %for.body ] ; running index, seeded with %ii
+  %elt.ptr = getelementptr inbounds i64, ptr %a, i64 %iv
+  %elt = load i64, ptr %elt.ptr
+  %max.next = tail call i64 @llvm.smax.i64(i64 %max.cur, i64 %elt)
+  %is.new.max = icmp slt i64 %max.cur, %elt
+  %idx.next = select i1 %is.new.max, i64 123, i64 %idx.cur ; selects the constant 123 rather than the induction variable %iv
+  %iv.next = add nuw nsw i64 %iv, 1
+  %done = icmp eq i64 %iv.next, %n
+  br i1 %done, label %exit, label %for.body
+
+exit:
+  store i64 %max.next, ptr %res_max
+  ret i64 %idx.next
+}
+
+declare i64 @llvm.smax.i64(i64, i64)