diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -56,6 +56,8 @@ ///< loop induction PHI SelectIVFCmp, ///< Integer select(fcmp(),x,y) where one of (x,y) is increasing ///< loop induction PHI + MinMaxFirstIdx, ///< Min/Max with first index + MinMaxLastIdx ///< Min/Max with last index }; /// The RecurrenceDescriptor is used to identify recurrences variables in a @@ -78,11 +80,13 @@ RecurKind K, FastMathFlags FMF, Instruction *ExactFP, Type *RT, bool Signed, bool Ordered, SmallPtrSetImpl &CI, - unsigned MinWidthCastToRecurTy) + unsigned MinWidthCastToRecurTy, PHINode *UserRecurPhi, + RecurKind UserRecurKind) : IntermediateStore(Store), StartValue(Start), LoopExitInstr(Exit), Kind(K), FMF(FMF), ExactFPMathInst(ExactFP), RecurrenceType(RT), IsSigned(Signed), IsOrdered(Ordered), - MinWidthCastToRecurrenceType(MinWidthCastToRecurTy) { + MinWidthCastToRecurrenceType(MinWidthCastToRecurTy), + UserRecurPhi(UserRecurPhi), UserRecurKind(UserRecurKind) { CastInsts.insert(CI.begin(), CI.end()); } @@ -97,6 +101,12 @@ : IsRecurrence(true), PatternLastInst(I), RecKind(K), ExactFPMathInst(ExactFP) {} + InstDesc(bool IsRecur, Instruction *I, PHINode *CandUserRecurPhi, + RecurKind CandUserRecurKind, Instruction *ExactFP = nullptr) + : IsRecurrence(IsRecur), PatternLastInst(I), RecKind(RecurKind::None), + CandUserRecurPhi(CandUserRecurPhi), + CandUserRecurKind(CandUserRecurKind), ExactFPMathInst(ExactFP) {} + bool isRecurrence() const { return IsRecurrence; } bool needsExactFPMath() const { return ExactFPMathInst != nullptr; } @@ -107,6 +117,14 @@ Instruction *getPatternInst() const { return PatternLastInst; } + PHINode *getCandUserRecurPhi() const { return CandUserRecurPhi; } + + RecurKind getCandUserRecurKind() const { return CandUserRecurKind; } + + bool isCandidateUser() const { + return getCandUserRecurPhi() && 
getCandUserRecurKind() != RecurKind::None; + } + private: // Is this instruction a recurrence candidate. bool IsRecurrence; @@ -115,6 +133,11 @@ Instruction *PatternLastInst; // If this is a min/max pattern. RecurKind RecKind; + // This instruction may be the operation of another recurrence. + // Record potential recurrence phi. + PHINode *CandUserRecurPhi = nullptr; + // And expected recurrence kind. + RecurKind CandUserRecurKind = RecurKind::None; // Recurrence does not allow floating-point reassociation. Instruction *ExactFPMathInst; }; @@ -143,7 +166,25 @@ /// Kind. \p Prev specifies the description of an already processed select /// instruction, so its corresponding cmp can be matched to it. static InstDesc isMinMaxPattern(Instruction *I, RecurKind Kind, - const InstDesc &Prev); + const InstDesc &Prev, Loop *Loop, + PHINode *OrigPhi, ScalarEvolution *SE); + + /// Returns RecurKind describing which min/max recurrence kind the instruction + /// \p I belongs to. Returns RecurKind::None if instruction \p I does not + /// match any min/max recurrence kind. Unlike isMinMaxPattern, this function + /// does not require the cmp value to have exactly one use. + static RecurKind isMinMaxOperation(Instruction *I); + + /// Returns a struct describing if the instruction is + /// Select(ICmp(A, B), X, Y) + /// where one of (X, Y) is a loop induction variable and the other is an index + /// reduction phi. A and B must be used by a min max recurrence. The check of + /// A and B will be in AddReductionVar, not in this function. \p MinMaxPhi + /// specifies the phi of min max recurrence, and \p MinMaxKind indicates the + /// kind of min max recurrence. 
+ static InstDesc isMinMaxIdxPattern(Loop *Loop, Instruction *I, + PHINode *MinMaxPhi, RecurKind MinMaxKind, + ScalarEvolution *SE); /// Returns a struct describing whether the instruction is either a /// Select(ICmp(A, B), X, Y), or @@ -237,11 +278,27 @@ return isIntMinMaxRecurrenceKind(Kind) || isFPMinMaxRecurrenceKind(Kind); } + /// Returns true if the recurrence kind is a max kind. + static bool isMaxRecurrenceKind(RecurKind Kind) { + return Kind == RecurKind::UMax || Kind == RecurKind::SMax || + Kind == RecurKind::FMax; + } + + /// Returns true if the recurrence kind is of the form + /// select(icmp(a,b),x,y) where one of (x,y) is increasing loop induction + /// variable, and icmp(a,b) depends on a min/max recurrence. + static bool isMinMaxIdxRecurrenceKind(RecurKind Kind) { + return Kind == RecurKind::MinMaxFirstIdx || + Kind == RecurKind::MinMaxLastIdx; + } + /// Returns true if the recurrence kind is of the form - /// select(cmp(),x,y) where one of (x,y) is loop invariant. + /// select(cmp(),x,y) where one of (x,y) is loop invariant or increasing + /// loop induction. static bool isSelectCmpRecurrenceKind(RecurKind Kind) { return Kind == RecurKind::SelectICmp || Kind == RecurKind::SelectFCmp || - Kind == RecurKind::SelectIVICmp || Kind == RecurKind::SelectIVFCmp; + Kind == RecurKind::SelectIVICmp || Kind == RecurKind::SelectIVFCmp || + isMinMaxIdxRecurrenceKind(Kind); } /// Returns the type of the recurrence. This type can be narrower than the @@ -252,6 +309,37 @@ /// recurrence. const SmallPtrSet &getCastInsts() const { return CastInsts; } + /// Returns the PHI of another recurrence who uses the recurrence. + PHINode *getUserRecurPhi() const { return UserRecurPhi; } + + /// Set the recurrence kind. + void setRecurKind(RecurKind K) { + assert(K != RecurKind::None && "Unexpected recurrence kind."); + Kind = K; + } + + /// Set the min/max recurrence that the recurrence depends on. 
+ void setDependMinMaxRecurDes(RecurrenceDescriptor *MMRD) { + assert(isMinMaxRecurrenceKind(MMRD->getRecurrenceKind()) && + "DependMinMaxRecDes must be a min/max recurrence."); + DependMinMaxRecDes = MMRD; + } + + /// Returns the min/max recurrence that is depended by the recurrence. + RecurrenceDescriptor *getDependMinMaxRecDes() const { + return DependMinMaxRecDes; + } + + /// Returns true if the recurrence is used by another. + bool hasUserRecurrence() const { + return UserRecurPhi && UserRecurKind != RecurKind::None; + } + + /// Converts \p UserRedDes to the correct recurrence kind, and complete the + /// recurrence descriptor. Returns true if successful, otherwise returns + /// false. + bool fixUserRecurrence(RecurrenceDescriptor &UserRedDes); + /// Returns the minimum width used by the recurrence in bits. unsigned getMinWidthCastToRecurrenceTypeInBits() const { return MinWidthCastToRecurrenceType; @@ -304,6 +392,12 @@ SmallPtrSet CastInsts; // The minimum width used by the recurrence. unsigned MinWidthCastToRecurrenceType; + // The PHI of another potential recurrence who uses the recurrence. + PHINode *UserRecurPhi = nullptr; + // The kind of another potential recurrence who uses the recurrence. + RecurKind UserRecurKind = RecurKind::None; + // The min/max recurrence that is depended by the recurrence. + RecurrenceDescriptor *DependMinMaxRecDes = nullptr; }; /// A struct for saving information about induction variables. diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -363,7 +363,7 @@ /// and \p Right. Any lane value in \p Left that matches 2) will be merged into /// \p Right. 
Value *createSelectCmpOp(IRBuilderBase &Builder, Value *StartVal, RecurKind RK, - Value *Left, Value *Right); + Value *Left, Value *Right, Value *SrcCmp = nullptr); /// Returns a Min/Max operation in select-cmp form corresponding to /// MinMaxRecurrenceKind. @@ -405,14 +405,24 @@ const RecurrenceDescriptor &Desc, PHINode *OrigPhi); +/// Create a target reduction of the given vector \p Src for a reduction of the +/// kind RecurKind::MinMaxLastIdx or RecurKind::MinMaxFirstIdx. The reduction +/// operation is described by \p Desc. \p SrcMask is a mask generated by min/max +/// reduction, used to restrict the range of selectable \p Src for target +/// reduction. +Value *createMMISelectCmpTargetReduction(IRBuilderBase &Builder, + const TargetTransformInfo *TTI, + Value *Src, + const RecurrenceDescriptor &Desc, + PHINode *OrigPhi, Value *SrcMask); + /// Create a target reduction of the given vector \p Src for a reduction of the /// kind conforms to RecurrenceDescriptor::isSelectCmpPattern. The reduction /// operation is described by \p Desc. -Value *createSelectCmpTargetReduction(IRBuilderBase &B, - const TargetTransformInfo *TTI, - Value *Src, - const RecurrenceDescriptor &Desc, - PHINode *OrigPhi); +Value * +createSelectCmpTargetReduction(IRBuilderBase &B, const TargetTransformInfo *TTI, + Value *Src, const RecurrenceDescriptor &Desc, + PHINode *OrigPhi, Value *SrcMask = nullptr); /// Create a generic target reduction using a recurrence descriptor \p Desc /// The target is queried to determine if intrinsics or shuffle sequences are @@ -420,7 +430,8 @@ /// Fast-math-flags are propagated using the RecurrenceDescriptor. Value *createTargetReduction(IRBuilderBase &B, const TargetTransformInfo *TTI, const RecurrenceDescriptor &Desc, Value *Src, - PHINode *OrigPhi = nullptr); + PHINode *OrigPhi = nullptr, + Value *SrcMask = nullptr); /// Create an ordered reduction intrinsic using the given recurrence /// descriptor \p Desc. 
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -56,6 +56,8 @@ case RecurKind::SelectFCmp: case RecurKind::SelectIVICmp: case RecurKind::SelectIVFCmp: + case RecurKind::MinMaxFirstIdx: + case RecurKind::MinMaxLastIdx: return true; } return false; @@ -65,6 +67,21 @@ return (Kind != RecurKind::None) && !isIntegerRecurrenceKind(Kind); } +bool RecurrenceDescriptor::fixUserRecurrence(RecurrenceDescriptor &UserRedDes) { + RecurKind UserCurrKind = UserRedDes.getRecurrenceKind(); + assert(UserCurrKind != RecurKind::None && "Unexpected recurrence kind."); + + if (isMinMaxRecurrenceKind(Kind)) + if (UserCurrKind == RecurKind::SelectIVICmp || + UserCurrKind == RecurKind::SelectIVFCmp) { + UserRedDes.setRecurKind(UserRecurKind); + UserRedDes.setDependMinMaxRecurDes(this); + return true; + } + + return false; +} + /// Determines if Phi may have been type-promoted. If Phi has a single user /// that ANDs the Phi with a type mask, return the user. RT is updated to /// account for the narrower bit width represented by the mask, and the AND @@ -249,6 +266,18 @@ // must include the original PHI. bool FoundStartPHI = false; + // UserRecurPHI refers to the starting PHI of another recurrence that may use + // this reduction operation. It is used to recognize the min/max with index + // pattern. + // TODO: So far only one user is allowed, but ideally, multiple user + // recurrences should be supported. + PHINode *UserRecurPHI = nullptr; + // UserRecurKind refers to the expected kind of user recurrence. + RecurKind UserRecurKind = RecurKind::None; + // UserRecurInstr refers to the ExitInstruction of a user recurrence. 
+ // FIXME: Should rename to UserRecurExit + Instruction *UserRecurInstr = nullptr; + // To recognize min/max patterns formed by a icmp select sequence, we store // the number of instruction we saw from the recognized min/max pattern, // to make sure we only see exactly the two instructions. @@ -381,8 +410,34 @@ ExactFPMathInst = ExactFPMathInst == nullptr ? ReduxDesc.getExactFPMathInst() : ExactFPMathInst; - if (!ReduxDesc.isRecurrence()) - return false; + if (!ReduxDesc.isRecurrence()) { + if (!ReduxDesc.isCandidateUser()) + return false; + + // TODO: Only allow one user recurrence now. + if (UserRecurPHI) + return false; + + UserRecurPHI = ReduxDesc.getCandUserRecurPhi(); + UserRecurKind = ReduxDesc.getCandUserRecurKind(); + UserRecurInstr = Cur; + // TODO: Call AddReductionVar here? + + // Fix NumCmpSelectPatternInst + // When searching min/max with index pattern, the cmp belonging to index + // reduction will be mistaken for the cmp belonging to min/max + // reduction. This will cause the min/max reduction to be unrecognizable + // due to the number exception of NumCmpSelectPatternInst. + // FIXME: There may be a better way to handle NumCmpSelectPatternInst + // issue. + if (match(UserRecurInstr, + m_Select(m_OneUse(m_Cmp()), m_Value(), m_Value()))) + --NumCmpSelectPatternInst; + + // Stop visiting the users of current instruction if it contains user + // recurrence. + continue; + } // FIXME: FMF is allowed on phi, but propagation is not handled correctly. if (isa(ReduxDesc.getPatternInst()) && !IsAPhi) { FastMathFlags CurFMF = ReduxDesc.getPatternInst()->getFastMathFlags(); @@ -494,7 +549,8 @@ (!isConditionalRdxPattern(Kind, UI).isRecurrence() && !isSelectCmpPattern(TheLoop, Phi, UI, IgnoredVal, SE) .isRecurrence() && - !isMinMaxPattern(UI, Kind, IgnoredVal).isRecurrence()))) + !isMinMaxPattern(UI, Kind, IgnoredVal, TheLoop, Phi, SE) + .isRecurrence()))) return false; // Remember that we completed the cycle. 
@@ -544,6 +600,31 @@ if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction) return false; + // Check for the min/max with index pattern: the operands used by the cmp + // instruction of UserRecurInstr must be the same as the operands used by the + // min/max recurrence. + if (isMinMaxRecurrenceKind(Kind) && UserRecurPHI) { + auto *UserRecurSI = cast(UserRecurInstr); + Value *UserRecurCond = UserRecurSI->getCondition(); + if (auto *MinMaxSI = dyn_cast(ExitInstruction)) { + // TODO: As long as the operands are the same, it is not limited to the + // same cmp instruction. + if (UserRecurCond != MinMaxSI->getCondition()) + return false; + } else if (auto *MinMaxII = dyn_cast(ExitInstruction)) { + // Match smax(%maxphi, %0), icmp(pred, %maxphi, %0) or + // smax(%maxphi, %0), icmp(swapped_pred, %0, %maxphi) + Value *MinMaxOp0 = MinMaxII->getOperand(0); + Value *MinMaxOp1 = MinMaxII->getOperand(1); + CmpInst::Predicate Pred; + if (!match(UserRecurCond, + m_Cmp(Pred, m_Specific(MinMaxOp0), m_Specific(MinMaxOp1))) && + !match(UserRecurCond, + m_Cmp(Pred, m_Specific(MinMaxOp1), m_Specific(MinMaxOp0)))) + return false; + } + } + const bool IsOrdered = checkOrderedReduction(Kind, ExactFPMathInst, ExitInstruction, Phi); @@ -604,7 +685,8 @@ // Save the description of this reduction variable. 
RecurrenceDescriptor RD(RdxStart, ExitInstruction, IntermediateStore, Kind, FMF, ExactFPMathInst, RecurrenceType, IsSigned, - IsOrdered, CastInsts, MinWidthCastToRecurrenceType); + IsOrdered, CastInsts, MinWidthCastToRecurrenceType, + UserRecurPHI, UserRecurKind); RedDes = RD; return true; @@ -693,7 +775,8 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isMinMaxPattern(Instruction *I, RecurKind Kind, - const InstDesc &Prev) { + const InstDesc &Prev, Loop *Loop, + PHINode *OrigPhi, ScalarEvolution *SE) { assert((isa(I) || isa(I) || isa(I)) && "Expected a cmp or select or call instruction"); if (!isMinMaxRecurrenceKind(Kind)) @@ -713,29 +796,133 @@ m_Value()))) return InstDesc(false, I); + RecurKind MMRK = isMinMaxOperation(I); + if (MMRK != RecurKind::None) + return InstDesc(Kind == MMRK, I); + + if (isa(I)) + return isMinMaxIdxPattern(Loop, I, OrigPhi, Kind, SE); + + return InstDesc(false, I); +} + +RecurKind RecurrenceDescriptor::isMinMaxOperation(Instruction *I) { // Look for a min/max pattern. 
if (match(I, m_UMin(m_Value(), m_Value()))) - return InstDesc(Kind == RecurKind::UMin, I); + return RecurKind::UMin; if (match(I, m_UMax(m_Value(), m_Value()))) - return InstDesc(Kind == RecurKind::UMax, I); + return RecurKind::UMax; if (match(I, m_SMax(m_Value(), m_Value()))) - return InstDesc(Kind == RecurKind::SMax, I); + return RecurKind::SMax; if (match(I, m_SMin(m_Value(), m_Value()))) - return InstDesc(Kind == RecurKind::SMin, I); + return RecurKind::SMin; if (match(I, m_OrdFMin(m_Value(), m_Value()))) - return InstDesc(Kind == RecurKind::FMin, I); + return RecurKind::FMin; if (match(I, m_OrdFMax(m_Value(), m_Value()))) - return InstDesc(Kind == RecurKind::FMax, I); + return RecurKind::FMax; if (match(I, m_UnordFMin(m_Value(), m_Value()))) - return InstDesc(Kind == RecurKind::FMin, I); + return RecurKind::FMin; if (match(I, m_UnordFMax(m_Value(), m_Value()))) - return InstDesc(Kind == RecurKind::FMax, I); + return RecurKind::FMax; if (match(I, m_Intrinsic(m_Value(), m_Value()))) - return InstDesc(Kind == RecurKind::FMin, I); + return RecurKind::FMin; if (match(I, m_Intrinsic(m_Value(), m_Value()))) - return InstDesc(Kind == RecurKind::FMax, I); + return RecurKind::FMax; - return InstDesc(false, I); + return RecurKind::None; +} + +RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isMinMaxIdxPattern( + Loop *Loop, Instruction *I, PHINode *MinMaxPhi, RecurKind MinMaxKind, + ScalarEvolution *SE) { + assert(isa(I) && "Expected a select instruction"); + // TODO: FP MinMax + if (!isIntMinMaxRecurrenceKind(MinMaxKind)) + return InstDesc(false, I); + + // Requires SCEV to check the index part + if (!SE) { + LLVM_DEBUG(dbgs() << "MinMaxIdx patterns are not recognized without " + << "Scalar Evolution Analysis\n"); + return InstDesc(false, I); + } + + // Check the index select + auto *SI = cast(I); + Value *Cond = SI->getCondition(); + CmpInst::Predicate Pred; + CmpInst::Predicate NormPred; + + // %cmp = icmp pred, %mmphi, %0 + // %select = select %cmp, %update, 
%idxphi + // Check if cmp used min/max phi + if (match(Cond, m_Cmp(Pred, m_Specific(MinMaxPhi), m_Value()))) + NormPred = Pred; + else if (match(Cond, m_Cmp(Pred, m_Value(), m_Specific(MinMaxPhi)))) + // Normalize the predicate, and get which side the select should update idx + // TODO: Need to consider commutable. + NormPred = CmpInst::getSwappedPredicate(Pred); + else + return InstDesc(false, I); + + bool UpdateSide; + RecurKind ExpectedIdxRK; + switch (NormPred) { + case CmpInst::ICMP_SLT: + case CmpInst::ICMP_ULT: + // %mmphi < %0 + UpdateSide = isMaxRecurrenceKind(MinMaxKind); + ExpectedIdxRK = isMaxRecurrenceKind(MinMaxKind) ? RecurKind::MinMaxFirstIdx + : RecurKind::MinMaxLastIdx; + break; + case CmpInst::ICMP_SLE: + case CmpInst::ICMP_ULE: + // %mmphi <= %0 + UpdateSide = isMaxRecurrenceKind(MinMaxKind); + ExpectedIdxRK = isMaxRecurrenceKind(MinMaxKind) ? RecurKind::MinMaxLastIdx + : RecurKind::MinMaxFirstIdx; + break; + case CmpInst::ICMP_SGT: + case CmpInst::ICMP_UGT: + // %mmphi > %0 + UpdateSide = !isMaxRecurrenceKind(MinMaxKind); + ExpectedIdxRK = isMaxRecurrenceKind(MinMaxKind) ? RecurKind::MinMaxLastIdx + : RecurKind::MinMaxFirstIdx; + break; + case CmpInst::ICMP_SGE: + case CmpInst::ICMP_UGE: + // %mmphi >= %0 + UpdateSide = !isMaxRecurrenceKind(MinMaxKind); + ExpectedIdxRK = isMaxRecurrenceKind(MinMaxKind) ? RecurKind::MinMaxFirstIdx + : RecurKind::MinMaxLastIdx; + break; + default: + return InstDesc(false, I); + } + + // Get the reduction phi of index select + Value *IdxUpdateV = UpdateSide ? SI->getTrueValue() : SI->getFalseValue(); + Value *IdxReduxV = UpdateSide ? SI->getFalseValue() : SI->getTrueValue(); + // Handle the operand of index select may have been casted. 
+ if (auto *Cast = dyn_cast(IdxUpdateV)) + IdxUpdateV = Cast->getOperand(0); + + auto *IdxUpdatePhi = dyn_cast(IdxUpdateV); + auto *IdxReduxPhi = dyn_cast(IdxReduxV); + if (!IdxUpdatePhi || !IdxReduxPhi) + return InstDesc(false, I); + + // Check that the update side is a loop induction variable + InductionDescriptor ID; + if (!InductionDescriptor::isInductionPHI(IdxUpdatePhi, Loop, SE, ID)) + return InstDesc(false, I); + + // The reduction phi of index select and reduction phi of min/max must not + // be the same + if (IdxReduxPhi == MinMaxPhi) + return InstDesc(false, I); + + return InstDesc(false, I, IdxReduxPhi, ExpectedIdxRK); } /// Returns true if the select instruction has users in the compare-and-add @@ -833,7 +1020,7 @@ (isa(I) && I->hasNoNaNs() && I->hasNoSignedZeros())) && isFPMinMaxRecurrenceKind(Kind))) - return isMinMaxPattern(I, Kind, Prev); + return isMinMaxPattern(I, Kind, Prev, L, OrigPhi, SE); else if (isFMulAddIntrinsic(I)) return InstDesc(Kind == RecurKind::FMulAdd, I, I->hasAllowReassoc() ? nullptr : I); @@ -1096,6 +1283,8 @@ break; case RecurKind::SelectIVICmp: case RecurKind::SelectIVFCmp: + case RecurKind::MinMaxFirstIdx: + case RecurKind::MinMaxLastIdx: // FIXME: SMax or UMax, I'm not sure which one is correct. 
return getRecurrenceIdentity(RecurKind::SMax, Tp, FMF); default: @@ -1126,6 +1315,9 @@ case RecurKind::UMin: case RecurKind::SelectICmp: case RecurKind::SelectIVICmp: + // TODO: maybe new FMinMaxFirstIdx/ FMinMaxLastIdx + case RecurKind::MinMaxFirstIdx: + case RecurKind::MinMaxLastIdx: return Instruction::ICmp; case RecurKind::FMax: case RecurKind::FMin: diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -932,7 +932,8 @@ } Value *llvm::createSelectCmpOp(IRBuilderBase &Builder, Value *StartVal, - RecurKind RK, Value *Left, Value *Right) { + RecurKind RK, Value *Left, Value *Right, + Value *SrcCmp) { switch (RK) { case RecurKind::SelectICmp: case RecurKind::SelectFCmp: { @@ -946,6 +947,19 @@ case RecurKind::SelectIVFCmp: // TODO: SMax or UMax? return createMinMaxOp(Builder, RecurKind::SMax, Left, Right); + case RecurKind::MinMaxFirstIdx: { + assert(isa_and_nonnull(SrcCmp) && + "SrcCmp should not be nullptr when MinMaxFirstIdx recurrence"); + auto *SrcCI = cast(SrcCmp); + CmpInst::Predicate Pred = SrcCI->getNonStrictPredicate(); + Value *Cmp = Builder.CreateCmp(Pred, SrcCI->getOperand(0), + SrcCI->getOperand(1), "rdx.select.cmp"); + return Builder.CreateSelect(Cmp, Left, Right, "rdx.select"); + } + case RecurKind::MinMaxLastIdx: + assert(isa_and_nonnull(SrcCmp) && + "SrcCmp should not be nullptr when MinMaxLastIdx recurrence"); + return Builder.CreateSelect(SrcCmp, Left, Right, "rdx.select"); default: llvm_unreachable("Unknown SelectCmp recurrence kind"); } @@ -1075,11 +1089,35 @@ return Builder.CreateSelect(Cmp, NewVal, InitVal, "rdx.select"); } +Value *llvm::createMMISelectCmpTargetReduction( + IRBuilderBase &Builder, const TargetTransformInfo *TTI, Value *Src, + const RecurrenceDescriptor &Desc, PHINode *OrigPhi, Value *SrcMask) { + assert(RecurrenceDescriptor::isMinMaxIdxRecurrenceKind( + Desc.getRecurrenceKind()) && + 
"Unexpected reduction kind"); + RecurKind Kind = Desc.getRecurrenceKind(); + // FIXME: UMax/SMax or UMin/UMax? + RecurKind RdxExtractK = + Kind == RecurKind::MinMaxFirstIdx ? RecurKind::SMin : RecurKind::SMax; + + assert(SrcMask && "MinMaxIdx recurrence requests mask"); + // TODO: If vp reduction intrinsic is supported, there is no need to generate + // additional select here. + auto *SrcVecEltTy = cast(Src->getType())->getElementType(); + Value *RdxOpIden = Desc.getRecurrenceIdentity(RdxExtractK, SrcVecEltTy, + Desc.getFastMathFlags()); + ElementCount EC = cast(Src->getType())->getElementCount(); + RdxOpIden = Builder.CreateVectorSplat(EC, RdxOpIden); + Value *NewVal = Builder.CreateSelect(SrcMask, Src, RdxOpIden, "mask.select"); + + return createSimpleTargetReduction(Builder, TTI, NewVal, RdxExtractK); +} + Value *llvm::createSelectCmpTargetReduction(IRBuilderBase &Builder, const TargetTransformInfo *TTI, Value *Src, const RecurrenceDescriptor &Desc, - PHINode *OrigPhi) { + PHINode *OrigPhi, Value *SrcMask) { assert(RecurrenceDescriptor::isSelectCmpRecurrenceKind( Desc.getRecurrenceKind()) && "Unexpected reduction kind"); @@ -1094,6 +1132,10 @@ // FIXME: SMax or UMax? // TODO: Decreasing induction need fix here return Builder.CreateIntMaxReduce(Src, true); + case RecurKind::MinMaxFirstIdx: + case RecurKind::MinMaxLastIdx: + return createMMISelectCmpTargetReduction(Builder, TTI, Src, Desc, OrigPhi, + SrcMask); default: llvm_unreachable("Unknown SelectCmp recurrence kind"); } @@ -1140,7 +1182,7 @@ Value *llvm::createTargetReduction(IRBuilderBase &B, const TargetTransformInfo *TTI, const RecurrenceDescriptor &Desc, Value *Src, - PHINode *OrigPhi) { + PHINode *OrigPhi, Value *SrcMask) { // TODO: Support in-order reductions based on the recurrence descriptor. // All ops in the reduction inherit fast-math-flags from the recurrence // descriptor. 
@@ -1149,7 +1191,7 @@ RecurKind RK = Desc.getRecurrenceKind(); if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) - return createSelectCmpTargetReduction(B, TTI, Src, Desc, OrigPhi); + return createSelectCmpTargetReduction(B, TTI, Src, Desc, OrigPhi, SrcMask); return createSimpleTargetReduction(B, TTI, Src, RK); } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -900,6 +900,21 @@ } // next instr. } + // Second, confirm the incomplete reductions + for (auto R : Reductions) { + RecurrenceDescriptor &RedDes = Reductions.find(R.first)->second; + if (!RedDes.hasUserRecurrence()) + continue; + + PHINode *UserPhi = RedDes.getUserRecurPhi(); + if (!isReductionVariable(UserPhi)) + return false; + + RecurrenceDescriptor &UserRedDes = Reductions.find(UserPhi)->second; + if (!RedDes.fixUserRecurrence(UserRedDes)) + return false; + } + if (!PrimaryInduction) { if (Inductions.empty()) { reportVectorizationFailure("Did not find one integer induction var", diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -557,6 +557,11 @@ // generated by fixReduction. PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc); + // Returns the recurrence mask (mask.cmp) for a recurrence as generated by + // fixReduction. + std::pair + getDependRecurrenceMask(const RecurrenceDescriptor &RdxDesc); + /// Create a new phi node for the induction variable \p OrigPhi to resume /// iteration count in the scalar epilogue, from where the vectorized loop /// left off. \p Step is the SCEV-expanded induction step to use. 
In cases @@ -773,6 +778,12 @@ // correct start value of reduction PHIs when vectorizing the epilogue. SmallMapVector ReductionResumeValues; + + // Holds the masks for recurrences in the loops, to be used for reduction when + // there is a reduction that depends on the recurrence. + SmallMapVector, + 4> + ReductionDependMasks; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -1144,6 +1155,15 @@ return It->second; } +std::pair +InnerLoopVectorizer::getDependRecurrenceMask( + const RecurrenceDescriptor &RdxDesc) { + auto It = ReductionDependMasks.find(&RdxDesc); + assert(It != ReductionDependMasks.end() && + "Expected to find a dependence mask for the recurrence."); + return It->second; +} + namespace llvm { // Loop vectorization cost-model hints how the scalar epilogue loop should be @@ -3731,10 +3751,24 @@ // the incoming edges. VPBasicBlock *Header = State.Plan->getVectorLoopRegion()->getEntryBasicBlock(); - for (VPRecipeBase &R : Header->phis()) { - if (auto *ReductionPhi = dyn_cast(&R)) + // FIXME: Maybe I should not choose std::queue... + std::queue Worklist; + for (VPRecipeBase &R : Header->phis()) + Worklist.push(&R); + + while (!Worklist.empty()) { + VPRecipeBase &R = *(Worklist.front()); + Worklist.pop(); + if (auto *ReductionPhi = dyn_cast(&R)) { + const RecurrenceDescriptor &RecDesc = + ReductionPhi->getRecurrenceDescriptor(); + RecurrenceDescriptor *DependRecDesc = RecDesc.getDependMinMaxRecDes(); + if (DependRecDesc && !ReductionDependMasks.count(DependRecDesc)) { + Worklist.push(&R); + continue; + } fixReduction(ReductionPhi, State); - else if (auto *FOR = dyn_cast(&R)) + } else if (auto *FOR = dyn_cast(&R)) fixFixedOrderRecurrence(FOR, State); } } @@ -3958,6 +3992,19 @@ Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); unsigned Op = RecurrenceDescriptor::getOpcode(RK); + // Get the reduction mask if the reduction depends on another one. 
+ RecurrenceDescriptor *DependDesc = RdxDesc.getDependMinMaxRecDes(); + Value *DependRdxMask = nullptr; + VectorParts DependPartMasks; + if (DependDesc) { + Builder.SetInsertPoint(&*LoopMiddleBlock->getTerminator()); + std::tie(DependRdxMask, DependPartMasks) = + getDependRecurrenceMask(*DependDesc); + } + + Value *NewRdxMask = nullptr; + VectorParts NewPartMasks(UF); + // The middle block terminator has already been assigned a DebugLoc here (the // OrigLoop's single latch terminator). We want the whole middle block to // appear to execute on this line because: (a) it is all compiler generated, @@ -3974,34 +4021,63 @@ Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); for (unsigned Part = 1; Part < UF; ++Part) { Value *RdxPart = State.get(LoopExitInstDef, Part); + Value *PartMask = DependDesc ? DependPartMasks[Part] : nullptr; if (Op != Instruction::ICmp && Op != Instruction::FCmp) { ReducedPartRdx = Builder.CreateBinOp( (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, - ReducedPartRdx, RdxPart); - else - ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); + ReducedPartRdx, RdxPart, PartMask); + else { + if (RdxDesc.hasUserRecurrence()) { + ReducedPartRdx = + createMinMaxSelectCmpOp(Builder, RK, ReducedPartRdx, RdxPart); + // Keep the part mask on demand. + Value *Cond = cast(ReducedPartRdx)->getCondition(); + NewPartMasks[Part] = Cond; + } else { + ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); + } + } } } // Create the reduction after the loop. Note that inloop reductions create the // target reduction in the loop using a Reduction recipe. 
if (VF.isVector() && !PhiR->isInLoop()) { - ReducedPartRdx = - createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); + Value *ReducedPart = ReducedPartRdx; + ReducedPartRdx = createTargetReduction( + Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi, DependRdxMask); // If the reduction can be performed in a smaller type, we need to extend // the reduction to the wider type before we branch to the original loop. if (PhiTy != RdxDesc.getRecurrenceType()) ReducedPartRdx = RdxDesc.isSigned() ? Builder.CreateSExt(ReducedPartRdx, PhiTy) : Builder.CreateZExt(ReducedPartRdx, PhiTy); + + // Create depend recurrence mask on demand. + if (RdxDesc.hasUserRecurrence()) { + ElementCount EC = + cast(ReducedPart->getType())->getElementCount(); + Value *RdxSplat = Builder.CreateVectorSplat(EC, ReducedPartRdx); + // FIXME: Not sure use FCMP_OEQ is right or not. + CmpInst::Predicate MaskPred = + (ReducedPartRdx->getType()->isFloatingPointTy()) ? CmpInst::FCMP_OEQ + : CmpInst::ICMP_EQ; + NewRdxMask = + Builder.CreateCmp(MaskPred, RdxSplat, ReducedPart, "mask.cmp"); + } } - if (RK == RecurKind::SelectIVICmp || RK == RecurKind::SelectIVFCmp) + if (RK == RecurKind::SelectIVICmp || RK == RecurKind::SelectIVFCmp || + RecurrenceDescriptor::isMinMaxIdxRecurrenceKind(RK)) ReducedPartRdx = createSentinelValueHandling(Builder, TTI, RdxDesc, ReducedPartRdx); + // Set the recurrence mask for this reduction on demand. 
+ if (RdxDesc.hasUserRecurrence()) + ReductionDependMasks.insert({&RdxDesc, {NewRdxMask, NewPartMasks}}); + PHINode *ResumePhi = dyn_cast(PhiR->getStartValue()->getUnderlyingValue()); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1262,7 +1262,8 @@ StartV = Iden = Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident"); } - } else if (RK == RecurKind::SelectIVICmp || RK == RecurKind::SelectIVFCmp) { + } else if (RK == RecurKind::SelectIVICmp || RK == RecurKind::SelectIVFCmp || + RecurrenceDescriptor::isMinMaxIdxRecurrenceKind(RK)) { StartV = Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(), RdxDesc.getFastMathFlags()); if (!ScalarPHI) { diff --git a/llvm/test/Transforms/LoopVectorize/select-min-index.ll b/llvm/test/Transforms/LoopVectorize/select-min-index.ll --- a/llvm/test/Transforms/LoopVectorize/select-min-index.ll +++ b/llvm/test/Transforms/LoopVectorize/select-min-index.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function test_not_vectorize_select_no_min_reduction --version 2 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s --check-prefix=CHECK-VF4IC1 --check-prefix=CHECK ; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S %s | FileCheck %s --check-prefix=CHECK-VF4IC2 --check-prefix=CHECK ; RUN: opt -passes=loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -S %s | FileCheck %s --check-prefix=CHECK-VF1IC2 --check-prefix=CHECK @@ -31,8 +31,187 @@ } define i64 @test_vectorize_select_umin_idx_all_exit_inst(ptr %src, ptr %umin) { -; CHECK-LABEL: @test_vectorize_select_umin_idx_all_exit_inst( -; CHECK-NOT: vector.body: 
+; CHECK-VF4IC1-LABEL: @test_vectorize_select_umin_idx_all_exit_inst( +; CHECK-VF4IC1-NEXT: entry: +; CHECK-VF4IC1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4IC1: vector.ph: +; CHECK-VF4IC1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4IC1: vector.body: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[SRC:%.*]], i64 [[TMP0]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp ugt <4 x i64> [[VEC_PHI1]], [[WIDE_LOAD]] +; CHECK-VF4IC1-NEXT: [[TMP4]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[VEC_PHI1]], <4 x i64> [[WIDE_LOAD]]) +; CHECK-VF4IC1-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF4IC1: middle.block: +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> [[TMP4]]) +; CHECK-VF4IC1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP7]], i64 0 +; CHECK-VF4IC1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, 
<4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[MASK_CMP:%.*]] = icmp eq <4 x i64> [[DOTSPLAT]], [[TMP4]] +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, 0 +; CHECK-VF4IC1-NEXT: [[MASK_SELECT:%.*]] = select <4 x i1> [[MASK_CMP]], <4 x i64> [[TMP5]], <4 x i64> +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[MASK_SELECT]]) +; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP8]], -9223372036854775808 +; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP8]], i64 0 +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4IC1: scalar.ph: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: br label [[LOOP:%.*]] +; CHECK-VF4IC1: loop: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF4IC1-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) +; CHECK-VF4IC1-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label 
[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF4IC1: exit: +; CHECK-VF4IC1-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], [[LOOP]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: [[RES_UMIN:%.*]] = phi i64 [ [[MIN_VAL_NEXT]], [[LOOP]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: store i64 [[RES_UMIN]], ptr [[UMIN:%.*]], align 4 +; CHECK-VF4IC1-NEXT: ret i64 [[RES]] +; +; CHECK-VF4IC2-LABEL: @test_vectorize_select_umin_idx_all_exit_inst( +; CHECK-VF4IC2-NEXT: entry: +; CHECK-VF4IC2-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4IC2: vector.ph: +; CHECK-VF4IC2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4IC2: vector.body: +; CHECK-VF4IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC2-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC2-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC2-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC2-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC2-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC2-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-VF4IC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-VF4IC2-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[SRC:%.*]], i64 [[TMP0]] +; CHECK-VF4IC2-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[TMP1]] +; CHECK-VF4IC2-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0 +; CHECK-VF4IC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 4 +; CHECK-VF4IC2-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP2]], i32 4 +; CHECK-VF4IC2-NEXT: 
[[WIDE_LOAD5:%.*]] = load <4 x i64>, ptr [[TMP5]], align 4 +; CHECK-VF4IC2-NEXT: [[TMP6:%.*]] = icmp ugt <4 x i64> [[VEC_PHI3]], [[WIDE_LOAD]] +; CHECK-VF4IC2-NEXT: [[TMP7:%.*]] = icmp ugt <4 x i64> [[VEC_PHI4]], [[WIDE_LOAD5]] +; CHECK-VF4IC2-NEXT: [[TMP8]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[VEC_PHI3]], <4 x i64> [[WIDE_LOAD]]) +; CHECK-VF4IC2-NEXT: [[TMP9]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[VEC_PHI4]], <4 x i64> [[WIDE_LOAD5]]) +; CHECK-VF4IC2-NEXT: [[TMP10]] = select <4 x i1> [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC2-NEXT: [[TMP11]] = select <4 x i1> [[TMP7]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI2]] +; CHECK-VF4IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-VF4IC2-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], +; CHECK-VF4IC2-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-VF4IC2-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF4IC2: middle.block: +; CHECK-VF4IC2-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <4 x i64> [[TMP8]], [[TMP9]] +; CHECK-VF4IC2-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i64> [[TMP8]], <4 x i64> [[TMP9]] +; CHECK-VF4IC2-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> [[RDX_MINMAX_SELECT]]) +; CHECK-VF4IC2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP13]], i64 0 +; CHECK-VF4IC2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC2-NEXT: [[MASK_CMP:%.*]] = icmp eq <4 x i64> [[DOTSPLAT]], [[RDX_MINMAX_SELECT]] +; CHECK-VF4IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, 0 +; CHECK-VF4IC2-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ule <4 x i64> [[TMP8]], [[TMP9]] +; CHECK-VF4IC2-NEXT: [[RDX_SELECT:%.*]] = select <4 x i1> [[RDX_SELECT_CMP]], <4 x i64> [[TMP10]], <4 x i64> [[TMP11]] +; CHECK-VF4IC2-NEXT: [[MASK_SELECT:%.*]] = select <4 x i1> [[MASK_CMP]], 
<4 x i64> [[RDX_SELECT]], <4 x i64> +; CHECK-VF4IC2-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[MASK_SELECT]]) +; CHECK-VF4IC2-NEXT: [[RDX_SELECT_CMP6:%.*]] = icmp ne i64 [[TMP14]], -9223372036854775808 +; CHECK-VF4IC2-NEXT: [[RDX_SELECT7:%.*]] = select i1 [[RDX_SELECT_CMP6]], i64 [[TMP14]], i64 0 +; CHECK-VF4IC2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4IC2: scalar.ph: +; CHECK-VF4IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC2-NEXT: [[BC_MERGE_RDX8:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[RDX_SELECT7]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC2-NEXT: br label [[LOOP:%.*]] +; CHECK-VF4IC2: loop: +; CHECK-VF4IC2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX8]], [[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF4IC2-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF4IC2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF4IC2-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) +; CHECK-VF4IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF4IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-VF4IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF4IC2: exit: +; CHECK-VF4IC2-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], [[LOOP]] ], [ [[RDX_SELECT7]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC2-NEXT: [[RES_UMIN:%.*]] = phi i64 [ 
[[MIN_VAL_NEXT]], [[LOOP]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC2-NEXT: store i64 [[RES_UMIN]], ptr [[UMIN:%.*]], align 4 +; CHECK-VF4IC2-NEXT: ret i64 [[RES]] +; +; CHECK-VF1IC2-LABEL: @test_vectorize_select_umin_idx_all_exit_inst( +; CHECK-VF1IC2-NEXT: entry: +; CHECK-VF1IC2-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF1IC2: vector.ph: +; CHECK-VF1IC2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF1IC2: vector.body: +; CHECK-VF1IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC2-NEXT: [[VEC_PHI:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC2-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC2-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC2-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1IC2-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[SRC:%.*]], i64 [[TMP0]] +; CHECK-VF1IC2-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[TMP1]] +; CHECK-VF1IC2-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP2]], align 4 +; CHECK-VF1IC2-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 4 +; CHECK-VF1IC2-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[VEC_PHI2]], [[TMP4]] +; CHECK-VF1IC2-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[VEC_PHI3]], [[TMP5]] +; CHECK-VF1IC2-NEXT: [[TMP8]] = tail call i64 @llvm.umin.i64(i64 [[VEC_PHI2]], i64 [[TMP4]]) +; CHECK-VF1IC2-NEXT: [[TMP9]] = tail call i64 @llvm.umin.i64(i64 [[VEC_PHI3]], i64 [[TMP5]]) +; CHECK-VF1IC2-NEXT: [[TMP10]] = select i1 [[TMP6]], i64 [[TMP0]], i64 [[VEC_PHI]] +; CHECK-VF1IC2-NEXT: [[TMP11]] = select i1 [[TMP7]], i64 [[TMP1]], i64 [[VEC_PHI1]] +; CHECK-VF1IC2-NEXT: [[INDEX_NEXT]] = 
add nuw i64 [[INDEX]], 2 +; CHECK-VF1IC2-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-VF1IC2-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF1IC2: middle.block: +; CHECK-VF1IC2-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult i64 [[TMP8]], [[TMP9]] +; CHECK-VF1IC2-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select i1 [[RDX_MINMAX_CMP]], i64 [[TMP8]], i64 [[TMP9]] +; CHECK-VF1IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, 0 +; CHECK-VF1IC2-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ule i64 [[TMP8]], [[TMP9]] +; CHECK-VF1IC2-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP10]], i64 [[TMP11]] +; CHECK-VF1IC2-NEXT: [[RDX_SELECT_CMP4:%.*]] = icmp ne i64 [[RDX_SELECT]], -9223372036854775808 +; CHECK-VF1IC2-NEXT: [[RDX_SELECT5:%.*]] = select i1 [[RDX_SELECT_CMP4]], i64 [[RDX_SELECT]], i64 0 +; CHECK-VF1IC2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF1IC2: scalar.ph: +; CHECK-VF1IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF1IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC2-NEXT: [[BC_MERGE_RDX6:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[RDX_SELECT5]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC2-NEXT: br label [[LOOP:%.*]] +; CHECK-VF1IC2: loop: +; CHECK-VF1IC2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX6]], [[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], [[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF1IC2-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF1IC2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF1IC2-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 
[[MIN_VAL]], i64 [[L]]) +; CHECK-VF1IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF1IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-VF1IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF1IC2: exit: +; CHECK-VF1IC2-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], [[LOOP]] ], [ [[RDX_SELECT5]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC2-NEXT: [[RES_UMIN:%.*]] = phi i64 [ [[MIN_VAL_NEXT]], [[LOOP]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC2-NEXT: store i64 [[RES_UMIN]], ptr [[UMIN:%.*]], align 4 +; CHECK-VF1IC2-NEXT: ret i64 [[RES]] ; entry: br label %loop @@ -130,7 +309,7 @@ ; CHECK-VF4IC1-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] ; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-VF4IC1: exit: ; CHECK-VF4IC1-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], [[LOOP]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] ; CHECK-VF4IC1-NEXT: ret i64 [[RES]] @@ -150,7 +329,7 @@ ; CHECK-VF4IC2-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], ; CHECK-VF4IC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-VF4IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 -; CHECK-VF4IC2-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[TMP0]] +; CHECK-VF4IC2-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[SRC:%.*]], i64 [[TMP0]] ; CHECK-VF4IC2-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[TMP1]] ; CHECK-VF4IC2-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0 ; CHECK-VF4IC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 4 @@ -167,7 +346,7 @@ ; 
CHECK-VF4IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-VF4IC2-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], ; CHECK-VF4IC2-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-VF4IC2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF4IC2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-VF4IC2: middle.block: ; CHECK-VF4IC2-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP12]], <4 x i64> [[TMP13]]) ; CHECK-VF4IC2-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX]]) @@ -193,7 +372,7 @@ ; CHECK-VF4IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] ; CHECK-VF4IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-VF4IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 -; CHECK-VF4IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF4IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-VF4IC2: exit: ; CHECK-VF4IC2-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], [[LOOP]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] ; CHECK-VF4IC2-NEXT: ret i64 [[RES]] @@ -211,7 +390,7 @@ ; CHECK-VF1IC2-NEXT: [[VECTOR_RECUR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-VF1IC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-VF1IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-VF1IC2-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[TMP0]] +; CHECK-VF1IC2-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[SRC:%.*]], i64 [[TMP0]] ; CHECK-VF1IC2-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[TMP1]] ; CHECK-VF1IC2-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP2]], align 4 ; CHECK-VF1IC2-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 4 @@ -223,7 +402,7 @@ ; CHECK-VF1IC2-NEXT: [[TMP11]] = select i1 
[[TMP9]], i64 [[TMP1]], i64 [[VEC_PHI1]] ; CHECK-VF1IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-VF1IC2-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-VF1IC2-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF1IC2-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-VF1IC2: middle.block: ; CHECK-VF1IC2-NEXT: [[RDX_MINMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[TMP10]], i64 [[TMP11]]) ; CHECK-VF1IC2-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[RDX_MINMAX]], -9223372036854775808 @@ -247,7 +426,7 @@ ; CHECK-VF1IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] ; CHECK-VF1IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-VF1IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 -; CHECK-VF1IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF1IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-VF1IC2: exit: ; CHECK-VF1IC2-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], [[LOOP]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] ; CHECK-VF1IC2-NEXT: ret i64 [[RES]] diff --git a/llvm/test/Transforms/LoopVectorize/smax-idx.ll b/llvm/test/Transforms/LoopVectorize/smax-idx.ll --- a/llvm/test/Transforms/LoopVectorize/smax-idx.ll +++ b/llvm/test/Transforms/LoopVectorize/smax-idx.ll @@ -1,10 +1,255 @@ -; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=CHECK -; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=4 -S < %s | FileCheck %s --check-prefix=CHECK -; RUN: opt -passes=loop-vectorize -force-vector-width=1 -force-vector-interleave=4 -S < %s | FileCheck %s --check-prefix=CHECK +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=loop-vectorize 
-force-vector-width=4 -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC1 --check-prefix=CHECK +; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC4 --check-prefix=CHECK +; RUN: opt -passes=loop-vectorize -force-vector-width=1 -force-vector-interleave=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF1IC4 --check-prefix=CHECK define i64 @smax_idx(ptr nocapture readonly %a, i64 %mm, i64 %ii, ptr nocapture writeonly %res_max, i64 %n) { -; CHECK-LABEL: @smax_idx( -; CHECK-NOT: vector.body: +; CHECK-VF4IC1-LABEL: @smax_idx( +; CHECK-VF4IC1-NEXT: entry: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4IC1: vector.ph: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[MM:%.*]], i64 0 +; CHECK-VF4IC1-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i64> [[MINMAX_IDENT_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4IC1: vector.body: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr 
inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP3]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI]], <4 x i64> [[WIDE_LOAD]]) +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp slt <4 x i64> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-VF4IC1-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI1]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF4IC1: middle.block: +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP3]]) +; CHECK-VF4IC1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP7]], i64 0 +; CHECK-VF4IC1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[MASK_CMP:%.*]] = icmp eq <4 x i64> [[DOTSPLAT]], [[TMP3]] +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: [[MASK_SELECT:%.*]] = select <4 x i1> [[MASK_CMP]], <4 x i64> [[TMP5]], <4 x i64> +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[MASK_SELECT]]) +; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP8]], -9223372036854775808 +; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP8]], i64 [[II:%.*]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4IC1: scalar.ph: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: 
[[BC_MERGE_RDX2:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4IC1: for.body: +; CHECK-VF4IC1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP10:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP10]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP9]]) +; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp slt i64 [[MAX_09]], [[TMP9]] +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1]], i64 [[INDVARS_IV]], i64 [[IDX_011]] +; CHECK-VF4IC1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF4IC1: exit: +; CHECK-VF4IC1-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP10]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-VF4IC1-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] +; +; CHECK-VF4IC4-LABEL: @smax_idx( +; CHECK-VF4IC4-NEXT: entry: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4IC4: vector.ph: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] 
= sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[MM:%.*]], i64 0 +; CHECK-VF4IC4-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i64> [[MINMAX_IDENT_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4IC4: vector.body: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI8:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI9:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-VF4IC4-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-VF4IC4-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = add i64 
[[INDEX]], 12 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP8]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 4 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i64>, ptr [[TMP9]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i64>, ptr [[TMP10]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 12 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i64>, ptr [[TMP11]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP12]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI]], <4 x i64> [[WIDE_LOAD]]) +; CHECK-VF4IC4-NEXT: [[TMP13]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI4]], <4 x i64> [[WIDE_LOAD11]]) +; CHECK-VF4IC4-NEXT: [[TMP14]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI5]], <4 x i64> [[WIDE_LOAD12]]) +; CHECK-VF4IC4-NEXT: [[TMP15]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI6]], <4 x i64> [[WIDE_LOAD13]]) +; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = icmp slt <4 x i64> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-VF4IC4-NEXT: [[TMP17:%.*]] = icmp slt <4 x i64> [[VEC_PHI4]], [[WIDE_LOAD11]] +; CHECK-VF4IC4-NEXT: [[TMP18:%.*]] = icmp slt <4 x i64> [[VEC_PHI5]], [[WIDE_LOAD12]] +; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = icmp slt <4 x i64> [[VEC_PHI6]], [[WIDE_LOAD13]] +; CHECK-VF4IC4-NEXT: [[TMP20]] = select <4 x i1> [[TMP16]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI7]] +; CHECK-VF4IC4-NEXT: [[TMP21]] = 
select <4 x i1> [[TMP17]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI8]] +; CHECK-VF4IC4-NEXT: [[TMP22]] = select <4 x i1> [[TMP18]], <4 x i64> [[STEP_ADD1]], <4 x i64> [[VEC_PHI9]] +; CHECK-VF4IC4-NEXT: [[TMP23]] = select <4 x i1> [[TMP19]], <4 x i64> [[STEP_ADD2]], <4 x i64> [[VEC_PHI10]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-VF4IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF4IC4: middle.block: +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i64> [[TMP12]], [[TMP13]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i64> [[TMP12]], <4 x i64> [[TMP13]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_CMP14:%.*]] = icmp sgt <4 x i64> [[RDX_MINMAX_SELECT]], [[TMP14]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_SELECT15:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP14]], <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> [[TMP14]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_CMP16:%.*]] = icmp sgt <4 x i64> [[RDX_MINMAX_SELECT15]], [[TMP15]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_SELECT17:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP16]], <4 x i64> [[RDX_MINMAX_SELECT15]], <4 x i64> [[TMP15]] +; CHECK-VF4IC4-NEXT: [[TMP25:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX_SELECT17]]) +; CHECK-VF4IC4-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP25]], i64 0 +; CHECK-VF4IC4-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[MASK_CMP:%.*]] = icmp eq <4 x i64> [[DOTSPLAT]], [[RDX_MINMAX_SELECT17]] +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp sge <4 x i64> [[TMP12]], [[TMP13]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = 
select <4 x i1> [[RDX_SELECT_CMP]], <4 x i64> [[TMP20]], <4 x i64> [[TMP21]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP18:%.*]] = icmp sge <4 x i64> [[RDX_MINMAX_SELECT]], [[TMP14]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT19:%.*]] = select <4 x i1> [[RDX_SELECT_CMP18]], <4 x i64> [[RDX_SELECT]], <4 x i64> [[TMP22]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP20:%.*]] = icmp sge <4 x i64> [[RDX_MINMAX_SELECT15]], [[TMP15]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT21:%.*]] = select <4 x i1> [[RDX_SELECT_CMP20]], <4 x i64> [[RDX_SELECT19]], <4 x i64> [[TMP23]] +; CHECK-VF4IC4-NEXT: [[MASK_SELECT:%.*]] = select <4 x i1> [[MASK_CMP]], <4 x i64> [[RDX_SELECT21]], <4 x i64> +; CHECK-VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[MASK_SELECT]]) +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP22:%.*]] = icmp ne i64 [[TMP26]], -9223372036854775808 +; CHECK-VF4IC4-NEXT: [[RDX_SELECT23:%.*]] = select i1 [[RDX_SELECT_CMP22]], i64 [[TMP26]], i64 [[II:%.*]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4IC4: scalar.ph: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX24:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT23]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4IC4: for.body: +; CHECK-VF4IC4-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP28:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX24]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4IC4-NEXT: [[TMP27:%.*]] = load i64, ptr 
[[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP28]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP27]]) +; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp slt i64 [[MAX_09]], [[TMP27]] +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1]], i64 [[INDVARS_IV]], i64 [[IDX_011]] +; CHECK-VF4IC4-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF4IC4: exit: +; CHECK-VF4IC4-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP28]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT23]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-VF4IC4-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] +; +; CHECK-VF1IC4-LABEL: @smax_idx( +; CHECK-VF1IC4-NEXT: entry: +; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF1IC4: vector.ph: +; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF1IC4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF1IC4: vector.body: +; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i64 [ [[MM:%.*]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ [[MM]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ [[MM]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ [[MM]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; 
CHECK-VF1IC4-NEXT: [[VEC_PHI4:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI5:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI6:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI7:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP4]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP12]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI]], i64 [[TMP8]]) +; CHECK-VF1IC4-NEXT: [[TMP13]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI1]], i64 [[TMP9]]) +; CHECK-VF1IC4-NEXT: [[TMP14]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI2]], i64 [[TMP10]]) +; CHECK-VF1IC4-NEXT: [[TMP15]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI3]], i64 [[TMP11]]) +; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = icmp slt i64 [[VEC_PHI]], [[TMP8]] +; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = icmp slt i64 [[VEC_PHI1]], [[TMP9]] +; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = icmp slt i64 [[VEC_PHI2]], [[TMP10]] +; CHECK-VF1IC4-NEXT: 
[[TMP19:%.*]] = icmp slt i64 [[VEC_PHI3]], [[TMP11]] +; CHECK-VF1IC4-NEXT: [[TMP20]] = select i1 [[TMP16]], i64 [[TMP0]], i64 [[VEC_PHI4]] +; CHECK-VF1IC4-NEXT: [[TMP21]] = select i1 [[TMP17]], i64 [[TMP1]], i64 [[VEC_PHI5]] +; CHECK-VF1IC4-NEXT: [[TMP22]] = select i1 [[TMP18]], i64 [[TMP2]], i64 [[VEC_PHI6]] +; CHECK-VF1IC4-NEXT: [[TMP23]] = select i1 [[TMP19]], i64 [[TMP3]], i64 [[VEC_PHI7]] +; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF1IC4: middle.block: +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt i64 [[TMP12]], [[TMP13]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select i1 [[RDX_MINMAX_CMP]], i64 [[TMP12]], i64 [[TMP13]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_CMP8:%.*]] = icmp sgt i64 [[RDX_MINMAX_SELECT]], [[TMP14]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_SELECT9:%.*]] = select i1 [[RDX_MINMAX_CMP8]], i64 [[RDX_MINMAX_SELECT]], i64 [[TMP14]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_CMP10:%.*]] = icmp sgt i64 [[RDX_MINMAX_SELECT9]], [[TMP15]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_SELECT11:%.*]] = select i1 [[RDX_MINMAX_CMP10]], i64 [[RDX_MINMAX_SELECT9]], i64 [[TMP15]] +; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp sge i64 [[TMP12]], [[TMP13]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP20]], i64 [[TMP21]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP12:%.*]] = icmp sge i64 [[RDX_MINMAX_SELECT]], [[TMP14]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT13:%.*]] = select i1 [[RDX_SELECT_CMP12]], i64 [[RDX_SELECT]], i64 [[TMP22]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP14:%.*]] = icmp sge i64 [[RDX_MINMAX_SELECT9]], [[TMP15]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT15:%.*]] = select i1 [[RDX_SELECT_CMP14]], i64 [[RDX_SELECT13]], i64 [[TMP23]] +; 
CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP16:%.*]] = icmp ne i64 [[RDX_SELECT15]], -9223372036854775808 +; CHECK-VF1IC4-NEXT: [[RDX_SELECT17:%.*]] = select i1 [[RDX_SELECT_CMP16]], i64 [[RDX_SELECT15]], i64 [[II:%.*]] +; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF1IC4: scalar.ph: +; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[RDX_MINMAX_SELECT11]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX18:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT17]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF1IC4: for.body: +; CHECK-VF1IC4-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP26:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX18]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF1IC4-NEXT: [[TMP25:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP26]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP25]]) +; CHECK-VF1IC4-NEXT: [[CMP1:%.*]] = icmp slt i64 [[MAX_09]], [[TMP25]] +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1]], i64 [[INDVARS_IV]], i64 [[IDX_011]] +; CHECK-VF1IC4-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF1IC4: exit: +; CHECK-VF1IC4-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP26]], [[FOR_BODY]] ], [ [[RDX_MINMAX_SELECT11]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: 
[[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT17]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-VF1IC4-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] ; entry: br label %for.body @@ -31,8 +276,252 @@ ; Check the different order of reduction phis. ; define i64 @smax_idx_inverted_phi(ptr nocapture readonly %a, i64 %mm, i64 %ii, ptr nocapture writeonly %res_max, i64 %n) { -; CHECK-LABEL: @smax_idx_inverted_phi( -; CHECK-NOT: vector.body: +; CHECK-VF4IC1-LABEL: @smax_idx_inverted_phi( +; CHECK-VF4IC1-NEXT: entry: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4IC1: vector.ph: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[MM:%.*]], i64 0 +; CHECK-VF4IC1-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i64> [[MINMAX_IDENT_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4IC1: vector.body: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC1-NEXT: 
[[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP3]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI1]], <4 x i64> [[WIDE_LOAD]]) +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp slt <4 x i64> [[VEC_PHI1]], [[WIDE_LOAD]] +; CHECK-VF4IC1-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-VF4IC1: middle.block: +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP3]]) +; CHECK-VF4IC1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP7]], i64 0 +; CHECK-VF4IC1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[MASK_CMP:%.*]] = icmp eq <4 x i64> [[DOTSPLAT]], [[TMP3]] +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: [[MASK_SELECT:%.*]] = select <4 x i1> [[MASK_CMP]], <4 x i64> [[TMP5]], <4 x i64> +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[MASK_SELECT]]) +; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP8]], -9223372036854775808 +; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP8]], i64 [[II:%.*]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4IC1: scalar.ph: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ 
[[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4IC1: for.body: +; CHECK-VF4IC1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP10:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP10]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP9]]) +; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp slt i64 [[MAX_09]], [[TMP9]] +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1]], i64 [[INDVARS_IV]], i64 [[IDX_011]] +; CHECK-VF4IC1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF4IC1: exit: +; CHECK-VF4IC1-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP10]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-VF4IC1-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] +; +; CHECK-VF4IC4-LABEL: @smax_idx_inverted_phi( +; CHECK-VF4IC4-NEXT: entry: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4IC4: vector.ph: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; 
CHECK-VF4IC4-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[MM:%.*]], i64 0 +; CHECK-VF4IC4-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i64> [[MINMAX_IDENT_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4IC4: vector.body: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI8:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI9:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-VF4IC4-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-VF4IC4-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 +; 
CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP8]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 4 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i64>, ptr [[TMP9]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i64>, ptr [[TMP10]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 12 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i64>, ptr [[TMP11]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP12]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI7]], <4 x i64> [[WIDE_LOAD]]) +; CHECK-VF4IC4-NEXT: [[TMP13]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI8]], <4 x i64> [[WIDE_LOAD11]]) +; CHECK-VF4IC4-NEXT: [[TMP14]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI9]], <4 x i64> [[WIDE_LOAD12]]) +; CHECK-VF4IC4-NEXT: [[TMP15]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI10]], <4 x i64> [[WIDE_LOAD13]]) +; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = icmp slt <4 x i64> [[VEC_PHI7]], [[WIDE_LOAD]] +; CHECK-VF4IC4-NEXT: [[TMP17:%.*]] = icmp slt <4 x i64> [[VEC_PHI8]], [[WIDE_LOAD11]] +; CHECK-VF4IC4-NEXT: [[TMP18:%.*]] = icmp slt <4 x i64> [[VEC_PHI9]], [[WIDE_LOAD12]] +; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = icmp slt <4 x i64> [[VEC_PHI10]], [[WIDE_LOAD13]] +; CHECK-VF4IC4-NEXT: [[TMP20]] = select <4 x i1> [[TMP16]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[TMP21]] = select <4 x i1> 
[[TMP17]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI4]] +; CHECK-VF4IC4-NEXT: [[TMP22]] = select <4 x i1> [[TMP18]], <4 x i64> [[STEP_ADD1]], <4 x i64> [[VEC_PHI5]] +; CHECK-VF4IC4-NEXT: [[TMP23]] = select <4 x i1> [[TMP19]], <4 x i64> [[STEP_ADD2]], <4 x i64> [[VEC_PHI6]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-VF4IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-VF4IC4: middle.block: +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i64> [[TMP12]], [[TMP13]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i64> [[TMP12]], <4 x i64> [[TMP13]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_CMP14:%.*]] = icmp sgt <4 x i64> [[RDX_MINMAX_SELECT]], [[TMP14]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_SELECT15:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP14]], <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> [[TMP14]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_CMP16:%.*]] = icmp sgt <4 x i64> [[RDX_MINMAX_SELECT15]], [[TMP15]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_SELECT17:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP16]], <4 x i64> [[RDX_MINMAX_SELECT15]], <4 x i64> [[TMP15]] +; CHECK-VF4IC4-NEXT: [[TMP25:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX_SELECT17]]) +; CHECK-VF4IC4-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP25]], i64 0 +; CHECK-VF4IC4-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[MASK_CMP:%.*]] = icmp eq <4 x i64> [[DOTSPLAT]], [[RDX_MINMAX_SELECT17]] +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp sge <4 x i64> [[TMP12]], [[TMP13]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select <4 x i1> 
[[RDX_SELECT_CMP]], <4 x i64> [[TMP20]], <4 x i64> [[TMP21]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP18:%.*]] = icmp sge <4 x i64> [[RDX_MINMAX_SELECT]], [[TMP14]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT19:%.*]] = select <4 x i1> [[RDX_SELECT_CMP18]], <4 x i64> [[RDX_SELECT]], <4 x i64> [[TMP22]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP20:%.*]] = icmp sge <4 x i64> [[RDX_MINMAX_SELECT15]], [[TMP15]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT21:%.*]] = select <4 x i1> [[RDX_SELECT_CMP20]], <4 x i64> [[RDX_SELECT19]], <4 x i64> [[TMP23]] +; CHECK-VF4IC4-NEXT: [[MASK_SELECT:%.*]] = select <4 x i1> [[MASK_CMP]], <4 x i64> [[RDX_SELECT21]], <4 x i64> +; CHECK-VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[MASK_SELECT]]) +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP22:%.*]] = icmp ne i64 [[TMP26]], -9223372036854775808 +; CHECK-VF4IC4-NEXT: [[RDX_SELECT23:%.*]] = select i1 [[RDX_SELECT_CMP22]], i64 [[TMP26]], i64 [[II:%.*]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4IC4: scalar.ph: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX24:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT23]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4IC4: for.body: +; CHECK-VF4IC4-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX24]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP28:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4IC4-NEXT: [[TMP27:%.*]] = load i64, ptr [[ARRAYIDX]], 
align 4 +; CHECK-VF4IC4-NEXT: [[TMP28]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP27]]) +; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp slt i64 [[MAX_09]], [[TMP27]] +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1]], i64 [[INDVARS_IV]], i64 [[IDX_011]] +; CHECK-VF4IC4-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF4IC4: exit: +; CHECK-VF4IC4-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP28]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT23]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-VF4IC4-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] +; +; CHECK-VF1IC4-LABEL: @smax_idx_inverted_phi( +; CHECK-VF1IC4-NEXT: entry: +; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF1IC4: vector.ph: +; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF1IC4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF1IC4: vector.body: +; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ 
[[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI4:%.*]] = phi i64 [ [[MM:%.*]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI5:%.*]] = phi i64 [ [[MM]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI6:%.*]] = phi i64 [ [[MM]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI7:%.*]] = phi i64 [ [[MM]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP4]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP12]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI4]], i64 [[TMP8]]) +; CHECK-VF1IC4-NEXT: [[TMP13]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI5]], i64 [[TMP9]]) +; CHECK-VF1IC4-NEXT: [[TMP14]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI6]], i64 [[TMP10]]) +; CHECK-VF1IC4-NEXT: [[TMP15]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI7]], i64 [[TMP11]]) +; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = icmp slt i64 [[VEC_PHI4]], [[TMP8]] +; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = icmp slt i64 [[VEC_PHI5]], [[TMP9]] +; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = icmp slt i64 [[VEC_PHI6]], [[TMP10]] +; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = icmp 
slt i64 [[VEC_PHI7]], [[TMP11]] +; CHECK-VF1IC4-NEXT: [[TMP20]] = select i1 [[TMP16]], i64 [[TMP0]], i64 [[VEC_PHI]] +; CHECK-VF1IC4-NEXT: [[TMP21]] = select i1 [[TMP17]], i64 [[TMP1]], i64 [[VEC_PHI1]] +; CHECK-VF1IC4-NEXT: [[TMP22]] = select i1 [[TMP18]], i64 [[TMP2]], i64 [[VEC_PHI2]] +; CHECK-VF1IC4-NEXT: [[TMP23]] = select i1 [[TMP19]], i64 [[TMP3]], i64 [[VEC_PHI3]] +; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-VF1IC4: middle.block: +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt i64 [[TMP12]], [[TMP13]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select i1 [[RDX_MINMAX_CMP]], i64 [[TMP12]], i64 [[TMP13]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_CMP8:%.*]] = icmp sgt i64 [[RDX_MINMAX_SELECT]], [[TMP14]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_SELECT9:%.*]] = select i1 [[RDX_MINMAX_CMP8]], i64 [[RDX_MINMAX_SELECT]], i64 [[TMP14]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_CMP10:%.*]] = icmp sgt i64 [[RDX_MINMAX_SELECT9]], [[TMP15]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_SELECT11:%.*]] = select i1 [[RDX_MINMAX_CMP10]], i64 [[RDX_MINMAX_SELECT9]], i64 [[TMP15]] +; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp sge i64 [[TMP12]], [[TMP13]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP20]], i64 [[TMP21]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP12:%.*]] = icmp sge i64 [[RDX_MINMAX_SELECT]], [[TMP14]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT13:%.*]] = select i1 [[RDX_SELECT_CMP12]], i64 [[RDX_SELECT]], i64 [[TMP22]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP14:%.*]] = icmp sge i64 [[RDX_MINMAX_SELECT9]], [[TMP15]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT15:%.*]] = select i1 [[RDX_SELECT_CMP14]], i64 [[RDX_SELECT13]], i64 [[TMP23]] +; CHECK-VF1IC4-NEXT: 
[[RDX_SELECT_CMP16:%.*]] = icmp ne i64 [[RDX_SELECT15]], -9223372036854775808 +; CHECK-VF1IC4-NEXT: [[RDX_SELECT17:%.*]] = select i1 [[RDX_SELECT_CMP16]], i64 [[RDX_SELECT15]], i64 [[II:%.*]] +; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF1IC4: scalar.ph: +; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[RDX_MINMAX_SELECT11]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX18:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT17]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF1IC4: for.body: +; CHECK-VF1IC4-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX18]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP26:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF1IC4-NEXT: [[TMP25:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP26]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP25]]) +; CHECK-VF1IC4-NEXT: [[CMP1:%.*]] = icmp slt i64 [[MAX_09]], [[TMP25]] +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1]], i64 [[INDVARS_IV]], i64 [[IDX_011]] +; CHECK-VF1IC4-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF1IC4: exit: +; CHECK-VF1IC4-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP26]], [[FOR_BODY]] ], [ [[RDX_MINMAX_SELECT11]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] 
= phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT17]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-VF1IC4-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] ; entry: br label %for.body @@ -121,8 +610,246 @@ ; Check sge case. ; define i64 @smax_idx_inverted_pred(ptr nocapture readonly %a, i64 %mm, i64 %ii, ptr nocapture writeonly %res_max, i64 %n) { -; CHECK-LABEL: @smax_idx_inverted_pred( -; CHECK-NOT: vector.body: +; CHECK-VF4IC1-LABEL: @smax_idx_inverted_pred( +; CHECK-VF4IC1-NEXT: entry: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4IC1: vector.ph: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[MM:%.*]], i64 0 +; CHECK-VF4IC1-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i64> [[MINMAX_IDENT_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4IC1: vector.body: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 
+; CHECK-VF4IC1-NEXT: [[TMP3]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI]], <4 x i64> [[WIDE_LOAD]]) +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp sge <4 x i64> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI1]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-VF4IC1: middle.block: +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP3]]) +; CHECK-VF4IC1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP7]], i64 0 +; CHECK-VF4IC1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[MASK_CMP:%.*]] = icmp eq <4 x i64> [[DOTSPLAT]], [[TMP3]] +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: [[MASK_SELECT:%.*]] = select <4 x i1> [[MASK_CMP]], <4 x i64> [[TMP5]], <4 x i64> +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[MASK_SELECT]]) +; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP8]], -9223372036854775808 +; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP8]], i64 [[II:%.*]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4IC1: scalar.ph: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: br label 
[[FOR_BODY:%.*]] +; CHECK-VF4IC1: for.body: +; CHECK-VF4IC1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP10:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP10]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP9]]) +; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp sge i64 [[TMP9]], [[MAX_09]] +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1]], i64 [[INDVARS_IV]], i64 [[IDX_011]] +; CHECK-VF4IC1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-VF4IC1: exit: +; CHECK-VF4IC1-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP10]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-VF4IC1-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] +; +; CHECK-VF4IC4-LABEL: @smax_idx_inverted_pred( +; CHECK-VF4IC4-NEXT: entry: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4IC4: vector.ph: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x 
i64> poison, i64 [[MM:%.*]], i64 0 +; CHECK-VF4IC4-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i64> [[MINMAX_IDENT_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4IC4: vector.body: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI8:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI9:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-VF4IC4-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-VF4IC4-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 
[[TMP0]] +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP8]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 4 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i64>, ptr [[TMP9]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i64>, ptr [[TMP10]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 12 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i64>, ptr [[TMP11]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP12]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI]], <4 x i64> [[WIDE_LOAD]]) +; CHECK-VF4IC4-NEXT: [[TMP13]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI4]], <4 x i64> [[WIDE_LOAD11]]) +; CHECK-VF4IC4-NEXT: [[TMP14]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI5]], <4 x i64> [[WIDE_LOAD12]]) +; CHECK-VF4IC4-NEXT: [[TMP15]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI6]], <4 x i64> [[WIDE_LOAD13]]) +; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = icmp sge <4 x i64> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[TMP17:%.*]] = icmp sge <4 x i64> [[WIDE_LOAD11]], [[VEC_PHI4]] +; CHECK-VF4IC4-NEXT: [[TMP18:%.*]] = icmp sge <4 x i64> [[WIDE_LOAD12]], [[VEC_PHI5]] +; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = icmp sge <4 x i64> [[WIDE_LOAD13]], [[VEC_PHI6]] +; CHECK-VF4IC4-NEXT: [[TMP20]] = select <4 x i1> [[TMP16]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI7]] +; CHECK-VF4IC4-NEXT: [[TMP21]] = select <4 x i1> [[TMP17]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI8]] +; CHECK-VF4IC4-NEXT: 
[[TMP22]] = select <4 x i1> [[TMP18]], <4 x i64> [[STEP_ADD1]], <4 x i64> [[VEC_PHI9]] +; CHECK-VF4IC4-NEXT: [[TMP23]] = select <4 x i1> [[TMP19]], <4 x i64> [[STEP_ADD2]], <4 x i64> [[VEC_PHI10]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-VF4IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-VF4IC4: middle.block: +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i64> [[TMP12]], [[TMP13]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i64> [[TMP12]], <4 x i64> [[TMP13]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_CMP14:%.*]] = icmp sgt <4 x i64> [[RDX_MINMAX_SELECT]], [[TMP14]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_SELECT15:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP14]], <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> [[TMP14]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_CMP16:%.*]] = icmp sgt <4 x i64> [[RDX_MINMAX_SELECT15]], [[TMP15]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_SELECT17:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP16]], <4 x i64> [[RDX_MINMAX_SELECT15]], <4 x i64> [[TMP15]] +; CHECK-VF4IC4-NEXT: [[TMP25:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX_SELECT17]]) +; CHECK-VF4IC4-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP25]], i64 0 +; CHECK-VF4IC4-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[MASK_CMP:%.*]] = icmp eq <4 x i64> [[DOTSPLAT]], [[RDX_MINMAX_SELECT17]] +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i64> [[TMP20]], <4 x i64> [[TMP21]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT18:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP14]], <4 x i64> [[RDX_SELECT]], 
<4 x i64> [[TMP22]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT19:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP16]], <4 x i64> [[RDX_SELECT18]], <4 x i64> [[TMP23]] +; CHECK-VF4IC4-NEXT: [[MASK_SELECT:%.*]] = select <4 x i1> [[MASK_CMP]], <4 x i64> [[RDX_SELECT19]], <4 x i64> +; CHECK-VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[MASK_SELECT]]) +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP26]], -9223372036854775808 +; CHECK-VF4IC4-NEXT: [[RDX_SELECT20:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP26]], i64 [[II:%.*]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4IC4: scalar.ph: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX21:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT20]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4IC4: for.body: +; CHECK-VF4IC4-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP28:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX21]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4IC4-NEXT: [[TMP27:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP28]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP27]]) +; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sge i64 [[TMP27]], [[MAX_09]] +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1]], i64 [[INDVARS_IV]], i64 [[IDX_011]] +; CHECK-VF4IC4-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4IC4-NEXT: 
[[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-VF4IC4: exit: +; CHECK-VF4IC4-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP28]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT20]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-VF4IC4-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] +; +; CHECK-VF1IC4-LABEL: @smax_idx_inverted_pred( +; CHECK-VF1IC4-NEXT: entry: +; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF1IC4: vector.ph: +; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF1IC4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF1IC4: vector.body: +; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i64 [ [[MM:%.*]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ [[MM]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ [[MM]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ [[MM]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI4:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI5:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI6:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; 
CHECK-VF1IC4-NEXT: [[VEC_PHI7:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP4]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP12]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI]], i64 [[TMP8]]) +; CHECK-VF1IC4-NEXT: [[TMP13]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI1]], i64 [[TMP9]]) +; CHECK-VF1IC4-NEXT: [[TMP14]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI2]], i64 [[TMP10]]) +; CHECK-VF1IC4-NEXT: [[TMP15]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI3]], i64 [[TMP11]]) +; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = icmp sge i64 [[TMP8]], [[VEC_PHI]] +; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = icmp sge i64 [[TMP9]], [[VEC_PHI1]] +; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = icmp sge i64 [[TMP10]], [[VEC_PHI2]] +; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = icmp sge i64 [[TMP11]], [[VEC_PHI3]] +; CHECK-VF1IC4-NEXT: [[TMP20]] = select i1 [[TMP16]], i64 [[TMP0]], i64 [[VEC_PHI4]] +; CHECK-VF1IC4-NEXT: [[TMP21]] = select i1 [[TMP17]], i64 [[TMP1]], i64 [[VEC_PHI5]] +; CHECK-VF1IC4-NEXT: [[TMP22]] = select i1 [[TMP18]], i64 [[TMP2]], i64 [[VEC_PHI6]] +; CHECK-VF1IC4-NEXT: [[TMP23]] = select i1 [[TMP19]], i64 [[TMP3]], 
i64 [[VEC_PHI7]] +; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-VF1IC4: middle.block: +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt i64 [[TMP12]], [[TMP13]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select i1 [[RDX_MINMAX_CMP]], i64 [[TMP12]], i64 [[TMP13]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_CMP8:%.*]] = icmp sgt i64 [[RDX_MINMAX_SELECT]], [[TMP14]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_SELECT9:%.*]] = select i1 [[RDX_MINMAX_CMP8]], i64 [[RDX_MINMAX_SELECT]], i64 [[TMP14]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_CMP10:%.*]] = icmp sgt i64 [[RDX_MINMAX_SELECT9]], [[TMP15]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_SELECT11:%.*]] = select i1 [[RDX_MINMAX_CMP10]], i64 [[RDX_MINMAX_SELECT9]], i64 [[TMP15]] +; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_MINMAX_CMP]], i64 [[TMP20]], i64 [[TMP21]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT12:%.*]] = select i1 [[RDX_MINMAX_CMP8]], i64 [[RDX_SELECT]], i64 [[TMP22]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT13:%.*]] = select i1 [[RDX_MINMAX_CMP10]], i64 [[RDX_SELECT12]], i64 [[TMP23]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[RDX_SELECT13]], -9223372036854775808 +; CHECK-VF1IC4-NEXT: [[RDX_SELECT14:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_SELECT13]], i64 [[II:%.*]] +; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF1IC4: scalar.ph: +; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[RDX_MINMAX_SELECT11]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX15:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT14]], [[MIDDLE_BLOCK]] ] +; 
CHECK-VF1IC4-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF1IC4: for.body: +; CHECK-VF1IC4-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP26:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX15]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF1IC4-NEXT: [[TMP25:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP26]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP25]]) +; CHECK-VF1IC4-NEXT: [[CMP1:%.*]] = icmp sge i64 [[TMP25]], [[MAX_09]] +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1]], i64 [[INDVARS_IV]], i64 [[IDX_011]] +; CHECK-VF1IC4-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-VF1IC4: exit: +; CHECK-VF1IC4-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP26]], [[FOR_BODY]] ], [ [[RDX_MINMAX_SELECT11]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT14]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-VF1IC4-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] ; entry: br label %for.body @@ -149,8 +876,246 @@ ; In such cases, the last index should be extracted. 
; define i64 @smax_idx_extract_last(ptr nocapture readonly %a, i64 %mm, i64 %ii, ptr nocapture writeonly %res_max, i64 %n) { -; CHECK-LABEL: @smax_idx_extract_last( -; CHECK-NOT: vector.body: +; CHECK-VF4IC1-LABEL: @smax_idx_extract_last( +; CHECK-VF4IC1-NEXT: entry: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4IC1: vector.ph: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[MM:%.*]], i64 0 +; CHECK-VF4IC1-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i64> [[MINMAX_IDENT_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4IC1: vector.body: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP3]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI]], <4 x i64> [[WIDE_LOAD]]) +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i64> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-VF4IC1-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i64> [[VEC_PHI1]], <4 x 
i64> [[VEC_IND]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-VF4IC1: middle.block: +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP3]]) +; CHECK-VF4IC1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP7]], i64 0 +; CHECK-VF4IC1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[MASK_CMP:%.*]] = icmp eq <4 x i64> [[DOTSPLAT]], [[TMP3]] +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: [[MASK_SELECT:%.*]] = select <4 x i1> [[MASK_CMP]], <4 x i64> [[TMP5]], <4 x i64> +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[MASK_SELECT]]) +; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP8]], -9223372036854775808 +; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP8]], i64 [[II:%.*]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4IC1: scalar.ph: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4IC1: for.body: +; CHECK-VF4IC1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP10:%.*]], 
[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP10]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP9]]) +; CHECK-VF4IC1-NEXT: [[CMP1_NOT:%.*]] = icmp sgt i64 [[MAX_09]], [[TMP9]] +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1_NOT]], i64 [[IDX_011]], i64 [[INDVARS_IV]] +; CHECK-VF4IC1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-VF4IC1: exit: +; CHECK-VF4IC1-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP10]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-VF4IC1-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] +; +; CHECK-VF4IC4-LABEL: @smax_idx_extract_last( +; CHECK-VF4IC4-NEXT: entry: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4IC4: vector.ph: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[MM:%.*]], i64 0 +; CHECK-VF4IC4-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i64> [[MINMAX_IDENT_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4IC4: vector.body: +; 
CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI8:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI9:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-VF4IC4-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-VF4IC4-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] 
+; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP8]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 4 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i64>, ptr [[TMP9]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i64>, ptr [[TMP10]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 12 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i64>, ptr [[TMP11]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP12]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI]], <4 x i64> [[WIDE_LOAD]]) +; CHECK-VF4IC4-NEXT: [[TMP13]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI4]], <4 x i64> [[WIDE_LOAD11]]) +; CHECK-VF4IC4-NEXT: [[TMP14]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI5]], <4 x i64> [[WIDE_LOAD12]]) +; CHECK-VF4IC4-NEXT: [[TMP15]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI6]], <4 x i64> [[WIDE_LOAD13]]) +; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i64> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-VF4IC4-NEXT: [[TMP17:%.*]] = icmp sgt <4 x i64> [[VEC_PHI4]], [[WIDE_LOAD11]] +; CHECK-VF4IC4-NEXT: [[TMP18:%.*]] = icmp sgt <4 x i64> [[VEC_PHI5]], [[WIDE_LOAD12]] +; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = icmp sgt <4 x i64> [[VEC_PHI6]], [[WIDE_LOAD13]] +; CHECK-VF4IC4-NEXT: [[TMP20]] = select <4 x i1> [[TMP16]], <4 x i64> [[VEC_PHI7]], <4 x i64> [[VEC_IND]] +; CHECK-VF4IC4-NEXT: [[TMP21]] = select <4 x i1> [[TMP17]], <4 x i64> [[VEC_PHI8]], <4 x i64> [[STEP_ADD]] +; CHECK-VF4IC4-NEXT: [[TMP22]] = select <4 x i1> [[TMP18]], <4 x i64> [[VEC_PHI9]], <4 x i64> [[STEP_ADD1]] +; CHECK-VF4IC4-NEXT: [[TMP23]] = select <4 x i1> [[TMP19]], <4 x i64> [[VEC_PHI10]], <4 x i64> [[STEP_ADD2]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; 
CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-VF4IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-VF4IC4: middle.block: +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i64> [[TMP12]], [[TMP13]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i64> [[TMP12]], <4 x i64> [[TMP13]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_CMP14:%.*]] = icmp sgt <4 x i64> [[RDX_MINMAX_SELECT]], [[TMP14]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_SELECT15:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP14]], <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> [[TMP14]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_CMP16:%.*]] = icmp sgt <4 x i64> [[RDX_MINMAX_SELECT15]], [[TMP15]] +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX_SELECT17:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP16]], <4 x i64> [[RDX_MINMAX_SELECT15]], <4 x i64> [[TMP15]] +; CHECK-VF4IC4-NEXT: [[TMP25:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX_SELECT17]]) +; CHECK-VF4IC4-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP25]], i64 0 +; CHECK-VF4IC4-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[MASK_CMP:%.*]] = icmp eq <4 x i64> [[DOTSPLAT]], [[RDX_MINMAX_SELECT17]] +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i64> [[TMP20]], <4 x i64> [[TMP21]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT18:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP14]], <4 x i64> [[RDX_SELECT]], <4 x i64> [[TMP22]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT19:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP16]], <4 x i64> [[RDX_SELECT18]], <4 x i64> [[TMP23]] +; CHECK-VF4IC4-NEXT: [[MASK_SELECT:%.*]] = select <4 x i1> [[MASK_CMP]], <4 x i64> [[RDX_SELECT19]], <4 x i64> +; 
CHECK-VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[MASK_SELECT]]) +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP26]], -9223372036854775808 +; CHECK-VF4IC4-NEXT: [[RDX_SELECT20:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP26]], i64 [[II:%.*]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4IC4: scalar.ph: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX21:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT20]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4IC4: for.body: +; CHECK-VF4IC4-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[MAX_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP28:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX21]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4IC4-NEXT: [[TMP27:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP28]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP27]]) +; CHECK-VF4IC4-NEXT: [[CMP1_NOT:%.*]] = icmp sgt i64 [[MAX_09]], [[TMP27]] +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1_NOT]], i64 [[IDX_011]], i64 [[INDVARS_IV]] +; CHECK-VF4IC4-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-VF4IC4: exit: +; CHECK-VF4IC4-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP28]], 
[[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT20]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-VF4IC4-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] +; +; CHECK-VF1IC4-LABEL: @smax_idx_extract_last( +; CHECK-VF1IC4-NEXT: entry: +; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF1IC4: vector.ph: +; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF1IC4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF1IC4: vector.body: +; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i64 [ [[MM:%.*]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ [[MM]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ [[MM]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ [[MM]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI4:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI5:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI6:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI7:%.*]] = phi i64 [ -9223372036854775808, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1IC4-NEXT: 
[[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP4]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP12]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI]], i64 [[TMP8]]) +; CHECK-VF1IC4-NEXT: [[TMP13]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI1]], i64 [[TMP9]]) +; CHECK-VF1IC4-NEXT: [[TMP14]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI2]], i64 [[TMP10]]) +; CHECK-VF1IC4-NEXT: [[TMP15]] = tail call i64 @llvm.smax.i64(i64 [[VEC_PHI3]], i64 [[TMP11]]) +; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = icmp sgt i64 [[VEC_PHI]], [[TMP8]] +; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = icmp sgt i64 [[VEC_PHI1]], [[TMP9]] +; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = icmp sgt i64 [[VEC_PHI2]], [[TMP10]] +; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = icmp sgt i64 [[VEC_PHI3]], [[TMP11]] +; CHECK-VF1IC4-NEXT: [[TMP20]] = select i1 [[TMP16]], i64 [[VEC_PHI4]], i64 [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP21]] = select i1 [[TMP17]], i64 [[VEC_PHI5]], i64 [[TMP1]] +; CHECK-VF1IC4-NEXT: [[TMP22]] = select i1 [[TMP18]], i64 [[VEC_PHI6]], i64 [[TMP2]] +; CHECK-VF1IC4-NEXT: [[TMP23]] = select i1 [[TMP19]], i64 [[VEC_PHI7]], i64 [[TMP3]] +; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP8:![0-9]+]] +; CHECK-VF1IC4: middle.block: +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt i64 [[TMP12]], [[TMP13]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select i1 [[RDX_MINMAX_CMP]], i64 [[TMP12]], i64 [[TMP13]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_CMP8:%.*]] = icmp sgt i64 [[RDX_MINMAX_SELECT]], [[TMP14]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_SELECT9:%.*]] = select i1 [[RDX_MINMAX_CMP8]], i64 [[RDX_MINMAX_SELECT]], i64 [[TMP14]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_CMP10:%.*]] = icmp sgt i64 [[RDX_MINMAX_SELECT9]], [[TMP15]] +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX_SELECT11:%.*]] = select i1 [[RDX_MINMAX_CMP10]], i64 [[RDX_MINMAX_SELECT9]], i64 [[TMP15]] +; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_MINMAX_CMP]], i64 [[TMP20]], i64 [[TMP21]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT12:%.*]] = select i1 [[RDX_MINMAX_CMP8]], i64 [[RDX_SELECT]], i64 [[TMP22]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT13:%.*]] = select i1 [[RDX_MINMAX_CMP10]], i64 [[RDX_SELECT12]], i64 [[TMP23]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[RDX_SELECT13]], -9223372036854775808 +; CHECK-VF1IC4-NEXT: [[RDX_SELECT14:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_SELECT13]], i64 [[II:%.*]] +; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF1IC4: scalar.ph: +; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[MM]], [[ENTRY]] ], [ [[RDX_MINMAX_SELECT11]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX15:%.*]] = phi i64 [ [[II]], [[ENTRY]] ], [ [[RDX_SELECT14]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF1IC4: for.body: +; CHECK-VF1IC4-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[MAX_09:%.*]] = phi i64 [ 
[[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP26:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[IDX_011:%.*]] = phi i64 [ [[BC_MERGE_RDX15]], [[SCALAR_PH]] ], [ [[SPEC_SELECT7:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF1IC4-NEXT: [[TMP25:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP26]] = tail call i64 @llvm.smax.i64(i64 [[MAX_09]], i64 [[TMP25]]) +; CHECK-VF1IC4-NEXT: [[CMP1_NOT:%.*]] = icmp sgt i64 [[MAX_09]], [[TMP25]] +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT7]] = select i1 [[CMP1_NOT]], i64 [[IDX_011]], i64 [[INDVARS_IV]] +; CHECK-VF1IC4-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-VF1IC4: exit: +; CHECK-VF1IC4-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP26]], [[FOR_BODY]] ], [ [[RDX_MINMAX_SELECT11]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT7_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT7]], [[FOR_BODY]] ], [ [[RDX_SELECT14]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: store i64 [[DOTLCSSA]], ptr [[RES_MAX:%.*]], align 4 +; CHECK-VF1IC4-NEXT: ret i64 [[SPEC_SELECT7_LCSSA]] ; entry: br label %for.body