diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -123,7 +123,7 @@
   /// the returned struct.
   static InstDesc isRecurrenceInstr(Loop *L, PHINode *Phi, Instruction *I,
                                     RecurKind Kind, InstDesc &Prev,
-                                    FastMathFlags FuncFMF);
+                                    FastMathFlags FuncFMF, ScalarEvolution *SE);
 
   /// Returns true if instruction I has multiple uses in Insts
   static bool hasMultipleUsesOf(Instruction *I,
@@ -148,7 +148,8 @@
   /// value. \p Prev specifies the description of an already processed select
   /// instruction, so its corresponding cmp can be matched to it.
   static InstDesc isSelectCmpPattern(Loop *Loop, PHINode *OrigPhi,
-                                     Instruction *I, InstDesc &Prev);
+                                     Instruction *I, InstDesc &Prev,
+                                     ScalarEvolution *SE);
 
   /// Returns a struct describing if the instruction is a
   /// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern.
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -15,6 +15,7 @@
 
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 
 namespace llvm {
@@ -398,7 +399,8 @@
                                     const TargetTransformInfo *TTI,
                                     Value *Src,
                                     const RecurrenceDescriptor &Desc,
-                                    PHINode *OrigPhi);
+                                    PHINode *OrigPhi, Loop *OrigLoop,
+                                    Loop::LoopBounds::Direction Direction);
 
 /// Create a generic target reduction using a recurrence descriptor \p Desc
 /// The target is queried to determine if intrinsics or shuffle sequences are
@@ -406,7 +408,10 @@
 /// Fast-math-flags are propagated using the RecurrenceDescriptor.
 Value *createTargetReduction(IRBuilderBase &B, const TargetTransformInfo *TTI,
                              const RecurrenceDescriptor &Desc, Value *Src,
-                             PHINode *OrigPhi = nullptr);
+                             PHINode *OrigPhi = nullptr,
+                             Loop *OrigLoop = nullptr,
+                             Loop::LoopBounds::Direction Direction =
+                                 Loop::LoopBounds::Direction::Unknown);
 
 /// Create an ordered reduction intrinsic using the given recurrence
 /// descriptor \p Desc.
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -375,7 +375,7 @@
     // type-promoted).
     if (Cur != Start) {
       ReduxDesc =
-          isRecurrenceInstr(TheLoop, Phi, Cur, Kind, ReduxDesc, FuncFMF);
+          isRecurrenceInstr(TheLoop, Phi, Cur, Kind, ReduxDesc, FuncFMF, SE);
       ExactFPMathInst = ExactFPMathInst == nullptr
                             ? ReduxDesc.getExactFPMathInst()
                             : ExactFPMathInst;
@@ -488,7 +488,7 @@
                ((!isa<FCmpInst>(UI) && !isa<ICmpInst>(UI) &&
                  !isa<SelectInst>(UI)) ||
                 (!isConditionalRdxPattern(Kind, UI).isRecurrence() &&
-                 !isSelectCmpPattern(TheLoop, Phi, UI, IgnoredVal)
+                 !isSelectCmpPattern(TheLoop, Phi, UI, IgnoredVal, SE)
                       .isRecurrence() &&
                  !isMinMaxPattern(UI, Kind, IgnoredVal).isRecurrence())))
       return false;
@@ -606,7 +606,9 @@
   return true;
 }
 
-// We are looking for loops that do something like this:
+// We are looking for two patterns.
+//
+// The first is loops that do something like this:
 //   int r = 0;
 //   for (int i = 0; i < n; i++) {
 //     if (src[i] > 3)
@@ -627,9 +629,25 @@
 // across-vector reduction after the loop simply involves choosing the start
 // value if nothing changed (0 in the example above) or the other selected
 // value (3 in the example above).
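+//
+// For example (an illustrative sketch, assuming a vectorization factor of 4):
+// if the vector of selected values comes out of the loop as <0, 3, 0, 3> with
+// start value 0, the lane-wise icmp ne against the splat of 0 yields
+// <0, 1, 0, 1>; its or-reduce is true, so the final result is the selected
+// value 3.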
+//
+// The second is loops that do something like this:
+//   int r = 0;
+//   for (int i = 0; i < n; i++) {
+//     if (src[i] > 3)
+//       r = i;
+//   }
+// where the reduction value (r) can take one of many values: the start value
+// (0 in the example above) or any value of the induction variable i for which
+// the condition (src[i] > 3) held; the scalar loop ends with the last such i.
+//
+// In general we can support vectorization of loops where 'r' is assigned a
+// loop-variant value, provided that value is an add-recurrence of the loop
+// with a constant or loop-invariant step. As before, all we care about at the
+// end of the loop is whether any lane of the selected vector differs from the
+// start value. The final across-vector reduction masks out the unselected
+// lanes and then applies a max-reduce or min-reduce, depending on whether i
+// is increasing or decreasing in the loop.
 RecurrenceDescriptor::InstDesc
 RecurrenceDescriptor::isSelectCmpPattern(Loop *Loop, PHINode *OrigPhi,
-                                         Instruction *I, InstDesc &Prev) {
+                                         Instruction *I, InstDesc &Prev,
+                                         ScalarEvolution *SE) {
   // We must handle the select(cmp(),x,y) as a single instruction. Advance to
   // the select.
   CmpInst::Predicate Pred;
@@ -646,6 +664,9 @@
   SelectInst *SI = cast<SelectInst>(I);
   Value *NonPhi = nullptr;
 
+  // In the loop-invariant case, we are looking for selects of the form:
+  //   select(cmp(), phi, loop_invariant) or
+  //   select(cmp(), loop_invariant, phi)
   if (OrigPhi == dyn_cast<PHINode>(SI->getTrueValue()))
     NonPhi = SI->getFalseValue();
   else if (OrigPhi == dyn_cast<PHINode>(SI->getFalseValue()))
@@ -653,11 +674,24 @@
   else
     return InstDesc(false, I);
 
-  // We are looking for selects of the form:
-  //   select(cmp(), phi, loop_invariant) or
-  //   select(cmp(), loop_invariant, phi)
-  if (!Loop->isLoopInvariant(NonPhi))
-    return InstDesc(false, I);
+  if (!Loop->isLoopInvariant(NonPhi)) {
+    // If the operand is not loop-invariant, we need to know the direction of
+    // the loop in order to pick min-reduce or max-reduce in the epilogue.
+    auto Bounds = SE ? Loop->getBounds(*SE) : std::nullopt;
+    if (!Bounds ||
+        Bounds->getDirection() == Loop::LoopBounds::Direction::Unknown)
+      return InstDesc(false, I);
+
+    // A loop-variant operand is only supported if it is an AddRec expression
+    // whose step is constant or loop-invariant. E.g., for 'r = i' above the
+    // operand is the induction variable, whose SCEV is {0,+,1}<loop> with the
+    // constant step 1.
+    if (!SE->isSCEVable(NonPhi->getType()))
+      return InstDesc(false, I);
+    else if (auto *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(NonPhi))) {
+      const SCEV *Step = AR->getStepRecurrence(*SE);
+      const SCEVConstant *ConstStep = dyn_cast<SCEVConstant>(Step);
+      if (!ConstStep && !SE->isLoopInvariant(Step, Loop))
+        return InstDesc(false, I);
+    } else
+      return InstDesc(false, I);
+  }
 
   return InstDesc(I, isa<ICmpInst>(I->getOperand(0)) ? RecurKind::SelectICmp
                                                      : RecurKind::SelectFCmp);
@@ -762,10 +796,9 @@
   return InstDesc(true, SI);
 }
 
-RecurrenceDescriptor::InstDesc
-RecurrenceDescriptor::isRecurrenceInstr(Loop *L, PHINode *OrigPhi,
-                                        Instruction *I, RecurKind Kind,
-                                        InstDesc &Prev, FastMathFlags FuncFMF) {
+RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(
+    Loop *L, PHINode *OrigPhi, Instruction *I, RecurKind Kind, InstDesc &Prev,
+    FastMathFlags FuncFMF, ScalarEvolution *SE) {
   assert(Prev.getRecKind() == RecurKind::None || Prev.getRecKind() == Kind);
   switch (I->getOpcode()) {
   default:
@@ -800,7 +833,7 @@
   case Instruction::ICmp:
   case Instruction::Call:
     if (isSelectCmpRecurrenceKind(Kind))
-      return isSelectCmpPattern(L, OrigPhi, I, Prev);
+      return isSelectCmpPattern(L, OrigPhi, I, Prev, SE);
     if (isIntMinMaxRecurrenceKind(Kind) ||
         (((FuncFMF.noNaNs() && FuncFMF.noSignedZeros()) ||
           (isa<FPMathOperator>(I) && I->hasNoNaNs() &&
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1021,11 +1021,10 @@
   return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
 }
 
-Value *llvm::createSelectCmpTargetReduction(IRBuilderBase &Builder,
-                                            const TargetTransformInfo *TTI,
-                                            Value *Src,
-                                            const RecurrenceDescriptor &Desc,
-                                            PHINode *OrigPhi) {
+Value *llvm::createSelectCmpTargetReduction(
+    IRBuilderBase &Builder, const TargetTransformInfo *TTI, Value *Src,
+    const RecurrenceDescriptor &Desc, PHINode *OrigPhi, Loop *OrigLoop,
+    Loop::LoopBounds::Direction Direction) {
   assert(RecurrenceDescriptor::isSelectCmpRecurrenceKind(
              Desc.getRecurrenceKind()) &&
          "Unexpected reduction kind");
@@ -1053,11 +1052,55 @@
   // we want to reduce.
   ElementCount EC = cast<VectorType>(Src->getType())->getElementCount();
   Value *Right = Builder.CreateVectorSplat(EC, InitVal);
-  Value *Cmp =
+  Value *CmpVector =
       Builder.CreateCmp(CmpInst::ICMP_NE, Src, Right, "rdx.select.cmp");
+  Value *Cmp = Builder.CreateOrReduce(CmpVector);
+
+  if (!OrigLoop->isLoopInvariant(NewVal)) {
+    // If NewVal isn't loop-invariant, consider the following example:
+    //   int src[n] = {4, 5, 2};
+    //   int r = 331;
+    //   for (int i = 0; i < n; i++) {
+    //     if (src[i] > 3)
+    //       r = i;
+    //   }
+    //   return r;
+    // We vectorize this case as follows:
+    // 1. Create a splat of the int_min/int_max value, depending on the loop
+    //    direction.
+    // 2. Src, the vectorized reduction value, holds the selected index in
+    //    each lane, or the start value in lanes where no assignment took
+    //    place: {0, 1, 331, 331, ...}.
+    // 3. Right is a splat of the start value: {331, 331, 331, 331, ...}.
+    // 4. CmpVector (Src != Right) marks the lanes that were assigned to:
+    //    {1, 1, 0, 0, ...}.
+    // 5. Select between Src and the int_min/int_max splat using CmpVector.
+    // 6. Apply a max-reduce/min-reduce to the result of the select.
+    // 7. Use the existing Cmp (the or-reduce of CmpVector), which determines
+    //    whether any assignment took place in the loop, to select between the
+    //    result of the max-reduce/min-reduce and the initial value (InitVal).
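+    //
+    // Walking the example through steps 4-7 (an illustrative sketch, assuming
+    // VF = 4 and a signed, increasing induction): CmpVector = {1, 1, 0, 0},
+    // the select in step 5 yields {0, 1, INT_MIN, INT_MIN}, the max-reduce in
+    // step 6 returns 1, and step 7 picks 1 over 331 because Cmp is true: 1 is
+    // the last index for which src[i] > 3 held.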
+    assert(Direction != Loop::LoopBounds::Direction::Unknown &&
+           "Unknown loop direction");
+    auto IdxTy = cast<IntegerType>(InitVal->getType());
+    if (Direction == Loop::LoopBounds::Direction::Increasing) {
+      auto MinValue = Desc.isSigned()
+                          ? APInt::getSignedMinValue(IdxTy->getBitWidth())
+                          : APInt::getMinValue(IdxTy->getBitWidth());
+      Value *MinSplat =
+          Builder.CreateVectorSplat(EC, ConstantInt::get(IdxTy, MinValue));
+      Value *Mask =
+          Builder.CreateSelect(CmpVector, Src, MinSplat, "rdx.select.mask");
+      NewVal = Builder.CreateIntMaxReduce(Mask, Desc.isSigned());
+    } else {
+      auto MaxValue = Desc.isSigned()
+                          ? APInt::getSignedMaxValue(IdxTy->getBitWidth())
+                          : APInt::getMaxValue(IdxTy->getBitWidth());
+      Value *MaxSplat =
+          Builder.CreateVectorSplat(EC, ConstantInt::get(IdxTy, MaxValue));
+      Value *Mask =
+          Builder.CreateSelect(CmpVector, Src, MaxSplat, "rdx.select.mask");
+      NewVal = Builder.CreateIntMinReduce(Mask, Desc.isSigned());
+    }
+  }
 
-  // If any predicate is true it means that we want to select the new value.
-  Cmp = Builder.CreateOrReduce(Cmp);
   return Builder.CreateSelect(Cmp, NewVal, InitVal, "rdx.select");
 }
 
@@ -1102,7 +1145,8 @@
 Value *llvm::createTargetReduction(IRBuilderBase &B,
                                    const TargetTransformInfo *TTI,
                                    const RecurrenceDescriptor &Desc, Value *Src,
-                                   PHINode *OrigPhi) {
+                                   PHINode *OrigPhi, Loop *OrigLoop,
+                                   Loop::LoopBounds::Direction Direction) {
   // TODO: Support in-order reductions based on the recurrence descriptor.
   // All ops in the reduction inherit fast-math-flags from the recurrence
   // descriptor.
@@ -1111,7 +1155,8 @@
 
   RecurKind RK = Desc.getRecurrenceKind();
   if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
-    return createSelectCmpTargetReduction(B, TTI, Src, Desc, OrigPhi);
+    return createSelectCmpTargetReduction(B, TTI, Src, Desc, OrigPhi, OrigLoop,
+                                          Direction);
 
   return createSimpleTargetReduction(B, TTI, Src, RK);
 }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -488,6 +488,13 @@
       this->MinProfitableTripCount = VecWidth;
     else
       this->MinProfitableTripCount = MinProfitableTripCount;
+
+    auto Bounds = OrigLoop->getBounds(*PSE.getSE());
+    if (!Bounds ||
+        Bounds->getDirection() == Loop::LoopBounds::Direction::Unknown)
+      Direction = Loop::LoopBounds::Direction::Unknown;
+    else
+      Direction = Bounds->getDirection();
   }
 
   virtual ~InnerLoopVectorizer() = default;
@@ -678,6 +685,9 @@
   /// The original loop.
   Loop *OrigLoop;
 
+  /// The direction of iteration of the original loop.
+  Loop::LoopBounds::Direction Direction;
+
   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
   /// dynamic knowledge to simplify SCEV expressions and converts them to a
   /// more usable form.
@@ -3989,7 +3999,8 @@
   // target reduction in the loop using a Reduction recipe.
   if (VF.isVector() && !PhiR->isInLoop()) {
     ReducedPartRdx =
-        createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
+        createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi,
+                              OrigLoop, State.ILV->Direction);
 
     // If the reduction can be performed in a smaller type, we need to extend
     // the reduction to the wider type before we branch to the original loop.
     if (PhiTy != RdxDesc.getRecurrenceType())
diff --git a/llvm/test/Transforms/LoopVectorize/if-reduction.ll b/llvm/test/Transforms/LoopVectorize/if-reduction.ll
--- a/llvm/test/Transforms/LoopVectorize/if-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-reduction.ll
@@ -912,9 +912,10 @@
 
 @table = constant [13 x i16] [i16 10, i16 35, i16 69, i16 147, i16 280, i16 472, i16 682, i16 1013, i16 1559, i16 2544, i16 4553, i16 6494, i16 10000], align 1
 
-; CHECK-LABEL: @non_reduction_index(
-; CHECK-NOT: <4 x i16>
-define i16 @non_reduction_index(i16 noundef %val) {
+; CHECK-LABEL: @reduction_index(
+; CHECK: %[[V1:.*]] = insertelement <4 x i16> poison, i16 %[[ARG:.*]], i64 0
+; CHECK: shufflevector <4 x i16> %[[V1]], <4 x i16> poison, <4 x i32> zeroinitializer
+define i16 @reduction_index(i16 noundef %val) {
 entry:
   br label %for.body
 
@@ -936,9 +937,10 @@
 
 @tablef = constant [13 x half] [half 10.0, half 35.0, half 69.0, half 147.0, half 280.0, half 472.0, half 682.0, half 1013.0, half 1559.0, half 2544.0, half 4556.0, half 6496.0, half 10000.0], align 1
 
-; CHECK-LABEL: @non_reduction_index_half(
-; CHECK-NOT: <4 x half>
-define i16 @non_reduction_index_half(half noundef %val) {
+; CHECK-LABEL: @reduction_index_half(
+; CHECK: %[[V1:.*]] = insertelement <4 x half> poison, half %[[ARG:.*]], i64 0
+; CHECK: shufflevector <4 x half> %[[V1]], <4 x half> poison, <4 x i32> zeroinitializer
+define i16 @reduction_index_half(half noundef %val) {
 entry:
   br label %for.body
 
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
--- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
@@ -244,6 +244,171 @@
   ret i32 %3
 }
 
+define i32 @test_select_variant_i32_from_fcmp(ptr %a) {
+; CHECK-LABEL: @test_select_variant_i32_from_fcmp(
+; CHECK-VF4IC1-NEXT: entry:
+; CHECK-VF4IC1-NEXT: br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK-VF4IC1: for.cond1.preheader:
+; CHECK-VF4IC1-NEXT: [[NL_015:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC7:%.*]], [[FOR_COND_CLEANUP3:%.*]] ]
+; CHECK-VF4IC1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-VF4IC1: vector.ph:
+; CHECK-VF4IC1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-VF4IC1: vector.body:
+; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP2]], align 8
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = fcmp ogt <4 x double> [[WIDE_LOAD]], <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
+; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32000
+; CHECK-VF4IC1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-VF4IC1: middle.block:
+; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP4]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]])
+; CHECK-VF4IC1-NEXT: [[RDX_SELECT_MASK:%.*]] = select <4 x i1> [[RDX_SELECT_CMP]], <4 x i32> [[TMP4]], <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[RDX_SELECT_MASK]])
+; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i32 [[TMP7]], i32 -1
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 32000, 32000
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
+; CHECK-VF4IC1: scalar.ph:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32000, [[MIDDLE_BLOCK]] ], [ 0, [[FOR_COND1_PREHEADER]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ -1, [[FOR_COND1_PREHEADER]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC1-NEXT: br label [[FOR_BODY4:%.*]]
+; CHECK-VF4IC1: for.cond.cleanup:
+; CHECK-VF4IC1-NEXT: [[J_2_LCSSA_LCSSA:%.*]] = phi i32 [ [[J_2_LCSSA:%.*]], [[FOR_COND_CLEANUP3]] ]
+; CHECK-VF4IC1-NEXT: ret i32 [[J_2_LCSSA_LCSSA]]
+; CHECK-VF4IC1: for.cond.cleanup3:
+; CHECK-VF4IC1-NEXT: [[J_2_LCSSA]] = phi i32 [ [[J_2:%.*]], [[FOR_BODY4]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC1-NEXT: [[INC7]] = add nuw nsw i32 [[NL_015]], 1
+; CHECK-VF4IC1-NEXT: [[EXITCOND16_NOT:%.*]] = icmp eq i32 [[INC7]], 200000
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND16_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]]
+; CHECK-VF4IC1: for.body4:
+; CHECK-VF4IC1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY4]] ]
+; CHECK-VF4IC1-NEXT: [[J_113:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[J_2]], [[FOR_BODY4]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; CHECK-VF4IC1-NEXT: [[CMP5:%.*]] = fcmp ogt double [[TMP8]], 3.000000e+00
+; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-VF4IC1-NEXT: [[J_2]] = select i1 [[CMP5]], i32 [[TMP9]], i32 [[J_113]]
+; CHECK-VF4IC1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32000
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:
+  %nl.015 = phi i32 [ 0, %entry ], [ %inc7, %for.cond.cleanup3 ]
+  br label %for.body4
+
+for.cond.cleanup:
+  ret i32 %j.2
+
+for.cond.cleanup3:
+  %inc7 = add nuw nsw i32 %nl.015, 1
+  %exitcond16.not = icmp eq i32 %inc7, 200000
+  br i1 %exitcond16.not, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.body4:
+  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body4 ]
+  %j.113 = phi i32 [ -1, %for.cond1.preheader ], [ %j.2, %for.body4 ]
+  %arrayidx = getelementptr inbounds double, ptr %a, i64 %indvars.iv
+  %0 = load double, ptr %arrayidx, align 8
+  %cmp5 = fcmp ogt double %0, 3.000000e+00
+  %1 = trunc i64 %indvars.iv to i32
+  %j.2 = select i1 %cmp5, i32 %1, i32 %j.113
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 32000
+  br i1 %exitcond.not, label %for.cond.cleanup3, label %for.body4
+}
+
+define i32 @test_select_variant_i32_from_fcmp_reverse(ptr %a) {
+; CHECK-LABEL: @test_select_variant_i32_from_fcmp_reverse(
+; CHECK-VF4IC1-NEXT: entry:
+; CHECK-VF4IC1-NEXT: br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK-VF4IC1: for.cond1.preheader:
+; CHECK-VF4IC1-NEXT: [[NL_014:%.*]] = phi i32 [ 199999, [[ENTRY:%.*]] ], [ [[DEC:%.*]], [[FOR_COND_CLEANUP3:%.*]] ]
+; CHECK-VF4IC1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-VF4IC1: vector.ph:
+; CHECK-VF4IC1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-VF4IC1: vector.body:
+; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP2]], align 8
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = fcmp ogt <4 x double> [[WIDE_LOAD]], <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
+; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32000
+; CHECK-VF4IC1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-VF4IC1: middle.block:
+; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP4]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]])
+; CHECK-VF4IC1-NEXT: [[RDX_SELECT_MASK:%.*]] = select <4 x i1> [[RDX_SELECT_CMP]], <4 x i32> [[TMP4]], <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[RDX_SELECT_MASK]])
+; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i32 [[TMP7]], i32 -1
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 32000, 32000
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
+; CHECK-VF4IC1: scalar.ph:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32000, [[MIDDLE_BLOCK]] ], [ 0, [[FOR_COND1_PREHEADER]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ -1, [[FOR_COND1_PREHEADER]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC1-NEXT: br label [[FOR_BODY4:%.*]]
+; CHECK-VF4IC1: for.cond.cleanup:
+; CHECK-VF4IC1-NEXT: [[J_2_LCSSA_LCSSA:%.*]] = phi i32 [ [[J_2_LCSSA:%.*]], [[FOR_COND_CLEANUP3]] ]
+; CHECK-VF4IC1-NEXT: ret i32 [[J_2_LCSSA_LCSSA]]
+; CHECK-VF4IC1: for.cond.cleanup3:
+; CHECK-VF4IC1-NEXT: [[J_2_LCSSA]] = phi i32 [ [[J_2:%.*]], [[FOR_BODY4]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC1-NEXT: [[DEC]] = add nsw i32 [[NL_014]], -1
+; CHECK-VF4IC1-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[NL_014]], 0
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]]
+; CHECK-VF4IC1: for.body4:
+; CHECK-VF4IC1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY4]] ]
+; CHECK-VF4IC1-NEXT: [[J_112:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[J_2]], [[FOR_BODY4]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; CHECK-VF4IC1-NEXT: [[CMP5:%.*]] = fcmp ogt double [[TMP8]], 3.000000e+00
+; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-VF4IC1-NEXT: [[J_2]] = select i1 [[CMP5]], i32 [[TMP9]], i32 [[J_112]]
+; CHECK-VF4IC1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32000
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4]], !llvm.loop [[LOOP5:![0-9]+]]
+;
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:
+  %nl.014 = phi i32 [ 199999, %entry ], [ %dec, %for.cond.cleanup3 ]
+  br label %for.body4
+
+for.cond.cleanup:
+  ret i32 %j.2
+
+for.cond.cleanup3:
+  %dec = add nsw i32 %nl.014, -1
+  %cmp.not = icmp eq i32 %nl.014, 0
+  br i1 %cmp.not, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.body4:
+  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body4 ]
+  %j.112 = phi i32 [ -1, %for.cond1.preheader ], [ %j.2, %for.body4 ]
+  %arrayidx = getelementptr inbounds double, ptr %a, i64 %indvars.iv
+  %0 = load double, ptr %arrayidx, align 8
+  %cmp5 = fcmp ogt double %0, 3.000000e+00
+  %1 = trunc i64 %indvars.iv to i32
+  %j.2 = select i1 %cmp5, i32 %1, i32 %j.112
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 32000
+  br i1 %exitcond.not, label %for.cond.cleanup3, label %for.body4
+}
 
 ; Negative tests
 
@@ -296,10 +461,9 @@
   ret i32 %6
 }
 
-
-; We don't support selecting loop-variant values.
-define i32 @select_variant_i32_from_icmp(ptr nocapture readonly %v1, ptr nocapture readonly %v2, i64 %n) {
-; CHECK-LABEL: @select_variant_i32_from_icmp
+; We only support selecting loop-invariant values or AddRecs with a constant or loop-invariant step.
+define i32 @select_variant_load_from_icmp(ptr nocapture readonly %v1, ptr nocapture readonly %v2, i64 %n) {
+; CHECK-LABEL: @select_variant_load_from_icmp
 ; CHECK-NOT: vector.body
 entry:
   br label %for.body
@@ -321,7 +485,6 @@
   ret i32 %7
 }
 
-
 ; We only support selects where the input comes from the same PHI as the
 ; reduction PHI. In the example below, the select uses the induction
 ; variable input and the icmp uses the reduction PHI.
diff --git a/llvm/test/Transforms/LoopVectorize/select-min-index.ll b/llvm/test/Transforms/LoopVectorize/select-min-index.ll
--- a/llvm/test/Transforms/LoopVectorize/select-min-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-min-index.ll
@@ -1,6 +1,4 @@
 ; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s
-; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S %s | FileCheck %s
-; RUN: opt -passes=loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -S %s | FileCheck %s
 
 ; Test cases for selecting the index with the minimum value.
@@ -81,9 +79,59 @@
   ret i64 %res
 }
 
-define i64 @test_not_vectorize_select_no_min_reduction(ptr %src) {
-; CHECK-LABEL: @test_not_vectorize_select_no_min_reduction(
-; CHECK-NOT: vector.body:
+define i64 @test_vectorize_select_no_min_reduction(ptr %src) {
+; CHECK-LABEL: @test_vectorize_select_no_min_reduction(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[SRC:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[TMP3]] = add <4 x i64> [[WIDE_LOAD]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ugt <4 x i64> [[TMP4]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i64> [[TMP6]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]])
+; CHECK-NEXT: [[RDX_SELECT_MASK:%.*]] = select <4 x i1> [[RDX_SELECT_CMP]], <4 x i64> [[TMP6]], <4 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[RDX_SELECT_MASK]])
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP8]], i64 [[TMP9]], i64 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, 0
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[SCALAR_RECUR]], [[L]]
+; CHECK-NEXT: [[MIN_VAL_NEXT]] = add i64 [[L]], 1
+; CHECK-NEXT: [[FOO:%.*]] = call i64 @llvm.umin.i64(i64 [[SCALAR_RECUR]], i64 [[L]])
+; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]]
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], [[LOOP]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i64 [[RES]]
 ;
 entry:
   br label %loop
@@ -107,7 +155,6 @@
   ret i64 %res
 }
 
-
 define i64 @test_not_vectorize_cmp_value(i64 %x) {
 ; CHECK-LABEL: @test_not_vectorize_cmp_value(
 ; CHECK-NOT: vector.body: