diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -619,8 +619,73 @@ enum class LoopInductionVariable { None, Increasing, Decreasing }; -// We are looking for loops that do something like this: -// The reduction value (r) only has two states, in this example 0 or 3. +/// Handles the case where \p V is a casted induction variable (due to +/// IndVarSimplify, for instance). In cases where \p V is truncated, we check +/// that the original final IV value in the loop guard is the same type as the +/// target of the truncation, and that the loop guard icmp is signed, in which +/// case we don't need a runtime-check, and can directly say that the truncated +/// IV will not hit the sentinel value. +/// TODO: Only signed IVs are currently supported. +static PHINode *handleCastedIV(Value *V, ScalarEvolution *SE, Loop *Loop) { + if (auto *Cast = dyn_cast<CastInst>(V)) { + auto *Phi = dyn_cast<PHINode>(Cast->getOperand(0)); + Type *CastDest = Cast->getDestTy(); + if (Cast->getSrcTy()->getScalarSizeInBits() < + Cast->getDestTy()->getScalarSizeInBits()) + return Phi; + else if (BranchInst *Guard = Loop->getLoopGuardBranch()) { + if (auto *GuardCmp = dyn_cast<ICmpInst>(Guard->getCondition())) { + if (GuardCmp->getOperand(0)->getType() == CastDest && + GuardCmp->isSigned()) + return Phi; + } + } else if (auto Bounds = Loop->getBounds(*SE)) + if (auto *ConstFinal = dyn_cast<ConstantInt>(&Bounds->getFinalIVValue())) + if (ConstantInt::isValueValidForType( + CastDest, ConstFinal->getValue().getSExtValue())) + return Phi; + return nullptr; + } + return dyn_cast<PHINode>(V); +} + +/// Checks that \p V is a loop induction Phi that we can handle, for the +/// purposes of SelectCmp with a non loop-invariant value in the select. 
We can +/// currently handle indvars in loops that have a constant start value that's +/// not INT_MIN (in the case of LoopInductionVariable::Increasing) or INT_MAX +/// (in the case of LoopInductionVariable::Decreasing), and doesn't wrap to +/// potentially hit these values, since these values are the sentinel values of +/// the reductions. +static LoopInductionVariable getLoopInduction(Value *V, ScalarEvolution *SE, + Loop *Loop) { + if (!SE) + return LoopInductionVariable::None; + + if (auto *Phi = handleCastedIV(V, SE, Loop)) { + InductionDescriptor ID; + if (InductionDescriptor::isInductionPHI(Phi, Loop, SE, ID)) { + const auto *AR = cast<SCEVAddRecExpr>(SE->getSCEV(Phi)); + if (!AR->hasNoSignedWrap()) + return LoopInductionVariable::None; + + if (const auto *IVStartValue = + dyn_cast<ConstantInt>(ID.getStartValue())) { + const SCEV *Step = ID.getStep(); + if (SE->isKnownPositive(Step) && + !IVStartValue->isMinValue(/* Signed */ true)) + return LoopInductionVariable::Increasing; + else if (SE->isKnownNegative(Step) && + !IVStartValue->isMaxValue(/* Signed */ true)) + return LoopInductionVariable::Decreasing; + } + } + } + + return LoopInductionVariable::None; +} + +// We are looking for the following two types of loops: +// 1) The reduction value (r) only has two states, in this example 0 or 3. 
// int r = 0; // for (int i = 0; i < n; i++) { // if (src[i] > 3) @@ -730,38 +795,10 @@ else return InstDesc(false, I); - auto GetLoopInduction = [&SE, &Loop](Value *V) { - auto *Phi = dyn_cast<PHINode>(V); - if (!SE || !Phi) - return LoopInductionVariable::None; - - InductionDescriptor ID; - if (!InductionDescriptor::isInductionPHI(Phi, Loop, SE, ID)) - return LoopInductionVariable::None; - - const auto *AR = cast<SCEVAddRecExpr>(SE->getSCEV(V)); - if (!AR->hasNoSignedWrap()) - return LoopInductionVariable::None; - - const auto *IVStartValue = dyn_cast<ConstantInt>(ID.getStartValue()); - if (!IVStartValue) - return LoopInductionVariable::None; - - const SCEV *Step = ID.getStep(); - if (SE->isKnownPositive(Step) && - !IVStartValue->isMinValue(/* Signed */ true)) - return LoopInductionVariable::Increasing; - else if (SE->isKnownNegative(Step) && - !IVStartValue->isMaxValue(/* Signed */ true)) - return LoopInductionVariable::Decreasing; - else - return LoopInductionVariable::None; - }; - // We are looking for selects of the form: // select(cmp(), phi, loop_induction) or // select(cmp(), loop_induction, phi) - switch (GetLoopInduction(NonRdxPhi)) { + switch (getLoopInduction(NonRdxPhi, SE, Loop)) { case LoopInductionVariable::Increasing: return InstDesc(I, isa(I->getOperand(0)) ? 
RecurKind::IFindLastIV diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll @@ -1955,11 +1955,217 @@ ret i64 %spec.select } -; Negative tests - -define i32 @not_vectorized_select_icmp_const_truncated_iv_widened_exit(ptr nocapture readonly %a, i32 %n) { -; CHECK-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_widened_exit -; CHECK-NOT: vector.body: +define i32 @select_icmp_const_truncated_iv_widened_exit(ptr nocapture readonly %a, i32 %n) { +; CHECK-VF4IC1-LABEL: @select_icmp_const_truncated_iv_widened_exit( +; CHECK-VF4IC1-NEXT: entry: +; CHECK-VF4IC1-NEXT: [[CMP_SGT:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-VF4IC1-NEXT: br i1 [[CMP_SGT]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK-VF4IC1: for.body.preheader: +; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4IC1: vector.ph: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4IC1: vector.body: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds 
i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], +; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-VF4IC1: middle.block: +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]]) +; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP6]], -2147483648 +; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP6]], i32 331 +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4IC1: scalar.ph: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 331, [[FOR_BODY_PREHEADER]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4IC1: for.body: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP7]], 3 +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], 
i32 [[TMP8]], i32 [[RDX]] +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-VF4IC1: exit.loopexit: +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: br label [[EXIT]] +; CHECK-VF4IC1: exit: +; CHECK-VF4IC1-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ] +; CHECK-VF4IC1-NEXT: ret i32 [[RDX_LCSSA]] +; +; CHECK-VF4IC4-LABEL: @select_icmp_const_truncated_iv_widened_exit( +; CHECK-VF4IC4-NEXT: entry: +; CHECK-VF4IC4-NEXT: [[CMP_SGT:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-VF4IC4-NEXT: br i1 [[CMP_SGT]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK-VF4IC4: for.body.preheader: +; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4IC4: vector.ph: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4IC4: vector.body: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ , 
[[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], +; CHECK-VF4IC4-NEXT: [[STEP_ADD4:%.*]] = add <4 x i32> [[STEP_ADD]], +; CHECK-VF4IC4-NEXT: [[STEP_ADD5:%.*]] = add <4 x i32> [[STEP_ADD4]], +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 4 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 12 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], +; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD7]], +; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD8]], +; CHECK-VF4IC4-NEXT: [[TMP15:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD9]], +; CHECK-VF4IC4-NEXT: [[TMP16]] = select <4 x i1> [[TMP12]], <4 x 
i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[TMP17]] = select <4 x i1> [[TMP13]], <4 x i32> [[STEP_ADD]], <4 x i32> [[VEC_PHI1]] +; CHECK-VF4IC4-NEXT: [[TMP18]] = select <4 x i1> [[TMP14]], <4 x i32> [[STEP_ADD4]], <4 x i32> [[VEC_PHI2]] +; CHECK-VF4IC4-NEXT: [[TMP19]] = select <4 x i1> [[TMP15]], <4 x i32> [[STEP_ADD5]], <4 x i32> [[VEC_PHI3]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD5]], +; CHECK-VF4IC4-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-VF4IC4: middle.block: +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP16]], <4 x i32> [[TMP17]]) +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX10:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[RDX_MINMAX]], <4 x i32> [[TMP18]]) +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX11:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[RDX_MINMAX10]], <4 x i32> [[TMP19]]) +; CHECK-VF4IC4-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[RDX_MINMAX11]]) +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP21]], -2147483648 +; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP21]], i32 331 +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4IC4: scalar.ph: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 331, [[FOR_BODY_PREHEADER]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4IC4: for.body: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] 
] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP22:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP22]], 3 +; CHECK-VF4IC4-NEXT: [[TMP23:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP23]], i32 [[RDX]] +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-VF4IC4: exit.loopexit: +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: br label [[EXIT]] +; CHECK-VF4IC4: exit: +; CHECK-VF4IC4-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ] +; CHECK-VF4IC4-NEXT: ret i32 [[RDX_LCSSA]] +; +; CHECK-VF1IC4-LABEL: @select_icmp_const_truncated_iv_widened_exit( +; CHECK-VF1IC4-NEXT: entry: +; CHECK-VF1IC4-NEXT: [[CMP_SGT:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-VF1IC4-NEXT: br i1 [[CMP_SGT]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK-VF1IC4: for.body.preheader: +; CHECK-VF1IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF1IC4: vector.ph: +; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF1IC4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF1IC4: vector.body: +; CHECK-VF1IC4-NEXT: 
[[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i32 [ -2147483648, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ -2147483648, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ -2147483648, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i32 [ -2147483648, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32 +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 0 +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], 1 +; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 2 +; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], 3 +; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 1 +; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 2 +; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 3 +; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]] +; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = icmp sgt i64 [[TMP13]], 3 +; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = icmp sgt i64 [[TMP14]], 3 +; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = icmp sgt i64 [[TMP15]], 3 +; CHECK-VF1IC4-NEXT: [[TMP20:%.*]] = icmp sgt i64 
[[TMP16]], 3 +; CHECK-VF1IC4-NEXT: [[TMP21]] = select i1 [[TMP17]], i32 [[TMP1]], i32 [[VEC_PHI]] +; CHECK-VF1IC4-NEXT: [[TMP22]] = select i1 [[TMP18]], i32 [[TMP2]], i32 [[VEC_PHI1]] +; CHECK-VF1IC4-NEXT: [[TMP23]] = select i1 [[TMP19]], i32 [[TMP3]], i32 [[VEC_PHI2]] +; CHECK-VF1IC4-NEXT: [[TMP24]] = select i1 [[TMP20]], i32 [[TMP4]], i32 [[VEC_PHI3]] +; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4 +; CHECK-VF1IC4-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-VF1IC4: middle.block: +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP21]], i32 [[TMP22]]) +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX4:%.*]] = call i32 @llvm.smax.i32(i32 [[RDX_MINMAX]], i32 [[TMP23]]) +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX5:%.*]] = call i32 @llvm.smax.i32(i32 [[RDX_MINMAX4]], i32 [[TMP24]]) +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[RDX_MINMAX5]], -2147483648 +; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[RDX_MINMAX5]], i32 331 +; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF1IC4: scalar.ph: +; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 331, [[FOR_BODY_PREHEADER]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF1IC4: for.body: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; 
CHECK-VF1IC4-NEXT: [[TMP26:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF1IC4-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP26]], 3 +; CHECK-VF1IC4-NEXT: [[TMP27:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP27]], i32 [[RDX]] +; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-VF1IC4: exit.loopexit: +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: br label [[EXIT]] +; CHECK-VF1IC4: exit: +; CHECK-VF1IC4-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ] +; CHECK-VF1IC4-NEXT: ret i32 [[RDX_LCSSA]] ; entry: %cmp.sgt = icmp sgt i32 %n, 0 @@ -1986,9 +2192,187 @@ ret i32 %rdx.lcssa } -define i32 @not_vectorized_select_icmp_const_truncated_iv_const_exit(ptr nocapture readonly %a) { -; CHECK-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_const_exit -; CHECK-NOT: vector.body: +define i32 @select_icmp_const_truncated_iv_const_exit(ptr nocapture readonly %a) { +; CHECK-VF4IC1-LABEL: @select_icmp_const_truncated_iv_const_exit( +; CHECK-VF4IC1-NEXT: entry: +; CHECK-VF4IC1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4IC1: vector.ph: +; CHECK-VF4IC1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4IC1: vector.body: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; 
CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], +; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20000 +; CHECK-VF4IC1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-VF4IC1: middle.block: +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]]) +; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP6]], -2147483648 +; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP6]], i32 331 +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 20000, 20000 +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4IC1: scalar.ph: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 20000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 331, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4IC1: for.body: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP7]], 3 +; CHECK-VF4IC1-NEXT: 
[[TMP8:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP8]], i32 [[RDX]] +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 20000 +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-VF4IC1: exit: +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; +; CHECK-VF4IC4-LABEL: @select_icmp_const_truncated_iv_const_exit( +; CHECK-VF4IC4-NEXT: entry: +; CHECK-VF4IC4-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4IC4: vector.ph: +; CHECK-VF4IC4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4IC4: vector.body: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], +; CHECK-VF4IC4-NEXT: [[STEP_ADD4:%.*]] = add <4 x i32> [[STEP_ADD]], +; CHECK-VF4IC4-NEXT: [[STEP_ADD5:%.*]] = add <4 x i32> [[STEP_ADD4]], +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 +; CHECK-VF4IC4-NEXT: 
[[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 4 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 12 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], +; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD7]], +; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD8]], +; CHECK-VF4IC4-NEXT: [[TMP15:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD9]], +; CHECK-VF4IC4-NEXT: [[TMP16]] = select <4 x i1> [[TMP12]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[TMP17]] = select <4 x i1> [[TMP13]], <4 x i32> [[STEP_ADD]], <4 x i32> [[VEC_PHI1]] +; CHECK-VF4IC4-NEXT: [[TMP18]] = select <4 x i1> [[TMP14]], <4 x i32> [[STEP_ADD4]], <4 x i32> [[VEC_PHI2]] +; CHECK-VF4IC4-NEXT: [[TMP19]] = select <4 x i1> [[TMP15]], <4 x i32> [[STEP_ADD5]], <4 x i32> [[VEC_PHI3]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD5]], +; CHECK-VF4IC4-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20000 +; CHECK-VF4IC4-NEXT: br i1 [[TMP20]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-VF4IC4: middle.block: +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP16]], <4 x i32> [[TMP17]]) +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX10:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[RDX_MINMAX]], <4 x i32> [[TMP18]]) +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX11:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[RDX_MINMAX10]], <4 x i32> [[TMP19]]) +; CHECK-VF4IC4-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[RDX_MINMAX11]]) +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP21]], -2147483648 +; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP21]], i32 331 +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 20000, 20000 +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4IC4: scalar.ph: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 20000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 331, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4IC4: for.body: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP22:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP22]], 3 +; CHECK-VF4IC4-NEXT: [[TMP23:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP23]], i32 [[RDX]] +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 20000 +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], 
label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-VF4IC4: exit: +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; +; CHECK-VF1IC4-LABEL: @select_icmp_const_truncated_iv_const_exit( +; CHECK-VF1IC4-NEXT: entry: +; CHECK-VF1IC4-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF1IC4: vector.ph: +; CHECK-VF1IC4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF1IC4: vector.body: +; CHECK-VF1IC4-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i32 [ -2147483648, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ -2147483648, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ -2147483648, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i32 [ -2147483648, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32 +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 0 +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], 1 +; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 2 +; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], 3 +; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 1 +; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 2 +; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 3 +; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]] +; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = 
getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = icmp sgt i64 [[TMP13]], 3 +; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = icmp sgt i64 [[TMP14]], 3 +; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = icmp sgt i64 [[TMP15]], 3 +; CHECK-VF1IC4-NEXT: [[TMP20:%.*]] = icmp sgt i64 [[TMP16]], 3 +; CHECK-VF1IC4-NEXT: [[TMP21]] = select i1 [[TMP17]], i32 [[TMP1]], i32 [[VEC_PHI]] +; CHECK-VF1IC4-NEXT: [[TMP22]] = select i1 [[TMP18]], i32 [[TMP2]], i32 [[VEC_PHI1]] +; CHECK-VF1IC4-NEXT: [[TMP23]] = select i1 [[TMP19]], i32 [[TMP3]], i32 [[VEC_PHI2]] +; CHECK-VF1IC4-NEXT: [[TMP24]] = select i1 [[TMP20]], i32 [[TMP4]], i32 [[VEC_PHI3]] +; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4 +; CHECK-VF1IC4-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20000 +; CHECK-VF1IC4-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-VF1IC4: middle.block: +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP21]], i32 [[TMP22]]) +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX4:%.*]] = call i32 @llvm.smax.i32(i32 [[RDX_MINMAX]], i32 [[TMP23]]) +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX5:%.*]] = call i32 @llvm.smax.i32(i32 [[RDX_MINMAX4]], i32 [[TMP24]]) +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[RDX_MINMAX5]], -2147483648 +; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[RDX_MINMAX5]], i32 331 +; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 20000, 20000 +; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF1IC4: scalar.ph: +; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 20000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; 
CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 331, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF1IC4: for.body: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP26:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF1IC4-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP26]], 3 +; CHECK-VF1IC4-NEXT: [[TMP27:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP27]], i32 [[RDX]] +; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 20000 +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-VF1IC4: exit: +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] ; entry: br label %for.body @@ -2009,6 +2393,8 @@ ret i32 %spec.select } +; Negative tests + define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit(ptr nocapture readonly %a, i64 %n) { ; CHECK-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit ; CHECK-NOT: vector.body: