diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -36,20 +36,24 @@ /// These are the kinds of recurrences that we support. enum class RecurKind { - None, ///< Not a recurrence. - Add, ///< Sum of integers. - Mul, ///< Product of integers. - Or, ///< Bitwise or logical OR of integers. - And, ///< Bitwise or logical AND of integers. - Xor, ///< Bitwise or logical XOR of integers. - SMin, ///< Signed integer min implemented in terms of select(cmp()). - SMax, ///< Signed integer max implemented in terms of select(cmp()). - UMin, ///< Unisgned integer min implemented in terms of select(cmp()). - UMax, ///< Unsigned integer max implemented in terms of select(cmp()). - FAdd, ///< Sum of floats. - FMul, ///< Product of floats. - FMin, ///< FP min implemented in terms of select(cmp()). - FMax ///< FP max implemented in terms of select(cmp()). + None, ///< Not a recurrence. + Add, ///< Sum of integers. + Mul, ///< Product of integers. + Or, ///< Bitwise or logical OR of integers. + And, ///< Bitwise or logical AND of integers. + Xor, ///< Bitwise or logical XOR of integers. + SMin, ///< Signed integer min implemented in terms of select(cmp()). + SMax, ///< Signed integer max implemented in terms of select(cmp()). + UMin, ///< Unisgned integer min implemented in terms of select(cmp()). + UMax, ///< Unsigned integer max implemented in terms of select(cmp()). + FAdd, ///< Sum of floats. + FMul, ///< Product of floats. + FMin, ///< FP min implemented in terms of select(cmp()). + FMax, ///< FP max implemented in terms of select(cmp()). + SelectICmp, ///< Integer select(icmp(),x,y) where one of (x,y) is loop + ///< invariant + SelectFCmp ///< Integer select(fcmp(),x,y) where one of (x,y) is loop + ///< invariant }; /// The RecurrenceDescriptor is used to identify recurrences variables in a @@ -116,7 +120,7 @@ /// select(icmp()) this function advances the instruction pointer 'I' from the /// compare instruction to the select instruction and stores this pointer in /// 'PatternLastInst' member of the returned struct. - static InstDesc isRecurrenceInstr(Instruction *I, RecurKind Kind, + static InstDesc isRecurrenceInstr(Loop *L, Instruction *I, RecurKind Kind, InstDesc &Prev, FastMathFlags FuncFMF); /// Returns true if instruction I has multiple uses in Insts @@ -135,13 +139,21 @@ static InstDesc isMinMaxPattern(Instruction *I, RecurKind Kind, const InstDesc &Prev); + /// Returns a struct describing whether the instruction is either a + /// Select(ICmp(A, B), X, Y), or + /// Select(FCmp(A, B), X, Y) + /// where one of (X, Y) is a loop invariant integer and the other is a PHI + /// value. \p Prev specifies the description of an already processed select + /// instruction, so its corresponding cmp can be matched to it. + static InstDesc isSelectCmpPattern(Loop *Loop, Instruction *I, + InstDesc &Prev); + /// Returns a struct describing if the instruction is a /// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern. static InstDesc isConditionalRdxPattern(RecurKind Kind, Instruction *I); /// Returns identity corresponding to the RecurrenceKind. - static Constant *getRecurrenceIdentity(RecurKind K, Type *Tp, - FastMathFlags FMF); + Value *getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF); /// Returns the opcode corresponding to the RecurrenceKind. 
static unsigned getOpcode(RecurKind Kind); @@ -221,6 +233,12 @@ return isIntMinMaxRecurrenceKind(Kind) || isFPMinMaxRecurrenceKind(Kind); } + /// Returns true if the recurrence kind is of the form + /// select(cmp(),x,y) where one of (x,y) is loop invariant. + static bool isSelectCmpRecurrenceKind(RecurKind Kind) { + return Kind == RecurKind::SelectICmp || Kind == RecurKind::SelectFCmp; + } + /// Returns the type of the recurrence. This type can be narrower than the /// actual type of the Phi if the recurrence has been type-promoted. Type *getRecurrenceType() const { return RecurrenceType; } diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -348,6 +348,15 @@ SinkAndHoistLICMFlags *LICMFlags = nullptr, OptimizationRemarkEmitter *ORE = nullptr); +/// See RecurrenceDescriptor::isSelectCmpPattern for a description of the +/// pattern we are trying to match. In this pattern we are only ever selecting +/// between two values: 1) an initial PHI start value, and 2) a loop invariant +/// value. This function uses \p LoopExitInst to determine 2), which we then use +/// to select between \p Left and \p Right. Any lane value in \p Left that +/// matches 2) will be merged into \p Right. +Value *createSelectCmpOp(IRBuilderBase &Builder, Value *StartVal, RecurKind RK, + Value *Left, Value *Right); + /// Returns a Min/Max operation corresponding to MinMaxRecurrenceKind. /// The Builder's fast-math-flags must be set to propagate the expected values. Value *createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, @@ -375,12 +384,22 @@ RecurKind RdxKind, ArrayRef RedOps = None); +/// Create a target reduction of the given vector \p Src for a reduction of the +/// kind RecurKind::SelectICmp or RecurKind::SelectFCmp. The reduction operation +/// is described by \p Desc. +Value *createSelectCmpTargetReduction(IRBuilderBase &B, + const TargetTransformInfo *TTI, + Value *Src, + const RecurrenceDescriptor &Desc, + PHINode *OrigPhi); + /// Create a generic target reduction using a recurrence descriptor \p Desc /// The target is queried to determine if intrinsics or shuffle sequences are /// required to implement the reduction. /// Fast-math-flags are propagated using the RecurrenceDescriptor. Value *createTargetReduction(IRBuilderBase &B, const TargetTransformInfo *TTI, - const RecurrenceDescriptor &Desc, Value *Src); + const RecurrenceDescriptor &Desc, Value *Src, + PHINode *OrigPhi = nullptr); /// Create an ordered reduction intrinsic using the given recurrence /// descriptor \p Desc. diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -62,6 +62,8 @@ case RecurKind::SMin: case RecurKind::UMax: case RecurKind::UMin: + case RecurKind::SelectICmp: + case RecurKind::SelectFCmp: return true; } return false; @@ -327,7 +329,7 @@ // the starting value (the Phi or an AND instruction if the Phi has been // type-promoted). if (Cur != Start) { - ReduxDesc = isRecurrenceInstr(Cur, Kind, ReduxDesc, FuncFMF); + ReduxDesc = isRecurrenceInstr(TheLoop, Cur, Kind, ReduxDesc, FuncFMF); if (!ReduxDesc.isRecurrence()) return false; // FIXME: FMF is allowed on phi, but propagation is not handled correctly. @@ -360,6 +362,7 @@ // A reduction operation must only have one use of the reduction value. 
if (!IsAPhi && !IsASelect && !isMinMaxRecurrenceKind(Kind) && + !isSelectCmpRecurrenceKind(Kind) && hasMultipleUsesOf(Cur, VisitedInsts, 1)) return false; @@ -367,10 +370,10 @@ if (IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts)) return false; - if (isIntMinMaxRecurrenceKind(Kind) && + if ((isIntMinMaxRecurrenceKind(Kind) || Kind == RecurKind::SelectICmp) && (isa<ICmpInst>(Cur) || isa<SelectInst>(Cur))) ++NumCmpSelectPatternInst; - if (isFPMinMaxRecurrenceKind(Kind) && + if ((isFPMinMaxRecurrenceKind(Kind) || Kind == RecurKind::SelectFCmp) && (isa<FCmpInst>(Cur) || isa<SelectInst>(Cur))) ++NumCmpSelectPatternInst; @@ -423,7 +426,8 @@ ((!isa<ICmpInst>(UI) && !isa<FCmpInst>(UI) && !isa<SelectInst>(UI)) || (!isConditionalRdxPattern(Kind, UI).isRecurrence() && - !isMinMaxPattern(UI, Kind, IgnoredVal) + !isMinMaxPattern(UI, Kind, IgnoredVal).isRecurrence() && + !isSelectCmpPattern(TheLoop, UI, IgnoredVal) .isRecurrence()))) return false; @@ -442,6 +446,9 @@ NumCmpSelectPatternInst != 0) return false; + if (isSelectCmpRecurrenceKind(Kind) && NumCmpSelectPatternInst != 1) + return false; + if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction) return false; @@ -508,6 +515,63 @@ return true; } +// We are looking for loops that do something like this: +// int r = 0; +// for (int i = 0; i < n; i++) { +// if (src[i] > 3) +// r = 3; +// } +// where the reduction value (r) only has two states, in this example 0 or 3. +// The generated LLVM IR for this type of loop will be like this: +// for.body: +// %r = phi i32 [ %spec.select, %for.body ], [ 0, %entry ] +// ... +// %cmp = icmp sgt i32 %5, 3 +// %spec.select = select i1 %cmp, i32 3, i32 %r +// ... +// In general we can support vectorization of loops where 'r' flips between +// any two non-constants, provided they are loop invariant. The only thing +// we actually care about at the end of the loop is whether or not any lane +// in the selected vector is different from the start value. The final +// across-vector reduction after the loop simply involves choosing the start +// value if nothing changed (0 in the example above) or the other selected +// value (3 in the example above). +RecurrenceDescriptor::InstDesc +RecurrenceDescriptor::isSelectCmpPattern(Loop *Loop, Instruction *I, + InstDesc &Prev) { + // We must handle the select(cmp(),x,y) as a single instruction. Advance to + // the select. + CmpInst::Predicate Pred; + if (match(I, m_OneUse(m_Cmp(Pred, m_Value(), m_Value())))) { + if (auto *Select = dyn_cast<SelectInst>(*I->user_begin())) + return InstDesc(Select, Prev.getRecKind()); + } + + // Only match select with single use cmp condition. + if (!match(I, m_Select(m_OneUse(m_Cmp(Pred, m_Value(), m_Value())), m_Value(), + m_Value()))) + return InstDesc(false, I); + + SelectInst *SI = cast<SelectInst>(I); + Value *NonPhi = nullptr; + + if (isa<PHINode>(SI->getTrueValue())) + NonPhi = SI->getFalseValue(); + else if (isa<PHINode>(SI->getFalseValue())) + NonPhi = SI->getTrueValue(); + else + return InstDesc(false, I); + + // We are looking for selects of the form: + // select(cmp(), phi, loop_invariant) or + // select(cmp(), loop_invariant, phi) + if (!Loop->isLoopInvariant(NonPhi)) + return InstDesc(false, I); + + return InstDesc(I, isa<ICmpInst>(I->getOperand(0)) ?
RecurKind::SelectICmp + : RecurKind::SelectFCmp); +} + RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isMinMaxPattern(Instruction *I, RecurKind Kind, const InstDesc &Prev) { @@ -602,7 +666,7 @@ } RecurrenceDescriptor::InstDesc -RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurKind Kind, +RecurrenceDescriptor::isRecurrenceInstr(Loop *L, Instruction *I, RecurKind Kind, InstDesc &Prev, FastMathFlags FuncFMF) { assert(Prev.getRecKind() == RecurKind::None || Prev.getRecKind() == Kind); switch (I->getOpcode()) { @@ -636,6 +700,8 @@ case Instruction::FCmp: case Instruction::ICmp: case Instruction::Call: + if (isSelectCmpRecurrenceKind(Kind)) + return isSelectCmpPattern(L, I, Prev); if (isIntMinMaxRecurrenceKind(Kind) || (((FuncFMF.noNaNs() && FuncFMF.noSignedZeros()) || (isa<FPMathOperator>(I) && I->hasNoNaNs() && @@ -664,7 +730,6 @@ RecurrenceDescriptor &RedDes, DemandedBits *DB, AssumptionCache *AC, DominatorTree *DT) { - BasicBlock *Header = TheLoop->getHeader(); Function &F = *Header->getParent(); FastMathFlags FMF; @@ -709,6 +774,12 @@ LLVM_DEBUG(dbgs() << "Found a UMIN reduction PHI." << *Phi << "\n"); return true; } + if (AddReductionVar(Phi, RecurKind::SelectICmp, TheLoop, FMF, RedDes, DB, AC, + DT)) { + LLVM_DEBUG(dbgs() << "Found an integer conditional select reduction PHI." + << *Phi << "\n"); + return true; + } if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT)) { LLVM_DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n"); return true; @@ -725,6 +796,12 @@ LLVM_DEBUG(dbgs() << "Found a float MIN reduction PHI." << *Phi << "\n"); return true; } + if (AddReductionVar(Phi, RecurKind::SelectFCmp, TheLoop, FMF, RedDes, DB, AC, + DT)) { + LLVM_DEBUG(dbgs() << "Found a float conditional select reduction PHI." + << *Phi << "\n"); + return true; + } // Not a reduction of known type. return false; } @@ -831,8 +908,8 @@ /// This function returns the identity element (or neutral element) for /// the operation K.
-Constant *RecurrenceDescriptor::getRecurrenceIdentity(RecurKind K, Type *Tp, - FastMathFlags FMF) { +Value *RecurrenceDescriptor::getRecurrenceIdentity(RecurKind K, Type *Tp, + FastMathFlags FMF) { switch (K) { case RecurKind::Xor: case RecurKind::Add: @@ -872,6 +949,9 @@ return ConstantFP::getInfinity(Tp, true); case RecurKind::FMax: return ConstantFP::getInfinity(Tp, false); + case RecurKind::SelectICmp: + case RecurKind::SelectFCmp: + return getRecurrenceStartValue(); default: llvm_unreachable("Unknown recurrence kind"); } @@ -897,9 +977,11 @@ case RecurKind::SMin: case RecurKind::UMax: case RecurKind::UMin: + case RecurKind::SelectICmp: return Instruction::ICmp; case RecurKind::FMax: case RecurKind::FMin: + case RecurKind::SelectFCmp: return Instruction::FCmp; default: llvm_unreachable("Unknown recurrence operation"); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1949,6 +1949,8 @@ case RecurKind::UMax: case RecurKind::FMin: case RecurKind::FMax: + case RecurKind::SelectICmp: + case RecurKind::SelectFCmp: return true; default: return false; diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -889,6 +889,15 @@ return true; } +Value *llvm::createSelectCmpOp(IRBuilderBase &Builder, Value *StartVal, + RecurKind RK, Value *Left, Value *Right) { + if (auto VTy = dyn_cast<VectorType>(Left->getType())) + StartVal = Builder.CreateVectorSplat(VTy->getElementCount(), StartVal); + Value *Cmp = + Builder.CreateCmp(CmpInst::ICMP_NE, Left, StartVal, "rdx.select.cmp"); + return Builder.CreateSelect(Cmp, Left, Right, "rdx.select"); +} + Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, Value *Right) { CmpInst::Predicate Pred; @@ -992,6 +1001,46 @@ return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); } +Value *llvm::createSelectCmpTargetReduction(IRBuilderBase &Builder, + const TargetTransformInfo *TTI, + Value *Src, + const RecurrenceDescriptor &Desc, + PHINode *OrigPhi) { + assert(RecurrenceDescriptor::isSelectCmpRecurrenceKind( + Desc.getRecurrenceKind()) && + "Unexpected reduction kind"); + Value *InitVal = Desc.getRecurrenceStartValue(); + Value *NewVal = nullptr; + + // First use the original phi to determine the new value we're trying to + // select from in the loop. + SelectInst *SI = nullptr; + for (auto *U : OrigPhi->users()) { + if ((SI = dyn_cast<SelectInst>(U))) + break; + } + assert(SI && "One user of the original phi should be a select"); + + if (SI->getTrueValue() == OrigPhi) + NewVal = SI->getFalseValue(); + else { + assert(SI->getFalseValue() == OrigPhi && + "At least one input to the select should be the original Phi"); + NewVal = SI->getTrueValue(); + } + + // Create a splat vector with the new value and compare this to the vector + // we want to reduce. + ElementCount EC = cast<VectorType>(Src->getType())->getElementCount(); + Value *Right = Builder.CreateVectorSplat(EC, InitVal); + Value *Cmp = + Builder.CreateCmp(CmpInst::ICMP_NE, Src, Right, "rdx.select.cmp"); + + // If any predicate is true it means that we want to select the new value.
+ Cmp = Builder.CreateOrReduce(Cmp); + return Builder.CreateSelect(Cmp, NewVal, InitVal, "rdx.select"); +} + Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, const TargetTransformInfo *TTI, Value *Src, RecurKind RdxKind, @@ -1032,14 +1081,19 @@ Value *llvm::createTargetReduction(IRBuilderBase &B, const TargetTransformInfo *TTI, - const RecurrenceDescriptor &Desc, - Value *Src) { + const RecurrenceDescriptor &Desc, Value *Src, + PHINode *OrigPhi) { // TODO: Support in-order reductions based on the recurrence descriptor. // All ops in the reduction inherit fast-math-flags from the recurrence // descriptor. IRBuilderBase::FastMathFlagGuard FMFGuard(B); B.setFastMathFlags(Desc.getFastMathFlags()); - return createSimpleTargetReduction(B, TTI, Src, Desc.getRecurrenceKind()); + + RecurKind RK = Desc.getRecurrenceKind(); + if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) + return createSelectCmpTargetReduction(B, TTI, Src, Desc, OrigPhi); + + return createSimpleTargetReduction(B, TTI, Src, RK); } Value *llvm::createOrderedReduction(IRBuilderBase &B, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4414,9 +4414,11 @@ if (Op != Instruction::ICmp && Op != Instruction::FCmp) { ReducedPartRdx = Builder.CreateBinOp( (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); - } else { + } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) + ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, + ReducedPartRdx, RdxPart); + else ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); - } } } @@ -4424,7 +4426,7 @@ // target reduction in the loop using a Reduction recipe. if (VF.isVector() && !PhiR->isInLoop()) { ReducedPartRdx = - createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); + createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); // If the reduction can be performed in a smaller type, we need to extend // the reduction to the wider type before we branch to the original loop. if (PhiTy != RdxDesc.getRecurrenceType()) @@ -6518,6 +6520,22 @@ unsigned StoresIC = IC / (NumStores ? NumStores : 1); unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); + // There is little point in interleaving for reductions containing selects + // and compares when VF=1 since it may just create more overhead than it's + // worth for loops with small trip counts. This is because we still have to + // do the final reduction after the loop. + bool HasSelectCmpReductions = + HasReductions && + any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { + const RecurrenceDescriptor &RdxDesc = Reduction.second; + return RecurrenceDescriptor::isSelectCmpRecurrenceKind( + RdxDesc.getRecurrenceKind()); + }); + if (HasSelectCmpReductions) { + LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); + return 1; + } + // If we have a scalar reduction (vector reductions are already dealt with // by this point), we can increase the critical path length if the loop // we're interleaving is inside another loop. For tree-wise reductions @@ -9243,6 +9261,8 @@ RecipeBuilder.recordRecipeOf(R); // For min/max reducitons, where we have a pair of icmp/select, we also // need to record the ICmp recipe, so it can be removed later. 
+ assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && + "Only min/max recurrences allowed for inloop reductions"); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); } @@ -9566,6 +9586,8 @@ VPValue *ChainOp = Plan->getVPValue(Chain); unsigned FirstOpId; + assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && + "Only min/max recurrences allowed for inloop reductions"); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { assert(isa<VPWidenSelectRecipe>(WidenRecipe) && "Expected to replace a VPWidenSelectSC"); @@ -9738,10 +9760,10 @@ if (VPValue *Cond = getCondOp()) { Value *NewCond = State.get(Cond, Part); VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); - Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( + Value *Iden = RdxDesc->getRecurrenceIdentity( Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); - Constant *IdenVec = - ConstantVector::getSplat(VecTy->getElementCount(), Iden); + Value *IdenVec = + State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); NewVecOp = Select; } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1336,7 +1336,8 @@ Value *Iden = nullptr; RecurKind RK = RdxDesc.getRecurrenceKind(); - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) { + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) || + RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) { // MinMax reduction have the start value as their identify. if (ScalarPHI) { Iden = StartV; @@ -1347,12 +1348,11 @@ Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident"); } } else { - Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity( - RK, VecTy->getScalarType(), RdxDesc.getFastMathFlags()); - Iden = IdenC; + Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(), + RdxDesc.getFastMathFlags()); if (!ScalarPHI) { - Iden = ConstantVector::getSplat(State.VF, IdenC); + Iden = Builder.CreateVectorSplat(State.VF, Iden); IRBuilderBase::InsertPointGuard IPBuilder(Builder); Builder.SetInsertPoint(State.CFG.VectorPreHeader->getTerminator()); Constant *Zero = Builder.getInt32(0); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll @@ -0,0 +1,204 @@ +; RUN: opt -loop-vectorize -scalable-vectorization=preferred -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC1 +; RUN: opt -loop-vectorize -scalable-vectorization=preferred -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC4 + +target triple = "aarch64-linux-gnu" + +define i32 @select_const_i32_from_icmp(i32* nocapture readonly %v, i64 %n) #0 { +; CHECK-VF4IC1-LABEL: @select_const_i32_from_icmp +; CHECK-VF4IC1: vector.body: +; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] +; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load +; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq [[VEC_LOAD]], shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select [[VEC_ICMP]], [[VEC_PHI]], shufflevector ( insertelement
( poison, i32 7, i32 0), poison, zeroinitializer) +; CHECK-VF4IC1: middle.block: +; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne [[VEC_SEL]], shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[FIN_ICMP]]) +; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3 + +; CHECK-VF4IC4-LABEL: @select_const_i32_from_icmp +; CHECK-VF4IC4: vector.body: +; CHECK-VF4IC4: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ] +; CHECK-VF4IC4: [[VEC_PHI2:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ] +; CHECK-VF4IC4: [[VEC_PHI3:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ] +; CHECK-VF4IC4: [[VEC_PHI4:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ] +; CHECK-VF4IC4: [[VEC_ICMP1:%.*]] = icmp eq {{.*}}, shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_ICMP2:%.*]] = icmp eq {{.*}}, shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_ICMP3:%.*]] = icmp eq {{.*}}, shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_ICMP4:%.*]] = icmp eq {{.*}}, shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_SEL1]] = select [[VEC_ICMP1]], [[VEC_PHI1]], shufflevector ( insertelement ( poison, i32 7, i32 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_SEL2]] = select [[VEC_ICMP2]], [[VEC_PHI2]], shufflevector ( insertelement ( poison, i32 7, i32 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_SEL3]] = select [[VEC_ICMP3]], [[VEC_PHI3]], shufflevector ( insertelement ( poison, i32 7, i32 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_SEL4]] = select [[VEC_ICMP4]], [[VEC_PHI4]], shufflevector ( insertelement ( poison, i32 7, i32 0), poison, zeroinitializer) +; CHECK-VF4IC4: middle.block: +; CHECK-VF4IC4-NEXT: [[VEC_ICMP5:%.*]] = icmp ne [[VEC_SEL1]], shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_SEL5:%.*]] = select [[VEC_ICMP5]], [[VEC_SEL1]], [[VEC_SEL2]] +; CHECK-VF4IC4-NEXT: [[VEC_ICMP6:%.*]] = icmp ne [[VEC_SEL5]], shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_SEL6:%.*]] = select [[VEC_ICMP6]], [[VEC_SEL5]], [[VEC_SEL3]] +; CHECK-VF4IC4-NEXT: [[VEC_ICMP7:%.*]] = icmp ne [[VEC_SEL6]], shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_SEL7:%.*]] = select [[VEC_ICMP7]], [[VEC_SEL6]], [[VEC_SEL4]] +; CHECK-VF4IC4-NEXT: [[FIN_ICMP:%.*]] = icmp ne [[VEC_SEL7]], shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[FIN_ICMP]]) +; CHECK-VF4IC4-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3 +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %0 = phi i64 [ 0, %entry ], [ %6, %for.body ] + %1 = phi i32 [ 3, %entry ], [ %5, %for.body ] + %2 = getelementptr inbounds i32, i32* %v, 
i64 %0 + %3 = load i32, i32* %2, align 4 + %4 = icmp eq i32 %3, 3 + %5 = select i1 %4, i32 %1, i32 7 + %6 = add nuw nsw i64 %0, 1 + %7 = icmp eq i64 %6, %n + br i1 %7, label %exit, label %for.body, !llvm.loop !0 + +exit: ; preds = %for.body + ret i32 %5 +} + +define i32 @select_i32_from_icmp(i32* nocapture readonly %v, i32 %a, i32 %b, i64 %n) #0 { +; CHECK-VF4IC1-LABEL: @select_i32_from_icmp +; CHECK-VF4IC1: vector.ph: +; CHECK-VF4IC1: [[TMP1:%.*]] = insertelement poison, i32 %a, i32 0 +; CHECK-VF4IC1-NEXT: [[SPLAT_OF_A:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = insertelement poison, i32 %b, i32 0 +; CHECK-VF4IC1-NEXT: [[SPLAT_OF_B:%.*]] = shufflevector [[TMP2]], poison, zeroinitializer +; CHECK-VF4IC1: vector.body: +; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi [ [[SPLAT_OF_A]], %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] +; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load +; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq [[VEC_LOAD]], shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select [[VEC_ICMP]], [[VEC_PHI]], [[SPLAT_OF_B]] +; CHECK-VF4IC1: middle.block: +; CHECK-VF4IC1-NEXT: [[FIN_INS:%.*]] = insertelement poison, i32 %a, i32 0 +; CHECK-VF4IC1-NEXT: [[FIN_SPLAT:%.*]] = shufflevector [[FIN_INS]], poison, zeroinitializer +; CHECK-VF4IC1-NEXT: [[FIN_CMP:%.*]] = icmp ne [[VEC_SEL]], [[FIN_SPLAT]] +; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[FIN_CMP]]) +; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 %a + +; CHECK-VF4IC4-LABEL: @select_i32_from_icmp +; CHECK-VF4IC4: vector.body: +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %0 = phi i64 [ 0, %entry ], [ %6, %for.body ] + %1 = phi i32 [ %a, %entry ], [ %5, %for.body ] + %2 = getelementptr inbounds i32, i32* %v, i64 %0 + %3 = load i32, i32* %2, align 4 + %4 = icmp eq i32 %3, 3 + %5 = select i1 %4, i32 %1, i32 %b + %6 = add nuw nsw i64 %0, 1 + %7 = icmp eq i64 %6, %n + br i1 %7, label %exit, label %for.body, !llvm.loop !0 + +exit: ; preds = %for.body + ret i32 %5 +} + +define i32 @select_const_i32_from_fcmp(float* nocapture readonly %v, i64 %n) #0 { +; CHECK-VF4IC1-LABEL: @select_const_i32_from_fcmp +; CHECK-VF4IC1: vector.body: +; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 2, i32 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] +; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load +; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = fcmp fast ueq [[VEC_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i32 0), poison, zeroinitializer) +; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select [[VEC_ICMP]], [[VEC_PHI]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-VF4IC1: middle.block: +; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne [[VEC_SEL]], shufflevector ( insertelement ( poison, i32 2, i32 0), poison, zeroinitializer) +; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[FIN_ICMP]]) +; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 1, i32 2 + +; CHECK-VF4IC4-LABEL: @select_const_i32_from_fcmp +; CHECK-VF4IC4: vector.body: +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %0 = phi i64 [ 0, %entry ], [ %6, %for.body ] + %1 = phi i32 [ 2, %entry ], [ %5, %for.body ] + %2 = getelementptr inbounds float, float* %v, i64 %0 + %3 = load float, float* %2, align 4 + %4 = fcmp fast ueq float %3, 3.0 + %5 = select i1 %4, i32 
%1, i32 1 + %6 = add nuw nsw i64 %0, 1 + %7 = icmp eq i64 %6, %n + br i1 %7, label %exit, label %for.body, !llvm.loop !0 + +exit: ; preds = %for.body + ret i32 %5 +} + +define float @select_const_f32_from_icmp(i32* nocapture readonly %v, i64 %n) #0 { +; CHECK-VF4IC1-LABEL: @select_const_f32_from_icmp +; CHECK-VF4IC1-NOT: vector.body +; CHECK-VF4IC4-LABEL: @select_const_f32_from_icmp +; CHECK-VF4IC4-NOT: vector.body +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %0 = phi i64 [ 0, %entry ], [ %6, %for.body ] + %1 = phi fast float [ 3.0, %entry ], [ %5, %for.body ] + %2 = getelementptr inbounds i32, i32* %v, i64 %0 + %3 = load i32, i32* %2, align 4 + %4 = icmp eq i32 %3, 3 + %5 = select fast i1 %4, float %1, float 7.0 + %6 = add nuw nsw i64 %0, 1 + %7 = icmp eq i64 %6, %n + br i1 %7, label %exit, label %for.body, !llvm.loop !0 + +exit: ; preds = %for.body + ret float %5 +} + +define i32 @pred_select_const_i32_from_icmp(i32* noalias nocapture readonly %src1, i32* noalias nocapture readonly %src2, i64 %n) #0 { +; CHECK-VF4IC1-LABEL: @pred_select_const_i32_from_icmp +; CHECK-VF4IC1: vector.body: +; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 0, i32 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] +; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load +; CHECK-VF4IC1: [[MASK:%.*]] = icmp sgt [[VEC_LOAD]], shufflevector ( insertelement ( poison, i32 35, i32 0), poison, zeroinitializer) +; CHECK-VF4IC1: [[MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* {{%.*}}, i32 4, [[MASK]], poison) +; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq [[MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 2, i32 0), poison, zeroinitializer) +; CHECK-VF4IC1-NEXT: [[VEC_SEL_TMP:%.*]] = select [[VEC_ICMP]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer), [[VEC_PHI]] +; CHECK-VF4IC1: [[VEC_SEL:%.*]] = select [[MASK]], [[VEC_SEL_TMP]], [[VEC_PHI]] +; CHECK-VF4IC1: middle.block: +; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne [[VEC_SEL]], shufflevector ( insertelement ( poison, i32 0, i32 0), poison, zeroinitializer) +; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[FIN_ICMP]]) +; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 1, i32 0 + +; CHECK-VF4IC4-LABEL: @pred_select_const_i32_from_icmp +; CHECK-VF4IC4: vector.body: +entry: + br label %for.body + +for.body: ; preds = %entry, %for.inc + %i.013 = phi i64 [ %inc, %for.inc ], [ 0, %entry ] + %r.012 = phi i32 [ %r.1, %for.inc ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %src1, i64 %i.013 + %0 = load i32, i32* %arrayidx, align 4 + %cmp1 = icmp sgt i32 %0, 35 + br i1 %cmp1, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx2 = getelementptr inbounds i32, i32* %src2, i64 %i.013 + %1 = load i32, i32* %arrayidx2, align 4 + %cmp3 = icmp eq i32 %1, 2 + %spec.select = select i1 %cmp3, i32 1, i32 %r.012 + br label %for.inc + +for.inc: ; preds = %if.then, %for.body + %r.1 = phi i32 [ %r.012, %for.body ], [ %spec.select, %if.then ] + %inc = add nuw nsw i64 %i.013, 1 + %exitcond.not = icmp eq i64 %inc, %n + br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !0 + +for.end.loopexit: ; preds = %for.inc + %r.1.lcssa = phi i32 [ %r.1, %for.inc ] + ret i32 %r.1.lcssa +} + + +attributes #0 = { "target-features"="+sve" } + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} diff --git 
a/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll --- a/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll @@ -7,8 +7,8 @@ ; CHECK-LABEL: @reduction_add_trunc( ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( zeroinitializer, i32 255, i32 0), %vector.ph ], [ [[TMP34:%.*]], %vector.body ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[TMP36:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, i32 0, i32 0), poison, zeroinitializer), i32 255, i32 0), %vector.ph ], [ [[TMP34:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 0, i32 0), poison, zeroinitializer), %vector.ph ], [ [[TMP36:%.*]], %vector.body ] ; CHECK: [[TMP14:%.*]] = and [[VEC_PHI]], shufflevector ( insertelement ( poison, i32 255, i32 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP15:%.*]] = and [[VEC_PHI1]], shufflevector ( insertelement ( poison, i32 255, i32 0), poison, zeroinitializer) ; CHECK: [[WIDE_LOAD:%.*]] = load , * diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll b/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll @@ -0,0 +1,143 @@ +; RUN: opt -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s --check-prefix=CHECK-VF2IC1 +; RUN: opt -loop-vectorize -force-vector-interleave=2 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK-VF1IC2 + +define i32 @pred_select_const_i32_from_icmp(i32* noalias nocapture readonly %src1, i32* noalias nocapture readonly %src2, i64 %n) { +; CHECK-VF2IC1-LABEL: @pred_select_const_i32_from_icmp( +; CHECK-VF2IC1: vector.body: +; CHECK-VF2IC1: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, %vector.ph ], [ [[PREDPHI:%.*]], %pred.load.continue2 ] +; CHECK-VF2IC1: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* {{%.*}}, align 4 +; CHECK-VF2IC1-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], +; CHECK-VF2IC1-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; CHECK-VF2IC1-NEXT: br i1 [[TMP5]], label %pred.load.if, label %pred.load.continue +; CHECK-VF2IC1: pred.load.if: +; CHECK-VF2IC1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[SRC2:%.*]], i64 {{%.*}} +; CHECK-VF2IC1-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +; CHECK-VF2IC1-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 +; CHECK-VF2IC1-NEXT: br label %pred.load.continue +; CHECK-VF2IC1: pred.load.continue: +; CHECK-VF2IC1-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ poison, %vector.body ], [ [[TMP8]], %pred.load.if ] +; CHECK-VF2IC1-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; CHECK-VF2IC1-NEXT: br i1 [[TMP10]], label %pred.load.if1, label %pred.load.continue2 +; CHECK-VF2IC1: pred.load.if1: +; CHECK-VF2IC1: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 {{%.*}} +; CHECK-VF2IC1-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 +; CHECK-VF2IC1-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP13]], i32 1 +; CHECK-VF2IC1-NEXT: br label %pred.load.continue2 +; CHECK-VF2IC1: pred.load.continue2: +; CHECK-VF2IC1-NEXT: 
[[TMP15:%.*]] = phi <2 x i32> [ [[TMP9]], %pred.load.continue ], [ [[TMP14]], %pred.load.if1 ] +; CHECK-VF2IC1-NEXT: [[TMP16:%.*]] = icmp eq <2 x i32> [[TMP15]], +; CHECK-VF2IC1-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x i32> , <2 x i32> [[VEC_PHI]] +; CHECK-VF2IC1-NEXT: [[TMP18:%.*]] = xor <2 x i1> [[TMP4]], +; CHECK-VF2IC1-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP4]], <2 x i32> [[TMP17]], <2 x i32> [[VEC_PHI]] +; CHECK-VF2IC1: br i1 {{%.*}}, label %middle.block, label %vector.body +; CHECK-VF2IC1: middle.block: +; CHECK-VF2IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i32> [[PREDPHI]], zeroinitializer +; CHECK-VF2IC1-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]]) +; CHECK-VF2IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP20]], i32 1, i32 0 +; CHECK-VF2IC1: scalar.ph: +; CHECK-VF2IC1: [[BC_RESUME_VAL:%.*]] = phi i64 [ {{%.*}}, %middle.block ], [ 0, %entry ] +; CHECK-VF2IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %entry ], [ [[RDX_SELECT]], %middle.block ] +; CHECK-VF2IC1-NEXT: br label %for.body +; CHECK-VF2IC1: for.body: +; CHECK-VF2IC1: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], %for.inc ], [ [[BC_MERGE_RDX]], %scalar.ph ] +; CHECK-VF2IC1: [[TMP21:%.*]] = load i32, i32* {{%.*}}, align 4 +; CHECK-VF2IC1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP21]], 35 +; CHECK-VF2IC1-NEXT: br i1 [[CMP1]], label %if.then, label %for.inc +; CHECK-VF2IC1: if.then: +; CHECK-VF2IC1: [[TMP22:%.*]] = load i32, i32* {{%.*}}, align 4 +; CHECK-VF2IC1-NEXT: [[CMP3:%.*]] = icmp eq i32 [[TMP22]], 2 +; CHECK-VF2IC1-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]] +; CHECK-VF2IC1-NEXT: br label %for.inc +; CHECK-VF2IC1: for.inc: +; CHECK-VF2IC1-NEXT: [[R_1]] = phi i32 [ [[R_012]], %for.body ], [ [[SPEC_SELECT]], %if.then ] +; CHECK-VF2IC1: for.end.loopexit: +; CHECK-VF2IC1-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], %for.inc ], [ [[RDX_SELECT]], %middle.block ] +; CHECK-VF2IC1-NEXT: ret i32 [[R_1_LCSSA]] +; +; CHECK-VF1IC2-LABEL: @pred_select_const_i32_from_icmp( +; CHECK-VF1IC2: vector.body: +; CHECK-VF1IC2: [[VEC_PHI:%.*]] = phi i32 [ 0, %vector.ph ], [ [[PREDPHI:%.*]], %pred.load.continue4 ] +; CHECK-VF1IC2-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, %vector.ph ], [ [[PREDPHI5:%.*]], %pred.load.continue4 ] +; CHECK-VF1IC2: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[SRC1:%.*]], i64 {{%.*}} +; CHECK-VF1IC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[SRC1]], i64 {{%.*}} +; CHECK-VF1IC2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-VF1IC2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4 +; CHECK-VF1IC2-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], 35 +; CHECK-VF1IC2-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], 35 +; CHECK-VF1IC2-NEXT: br i1 [[TMP4]], label %pred.load.if, label %pred.load.continue +; CHECK-VF1IC2: pred.load.if: +; CHECK-VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[SRC2:%.*]], i64 {{%.*}} +; CHECK-VF1IC2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +; CHECK-VF1IC2-NEXT: br label %pred.load.continue +; CHECK-VF1IC2: pred.load.continue: +; CHECK-VF1IC2-NEXT: [[TMP8:%.*]] = phi i32 [ poison, %vector.body ], [ [[TMP7]], %pred.load.if ] +; CHECK-VF1IC2-NEXT: br i1 [[TMP5]], label %pred.load.if3, label %pred.load.continue4 +; CHECK-VF1IC2: pred.load.if3: +; CHECK-VF1IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 {{%.*}} +; CHECK-VF1IC2-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4 +; CHECK-VF1IC2-NEXT: br label %pred.load.continue4 +; CHECK-VF1IC2: 
pred.load.continue4: +; CHECK-VF1IC2-NEXT: [[TMP11:%.*]] = phi i32 [ poison, %pred.load.continue ], [ [[TMP10]], %pred.load.if3 ] +; CHECK-VF1IC2-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP8]], 2 +; CHECK-VF1IC2-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP11]], 2 +; CHECK-VF1IC2-NEXT: [[TMP14:%.*]] = select i1 [[TMP12]], i32 1, i32 [[VEC_PHI]] +; CHECK-VF1IC2-NEXT: [[TMP15:%.*]] = select i1 [[TMP13]], i32 1, i32 [[VEC_PHI2]] +; CHECK-VF1IC2-NEXT: [[TMP16:%.*]] = xor i1 [[TMP4]], true +; CHECK-VF1IC2-NEXT: [[TMP17:%.*]] = xor i1 [[TMP5]], true +; CHECK-VF1IC2-NEXT: [[PREDPHI]] = select i1 [[TMP4]], i32 [[TMP14]], i32 [[VEC_PHI]] +; CHECK-VF1IC2-NEXT: [[PREDPHI5]] = select i1 [[TMP5]], i32 [[TMP15]], i32 [[VEC_PHI2]] +; CHECK-VF1IC2: br i1 {{%.*}}, label %middle.block, label %vector.body +; CHECK-VF1IC2: middle.block: +; CHECK-VF1IC2-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[PREDPHI]], 0 +; CHECK-VF1IC2-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[PREDPHI]], i32 [[PREDPHI5]] +; CHECK-VF1IC2: br i1 {{%.*}}, label %for.end.loopexit, label %scalar.ph +; CHECK-VF1IC2: scalar.ph: +; CHECK-VF1IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ {{%.*}}, %middle.block ], [ 0, %entry ] +; CHECK-VF1IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %entry ], [ [[RDX_SELECT]], %middle.block ] +; CHECK-VF1IC2-NEXT: br label %for.body +; CHECK-VF1IC2: for.body: +; CHECK-VF1IC2-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], %for.inc ], [ [[BC_RESUME_VAL]], %scalar.ph ] +; CHECK-VF1IC2-NEXT: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], %for.inc ], [ [[BC_MERGE_RDX]], %scalar.ph ] +; CHECK-VF1IC2: [[TMP19:%.*]] = load i32, i32* {{%.*}}, align 4 +; CHECK-VF1IC2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP19]], 35 +; CHECK-VF1IC2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label %for.inc +; CHECK-VF1IC2: if.then: +; CHECK-VF1IC2: [[TMP20:%.*]] = load i32, i32* {{%.*}}, align 4 +; CHECK-VF1IC2-NEXT: [[CMP3:%.*]] = icmp eq i32 [[TMP20]], 2 +; CHECK-VF1IC2-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]] +; CHECK-VF1IC2-NEXT: br label %for.inc +; CHECK-VF1IC2: for.inc: +; CHECK-VF1IC2-NEXT: [[R_1]] = phi i32 [ [[R_012]], %for.body ], [ [[SPEC_SELECT]], %if.then ] +; CHECK-VF1IC2: br i1 {{%.*}}, label %for.end.loopexit, label %for.body +; CHECK-VF1IC2: for.end.loopexit: +; CHECK-VF1IC2-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], %for.inc ], [ [[RDX_SELECT]], %middle.block ] +; CHECK-VF1IC2-NEXT: ret i32 [[R_1_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.inc + %i.013 = phi i64 [ %inc, %for.inc ], [ 0, %entry ] + %r.012 = phi i32 [ %r.1, %for.inc ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %src1, i64 %i.013 + %0 = load i32, i32* %arrayidx, align 4 + %cmp1 = icmp sgt i32 %0, 35 + br i1 %cmp1, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx2 = getelementptr inbounds i32, i32* %src2, i64 %i.013 + %1 = load i32, i32* %arrayidx2, align 4 + %cmp3 = icmp eq i32 %1, 2 + %spec.select = select i1 %cmp3, i32 1, i32 %r.012 + br label %for.inc + +for.inc: ; preds = %if.then, %for.body + %r.1 = phi i32 [ %r.012, %for.body ], [ %spec.select, %if.then ] + %inc = add nuw nsw i64 %i.013, 1 + %exitcond.not = icmp eq i64 %inc, %n + br i1 %exitcond.not, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.inc + %r.1.lcssa = phi i32 [ %r.1, %for.inc ] + ret i32 %r.1.lcssa +} diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll new file mode 100644 --- /dev/null +++ 
b/llvm/test/Transforms/LoopVectorize/select-cmp.ll @@ -0,0 +1,288 @@ +; RUN: opt -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC1 --check-prefix=CHECK +; RUN: opt -loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC4 --check-prefix=CHECK +; RUN: opt -loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK-VF1IC4 --check-prefix=CHECK + +define i32 @select_const_i32_from_icmp(i32* nocapture readonly %v, i64 %n) { +; CHECK-LABEL: @select_const_i32_from_icmp +; CHECK-VF4IC1: vector.body: +; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i32> [ , %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] +; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x i32> +; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], +; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> [[VEC_PHI]], <4 x i32> +; CHECK-VF4IC1: middle.block: +; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], +; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]]) +; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3 + +; CHECK-VF4IC4: vector.body: +; CHECK-VF4IC4: [[VEC_PHI1:%.*]] = phi <4 x i32> [ , %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ , %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ , %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ , %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ] +; CHECK-VF4IC4: [[VEC_ICMP1:%.*]] = icmp eq <4 x i32> {{.*}}, +; CHECK-VF4IC4-NEXT: [[VEC_ICMP2:%.*]] = icmp eq <4 x i32> {{.*}}, +; CHECK-VF4IC4-NEXT: [[VEC_ICMP3:%.*]] = icmp eq <4 x i32> {{.*}}, +; CHECK-VF4IC4-NEXT: [[VEC_ICMP4:%.*]] = icmp eq <4 x i32> {{.*}}, +; CHECK-VF4IC4-NEXT: [[VEC_SEL1:%.*]] = select <4 x i1> [[VEC_ICMP1]], <4 x i32> [[VEC_PHI1]], <4 x i32> +; CHECK-VF4IC4-NEXT: [[VEC_SEL2:%.*]] = select <4 x i1> [[VEC_ICMP2]], <4 x i32> [[VEC_PHI2]], <4 x i32> +; CHECK-VF4IC4-NEXT: [[VEC_SEL3:%.*]] = select <4 x i1> [[VEC_ICMP3]], <4 x i32> [[VEC_PHI3]], <4 x i32> +; CHECK-VF4IC4-NEXT: [[VEC_SEL4:%.*]] = select <4 x i1> [[VEC_ICMP4]], <4 x i32> [[VEC_PHI4]], <4 x i32> +; CHECK-VF4IC4: middle.block: +; CHECK-VF4IC4-NEXT: [[VEC_ICMP5:%.*]] = icmp ne <4 x i32> [[VEC_SEL1]], +; CHECK-VF4IC4-NEXT: [[VEC_SEL5:%.*]] = select <4 x i1> [[VEC_ICMP5]], <4 x i32> [[VEC_SEL1]], <4 x i32> [[VEC_SEL2]] +; CHECK-VF4IC4-NEXT: [[VEC_ICMP6:%.*]] = icmp ne <4 x i32> [[VEC_SEL5]], +; CHECK-VF4IC4-NEXT: [[VEC_SEL6:%.*]] = select <4 x i1> [[VEC_ICMP6]], <4 x i32> [[VEC_SEL5]], <4 x i32> [[VEC_SEL3]] +; CHECK-VF4IC4-NEXT: [[VEC_ICMP7:%.*]] = icmp ne <4 x i32> [[VEC_SEL6]], +; CHECK-VF4IC4-NEXT: [[VEC_SEL_FIN:%.*]] = select <4 x i1> [[VEC_ICMP7]], <4 x i32> [[VEC_SEL6]], <4 x i32> [[VEC_SEL4]] +; CHECK-VF4IC4-NEXT: [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL_FIN]], +; CHECK-VF4IC4-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]]) +; CHECK-VF4IC4-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3 + + +; CHECK-VF1IC4: vector.body: +; CHECK-VF1IC4: [[VEC_PHI1:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i32 [ 3, 
%vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI4:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ] +; CHECK-VF1IC4: [[VEC_LOAD1:%.*]] = load i32 +; CHECK-VF1IC4-NEXT: [[VEC_LOAD2:%.*]] = load i32 +; CHECK-VF1IC4-NEXT: [[VEC_LOAD3:%.*]] = load i32 +; CHECK-VF1IC4-NEXT: [[VEC_LOAD4:%.*]] = load i32 +; CHECK-VF1IC4-NEXT: [[VEC_ICMP1:%.*]] = icmp eq i32 [[VEC_LOAD1]], 3 +; CHECK-VF1IC4-NEXT: [[VEC_ICMP2:%.*]] = icmp eq i32 [[VEC_LOAD2]], 3 +; CHECK-VF1IC4-NEXT: [[VEC_ICMP3:%.*]] = icmp eq i32 [[VEC_LOAD3]], 3 +; CHECK-VF1IC4-NEXT: [[VEC_ICMP4:%.*]] = icmp eq i32 [[VEC_LOAD4]], 3 +; CHECK-VF1IC4-NEXT: [[VEC_SEL1]] = select i1 [[VEC_ICMP1]], i32 [[VEC_PHI1]], i32 7 +; CHECK-VF1IC4-NEXT: [[VEC_SEL2]] = select i1 [[VEC_ICMP2]], i32 [[VEC_PHI2]], i32 7 +; CHECK-VF1IC4-NEXT: [[VEC_SEL3]] = select i1 [[VEC_ICMP3]], i32 [[VEC_PHI3]], i32 7 +; CHECK-VF1IC4-NEXT: [[VEC_SEL4]] = select i1 [[VEC_ICMP4]], i32 [[VEC_PHI4]], i32 7 +; CHECK-VF1IC4: middle.block: +; CHECK-VF1IC4-NEXT: [[VEC_ICMP4:%.*]] = icmp ne i32 [[VEC_SEL1]], 3 +; CHECK-VF1IC4-NEXT: [[VEC_SEL5:%.*]] = select i1 [[VEC_ICMP4]], i32 [[VEC_SEL1]], i32 [[VEC_SEL2]] +; CHECK-VF1IC4-NEXT: [[VEC_ICMP5:%.*]] = icmp ne i32 [[VEC_SEL5]], 3 +; CHECK-VF1IC4-NEXT: [[VEC_SEL6:%.*]] = select i1 [[VEC_ICMP5]], i32 [[VEC_SEL5]], i32 [[VEC_SEL3]] +; CHECK-VF1IC4-NEXT: [[VEC_ICMP6:%.*]] = icmp ne i32 [[VEC_SEL6]], 3 +; CHECK-VF1IC4-NEXT: {{.*}} = select i1 [[VEC_ICMP6]], i32 [[VEC_SEL6]], i32 [[VEC_SEL4]] + +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %0 = phi i64 [ 0, %entry ], [ %6, %for.body ] + %1 = phi i32 [ 3, %entry ], [ %5, %for.body ] + %2 = getelementptr inbounds i32, i32* %v, i64 %0 + %3 = load i32, i32* %2, align 4 + %4 = icmp eq i32 %3, 3 + %5 = select i1 %4, i32 %1, i32 7 + %6 = add nuw nsw i64 %0, 1 + %7 = icmp eq i64 %6, %n + br i1 %7, label %exit, label %for.body + +exit: ; preds = %for.body + ret i32 %5 +} + + +define i32 @select_const_i32_from_icmp2(i32* nocapture readonly %v, i64 %n) { +; CHECK-LABEL: @select_const_i32_from_icmp2 +; CHECK-VF4IC1: vector.body: +; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i32> [ , %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] +; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x i32> +; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], +; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> , <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1: middle.block: +; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], +; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]]) +; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3 + +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %0 = phi i64 [ 0, %entry ], [ %6, %for.body ] + %1 = phi i32 [ 3, %entry ], [ %5, %for.body ] + %2 = getelementptr inbounds i32, i32* %v, i64 %0 + %3 = load i32, i32* %2, align 4 + %4 = icmp eq i32 %3, 3 + %5 = select i1 %4, i32 7, i32 %1 + %6 = add nuw nsw i64 %0, 1 + %7 = icmp eq i64 %6, %n + br i1 %7, label %exit, label %for.body + +exit: ; preds = %for.body + ret i32 %5 +} + + +define i32 @select_i32_from_icmp(i32* nocapture readonly %v, i32 %a, i32 %b, i64 %n) { +; CHECK-LABEL: @select_i32_from_icmp +; CHECK-VF4IC1: vector.ph: +; CHECK-VF4IC1: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 %a, i32 0 +; CHECK-VF4IC1-NEXT: [[SPLAT_OF_A:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: 
[[TMP2:%.*]] = insertelement <4 x i32> poison, i32 %b, i32 0 +; CHECK-VF4IC1-NEXT: [[SPLAT_OF_B:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1: vector.body: +; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[SPLAT_OF_A]], %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] +; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x i32> +; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], +; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> [[VEC_PHI]], <4 x i32> [[SPLAT_OF_B]] +; CHECK-VF4IC1: middle.block: +; CHECK-VF4IC1-NEXT: [[FIN_INS:%.*]] = insertelement <4 x i32> poison, i32 %a, i32 0 +; CHECK-VF4IC1-NEXT: [[FIN_SPLAT:%.*]] = shufflevector <4 x i32> [[FIN_INS]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[FIN_CMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], [[FIN_SPLAT]] +; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_CMP]]) +; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 %a +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %0 = phi i64 [ 0, %entry ], [ %6, %for.body ] + %1 = phi i32 [ %a, %entry ], [ %5, %for.body ] + %2 = getelementptr inbounds i32, i32* %v, i64 %0 + %3 = load i32, i32* %2, align 4 + %4 = icmp eq i32 %3, 3 + %5 = select i1 %4, i32 %1, i32 %b + %6 = add nuw nsw i64 %0, 1 + %7 = icmp eq i64 %6, %n + br i1 %7, label %exit, label %for.body + +exit: ; preds = %for.body + ret i32 %5 +} + + +define i32 @select_const_i32_from_fcmp_fast(float* nocapture readonly %v, i64 %n) { +; CHECK-LABEL: @select_const_i32_from_fcmp_fast +; CHECK-VF4IC1: vector.body: +; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i32> [ , %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] +; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x float> +; CHECK-VF4IC1-NEXT: [[VEC_FCMP:%.*]] = fcmp fast ueq <4 x float> [[VEC_LOAD]], +; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <4 x i1> [[VEC_FCMP]], <4 x i32> [[VEC_PHI]], <4 x i32> +; CHECK-VF4IC1: middle.block: +; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], +; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]]) +; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 1, i32 2 +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %0 = phi i64 [ 0, %entry ], [ %6, %for.body ] + %1 = phi i32 [ 2, %entry ], [ %5, %for.body ] + %2 = getelementptr inbounds float, float* %v, i64 %0 + %3 = load float, float* %2, align 4 + %4 = fcmp fast ueq float %3, 3.0 + %5 = select i1 %4, i32 %1, i32 1 + %6 = add nuw nsw i64 %0, 1 + %7 = icmp eq i64 %6, %n + br i1 %7, label %exit, label %for.body + +exit: ; preds = %for.body + ret i32 %5 +} + + +define i32 @select_const_i32_from_fcmp(float* nocapture readonly %v, i64 %n) { +; CHECK-LABEL: @select_const_i32_from_fcmp +; CHECK-VF4IC1: vector.body: +; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i32> [ , %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] +; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x float> +; CHECK-VF4IC1-NEXT: [[VEC_FCMP:%.*]] = fcmp ueq <4 x float> [[VEC_LOAD]], +; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <4 x i1> [[VEC_FCMP]], <4 x i32> [[VEC_PHI]], <4 x i32> +; CHECK-VF4IC1: middle.block: +; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], +; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]]) +; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 1, i32 2 +entry: + br label %for.body + +for.body: ; preds = 
%entry, %for.body + %0 = phi i64 [ 0, %entry ], [ %6, %for.body ] + %1 = phi i32 [ 2, %entry ], [ %5, %for.body ] + %2 = getelementptr inbounds float, float* %v, i64 %0 + %3 = load float, float* %2, align 4 + %4 = fcmp ueq float %3, 3.0 + %5 = select i1 %4, i32 %1, i32 1 + %6 = add nuw nsw i64 %0, 1 + %7 = icmp eq i64 %6, %n + br i1 %7, label %exit, label %for.body + +exit: ; preds = %for.body + ret i32 %5 +} + + +; Negative tests + +; We don't support FP reduction variables at the moment. +define float @select_const_f32_from_icmp(i32* nocapture readonly %v, i64 %n) { +; CHECK: @select_const_f32_from_icmp +; CHECK-NOT: vector.body +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %0 = phi i64 [ 0, %entry ], [ %6, %for.body ] + %1 = phi fast float [ 3.0, %entry ], [ %5, %for.body ] + %2 = getelementptr inbounds i32, i32* %v, i64 %0 + %3 = load i32, i32* %2, align 4 + %4 = icmp eq i32 %3, 3 + %5 = select fast i1 %4, float %1, float 7.0 + %6 = add nuw nsw i64 %0, 1 + %7 = icmp eq i64 %6, %n + br i1 %7, label %exit, label %for.body + +exit: ; preds = %for.body + ret float %5 +} + + +; We don't support select/cmp reduction patterns where there is more than one +; use of the icmp/fcmp. +define i32 @select_const_i32_from_icmp_mul_use(i32* nocapture readonly %v1, i32* %v2, i64 %n) { +; CHECK-LABEL: @select_const_i32_from_icmp_mul_use +; CHECK-NOT: vector.body +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %0 = phi i64 [ 0, %entry ], [ %8, %for.body ] + %1 = phi i32 [ 3, %entry ], [ %6, %for.body ] + %2 = phi i32 [ 0, %entry ], [ %7, %for.body ] + %3 = getelementptr inbounds i32, i32* %v1, i64 %0 + %4 = load i32, i32* %3, align 4 + %5 = icmp eq i32 %4, 3 + %6 = select i1 %5, i32 %1, i32 7 + %7 = zext i1 %5 to i32 + %8 = add nuw nsw i64 %0, 1 + %9 = icmp eq i64 %8, %n + br i1 %9, label %exit, label %for.body + +exit: ; preds = %for.body + store i32 %7, i32* %v2, align 4 + ret i32 %6 +} + + +; We don't support selecting loop-variant values. +define i32 @select_variant_i32_from_icmp(i32* nocapture readonly %v1, i32* nocapture readonly %v2, i64 %n) { +; CHECK-LABEL: @select_variant_i32_from_icmp +; CHECK-NOT: vector.body +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %0 = phi i64 [ 0, %entry ], [ %8, %for.body ] + %1 = phi i32 [ 3, %entry ], [ %7, %for.body ] + %2 = getelementptr inbounds i32, i32* %v1, i64 %0 + %3 = load i32, i32* %2, align 4 + %4 = getelementptr inbounds i32, i32* %v2, i64 %0 + %5 = load i32, i32* %4, align 4 + %6 = icmp eq i32 %3, 3 + %7 = select i1 %6, i32 %1, i32 %5 + %8 = add nuw nsw i64 %0, 1 + %9 = icmp eq i64 %8, %n + br i1 %9, label %exit, label %for.body + +exit: ; preds = %for.body + ret i32 %7 +}
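
For reference, the two source-level loop shapes that the new SelectICmp/SelectFCmp handling recognises can be written as the following minimal C sketch. It mirrors the `int r = 0; if (src[i] > 3) r = 3;` example from the comment in IVDescriptors.cpp and the `select_i32_from_icmp` test above; the function names are illustrative only and are not part of the patch.

#include <stddef.h>

/* Constant flip: r starts at 0 and the loop can only ever switch it to 3.
 * This is the select(icmp(), loop_invariant, phi) shape, i.e.
 * RecurKind::SelectICmp with constant operands. */
int flip_to_constant(const int *src, size_t n) {
  int r = 0;
  for (size_t i = 0; i < n; i++)
    if (src[i] > 3)
      r = 3;
  return r;
}

/* Loop-invariant flip: r starts at a and can only ever become b, where both
 * a and b are loop invariant. This is the select(cmp(), phi, loop_invariant)
 * shape exercised by the select_i32_from_icmp test. */
int flip_to_invariant(const int *v, int a, int b, size_t n) {
  int r = a;
  for (size_t i = 0; i < n; i++)
    r = (v[i] == 3) ? r : b;
  return r;
}

Because the reduction variable only ever holds one of two loop-invariant values, the final across-vector reduction needs no arithmetic or min/max operation: it only has to test whether any lane differs from the start value, which is exactly the icmp ne + vector.reduce.or + select sequence emitted by createSelectCmpTargetReduction and checked for in the middle.block of the tests above.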