Index: llvm/include/llvm/Analysis/IVDescriptors.h =================================================================== --- llvm/include/llvm/Analysis/IVDescriptors.h +++ llvm/include/llvm/Analysis/IVDescriptors.h @@ -36,20 +36,24 @@ /// These are the kinds of recurrences that we support. enum class RecurKind { - None, ///< Not a recurrence. - Add, ///< Sum of integers. - Mul, ///< Product of integers. - Or, ///< Bitwise or logical OR of integers. - And, ///< Bitwise or logical AND of integers. - Xor, ///< Bitwise or logical XOR of integers. - SMin, ///< Signed integer min implemented in terms of select(cmp()). - SMax, ///< Signed integer max implemented in terms of select(cmp()). - UMin, ///< Unisgned integer min implemented in terms of select(cmp()). - UMax, ///< Unsigned integer max implemented in terms of select(cmp()). - FAdd, ///< Sum of floats. - FMul, ///< Product of floats. - FMin, ///< FP min implemented in terms of select(cmp()). - FMax ///< FP max implemented in terms of select(cmp()). + None, ///< Not a recurrence. + Add, ///< Sum of integers. + Mul, ///< Product of integers. + Or, ///< Bitwise or logical OR of integers. + And, ///< Bitwise or logical AND of integers. + Xor, ///< Bitwise or logical XOR of integers. + SMin, ///< Signed integer min implemented in terms of select(cmp()). + SMax, ///< Signed integer max implemented in terms of select(cmp()). + UMin, ///< Unisgned integer min implemented in terms of select(cmp()). + UMax, ///< Unsigned integer max implemented in terms of select(cmp()). + FAdd, ///< Sum of floats. + FMul, ///< Product of floats. + FMin, ///< FP min implemented in terms of select(cmp()). + FMax, ///< FP max implemented in terms of select(cmp()). 
+ SelectICmp, ///< Integer select(icmp(),x,y) where one of (x,y) is loop + ///< invariant + SelectFCmp ///< Integer select(fcmp(),x,y) where one of (x,y) is loop + ///< invariant }; /// The RecurrenceDescriptor is used to identify recurrences variables in a @@ -116,7 +120,7 @@ /// select(icmp()) this function advances the instruction pointer 'I' from the /// compare instruction to the select instruction and stores this pointer in /// 'PatternLastInst' member of the returned struct. - static InstDesc isRecurrenceInstr(Instruction *I, RecurKind Kind, + static InstDesc isRecurrenceInstr(Loop *L, Instruction *I, RecurKind Kind, InstDesc &Prev, FastMathFlags FMF); /// Returns true if instruction I has multiple uses in Insts @@ -134,13 +138,21 @@ static InstDesc isMinMaxSelectCmpPattern(Instruction *I, const InstDesc &Prev); + /// Returns a struct describing whether the instruction is either a + /// Select(ICmp(A, B), X, Y), or + /// Select(FCmp(A, B), X, Y) + /// where one of (X, Y) is a loop invariant integer and the other is a PHI + /// value. \p Prev specifies the description of an already processed select + /// instruction, so its corresponding cmp can be matched to it. + static InstDesc isSelectCmpPattern(Loop *Loop, Instruction *I, + InstDesc &Prev); + /// Returns a struct describing if the instruction is a /// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern. static InstDesc isConditionalRdxPattern(RecurKind Kind, Instruction *I); /// Returns identity corresponding to the RecurrenceKind. - static Constant *getRecurrenceIdentity(RecurKind K, Type *Tp, - FastMathFlags FMF); + Value *getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF); /// Returns the opcode corresponding to the RecurrenceKind. 
static unsigned getOpcode(RecurKind Kind); @@ -220,6 +232,12 @@ return isIntMinMaxRecurrenceKind(Kind) || isFPMinMaxRecurrenceKind(Kind); } + /// Returns true if the recurrence kind is of the form + /// select(cmp(),x,y) where one of (x,y) is loop invariant. + static bool isSelectCmpRecurrenceKind(RecurKind Kind) { + return Kind == RecurKind::SelectICmp || Kind == RecurKind::SelectFCmp; + } + /// Returns the type of the recurrence. This type can be narrower than the /// actual type of the Phi if the recurrence has been type-promoted. Type *getRecurrenceType() const { return RecurrenceType; } Index: llvm/include/llvm/Transforms/Utils/LoopUtils.h =================================================================== --- llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -348,6 +348,15 @@ SinkAndHoistLICMFlags *LICMFlags = nullptr, OptimizationRemarkEmitter *ORE = nullptr); +/// See RecurrenceDescriptor::isSelectCmpPattern for a description of the +/// pattern we are trying to match. In this pattern we are only ever selecting +/// between two values: 1) an initial PHI start value, and 2) a loop invariant +/// value. This function uses \p LoopExitInst to determine 2), which we then use +/// to select between \p Left and \p Right. Any lane value in \p Left that +/// matches 2) will be merged into \p Right. +Value *createSelectCmpOp(IRBuilderBase &Builder, Instruction *SI, RecurKind RK, + Value *Left, Value *Right); + /// Returns a Min/Max operation corresponding to MinMaxRecurrenceKind. /// The Builder's fast-math-flags must be set to propagate the expected values. Value *createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, @@ -375,6 +384,14 @@ RecurKind RdxKind, ArrayRef RedOps = None); +/// Create a target reduction of the given vector \p Src for a reduction of the +/// kind RecurKind::SelectICmp or RecurKind::SelectFCmp. The reduction operation +/// is described by \p Desc. 
+Value *createSelectCmpTargetReduction(IRBuilderBase &B, + const TargetTransformInfo *TTI, + Value *Src, + const RecurrenceDescriptor &Desc); + /// Create a generic target reduction using a recurrence descriptor \p Desc /// The target is queried to determine if intrinsics or shuffle sequences are /// required to implement the reduction. Index: llvm/lib/Analysis/IVDescriptors.cpp =================================================================== --- llvm/lib/Analysis/IVDescriptors.cpp +++ llvm/lib/Analysis/IVDescriptors.cpp @@ -62,6 +62,8 @@ case RecurKind::SMin: case RecurKind::UMax: case RecurKind::UMin: + case RecurKind::SelectICmp: + case RecurKind::SelectFCmp: return true; } return false; @@ -327,7 +329,7 @@ // the starting value (the Phi or an AND instruction if the Phi has been // type-promoted). if (Cur != Start) { - ReduxDesc = isRecurrenceInstr(Cur, Kind, ReduxDesc, FuncFMF); + ReduxDesc = isRecurrenceInstr(TheLoop, Cur, Kind, ReduxDesc, FuncFMF); if (!ReduxDesc.isRecurrence()) return false; // FIXME: FMF is allowed on phi, but propagation is not handled correctly. @@ -360,6 +362,7 @@ // A reduction operation must only have one use of the reduction value. 
if (!IsAPhi && !IsASelect && !isMinMaxRecurrenceKind(Kind) && + !isSelectCmpRecurrenceKind(Kind) && hasMultipleUsesOf(Cur, VisitedInsts, 1)) return false; @@ -367,10 +370,10 @@ if (IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts)) return false; - if (isIntMinMaxRecurrenceKind(Kind) && + if ((isIntMinMaxRecurrenceKind(Kind) || Kind == RecurKind::SelectICmp) && (isa(Cur) || isa(Cur))) ++NumCmpSelectPatternInst; - if (isFPMinMaxRecurrenceKind(Kind) && + if ((isFPMinMaxRecurrenceKind(Kind) || Kind == RecurKind::SelectFCmp) && (isa(Cur) || isa(Cur))) ++NumCmpSelectPatternInst; @@ -423,7 +426,9 @@ ((!isa(UI) && !isa(UI) && !isa(UI)) || (!isConditionalRdxPattern(Kind, UI).isRecurrence() && - !isMinMaxSelectCmpPattern(UI, IgnoredVal).isRecurrence()))) + !isMinMaxSelectCmpPattern(UI, IgnoredVal).isRecurrence() && + !isSelectCmpPattern(TheLoop, UI, IgnoredVal) + .isRecurrence()))) return false; // Remember that we completed the cycle. @@ -439,6 +444,9 @@ if (isMinMaxRecurrenceKind(Kind) && NumCmpSelectPatternInst != 2) return false; + if (isSelectCmpRecurrenceKind(Kind) && NumCmpSelectPatternInst != 1) + return false; + if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction) return false; @@ -505,6 +513,63 @@ return true; } +// We are looking for loops that do something like this: +// int r = 0; +// for (int i = 0; i < n; i++) { +// if (src[i] > 3) +// r = 3; +// } +// where the reduction value (r) only has two states, in this example 0 or 3. +// The generated LLVM IR for this type of loop will be like this: +// for.body: +// %r = phi i32 [ %spec.select, %for.body ], [ 0, %entry ] +// ... +// %cmp = icmp sgt i32 %5, 3 +// %spec.select = select i1 %cmp, i32 3, i32 %r +// ... +// In general we can support vectorization of loops where 'r' flips between +// any two non-constants, provided they are loop invariant. The only thing +// we actually care about at the end of the loop is whether or not any lane +// in the selected vector is different from the start value. 
The final +// across-vector reduction after the loop simply involves choosing the start +// value if nothing changed (0 in the example above) or the other selected +// value (3 in the example above). +RecurrenceDescriptor::InstDesc +RecurrenceDescriptor::isSelectCmpPattern(Loop *Loop, Instruction *I, + InstDesc &Prev) { + // We must handle the select(cmp(),x,y) as a single instruction. Advance to + // the select. + CmpInst::Predicate Pred; + if (match(I, m_OneUse(m_Cmp(Pred, m_Value(), m_Value())))) { + if (auto *Select = dyn_cast(*I->user_begin())) + return InstDesc(Select, Prev.getRecKind()); + } + + // Only match select with single use cmp condition. + if (!match(I, m_Select(m_OneUse(m_Cmp(Pred, m_Value(), m_Value())), m_Value(), + m_Value()))) + return InstDesc(false, I); + + SelectInst *SI = cast(I); + Value *NonPhi = nullptr; + + if (isa(SI->getTrueValue())) + NonPhi = SI->getFalseValue(); + else if (isa(SI->getFalseValue())) + NonPhi = SI->getTrueValue(); + else + return InstDesc(false, I); + + // We are looking for selects of the form: + // select(cmp(), phi, loop_invariant) or + // select(cmp(), loop_invariant, phi) + if (!Loop->isLoopInvariant(NonPhi)) + return InstDesc(false, I); + + return InstDesc(I, isa(I->getOperand(0)) ? 
RecurKind::SelectICmp
+                                                     : RecurKind::SelectFCmp);
+}
+
 RecurrenceDescriptor::InstDesc
 RecurrenceDescriptor::isMinMaxSelectCmpPattern(Instruction *I,
                                                const InstDesc &Prev) {
@@ -592,7 +657,7 @@
 }
 
 RecurrenceDescriptor::InstDesc
-RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurKind Kind,
+RecurrenceDescriptor::isRecurrenceInstr(Loop *L, Instruction *I, RecurKind Kind,
                                         InstDesc &Prev, FastMathFlags FMF) {
   switch (I->getOpcode()) {
   default:
@@ -624,6 +689,8 @@
     LLVM_FALLTHROUGH;
   case Instruction::FCmp:
   case Instruction::ICmp:
+    if (isSelectCmpRecurrenceKind(Kind))
+      return isSelectCmpPattern(L, I, Prev);
     if (isIntMinMaxRecurrenceKind(Kind) ||
         (FMF.noNaNs() && FMF.noSignedZeros() && isFPMinMaxRecurrenceKind(Kind)))
       return isMinMaxSelectCmpPattern(I, Prev);
@@ -649,7 +716,6 @@
                                           RecurrenceDescriptor &RedDes,
                                           DemandedBits *DB, AssumptionCache *AC,
                                           DominatorTree *DT) {
-
   BasicBlock *Header = TheLoop->getHeader();
   Function &F = *Header->getParent();
   FastMathFlags FMF;
@@ -694,6 +760,12 @@
     LLVM_DEBUG(dbgs() << "Found a UMIN reduction PHI." << *Phi << "\n");
     return true;
   }
+  if (AddReductionVar(Phi, RecurKind::SelectICmp, TheLoop, FMF, RedDes, DB, AC,
+                      DT)) {
+    LLVM_DEBUG(dbgs() << "Found an integer conditional select reduction PHI."
+                      << *Phi << "\n");
+    return true;
+  }
   if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT)) {
     LLVM_DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n");
     return true;
@@ -710,6 +782,12 @@
     LLVM_DEBUG(dbgs() << "Found a float MIN reduction PHI." << *Phi << "\n");
     return true;
   }
+  if (AddReductionVar(Phi, RecurKind::SelectFCmp, TheLoop, FMF, RedDes, DB, AC,
+                      DT)) {
+    LLVM_DEBUG(dbgs() << "Found a float conditional select reduction PHI."
+                      << *Phi << "\n");
+    return true;
+  }
   // Not a reduction of known type.
   return false;
 }
@@ -816,8 +894,8 @@
 
 /// This function returns the identity element (or neutral element) for
 /// the operation K.
-Constant *RecurrenceDescriptor::getRecurrenceIdentity(RecurKind K, Type *Tp,
-                                                      FastMathFlags FMF) {
+Value *RecurrenceDescriptor::getRecurrenceIdentity(RecurKind K, Type *Tp,
+                                                   FastMathFlags FMF) {
   switch (K) {
   case RecurKind::Xor:
   case RecurKind::Add:
@@ -857,6 +935,9 @@
     return ConstantFP::getInfinity(Tp, true);
   case RecurKind::FMax:
     return ConstantFP::getInfinity(Tp, false);
+  case RecurKind::SelectICmp:
+  case RecurKind::SelectFCmp:
+    return getRecurrenceStartValue();
   default:
     llvm_unreachable("Unknown recurrence kind");
   }
@@ -882,9 +963,11 @@
   case RecurKind::SMin:
   case RecurKind::UMax:
   case RecurKind::UMin:
+  case RecurKind::SelectICmp:
     return Instruction::ICmp;
   case RecurKind::FMax:
   case RecurKind::FMin:
+  case RecurKind::SelectFCmp:
     return Instruction::FCmp;
   default:
     llvm_unreachable("Unknown recurrence operation");
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1938,6 +1938,8 @@
   case RecurKind::UMax:
   case RecurKind::FMin:
   case RecurKind::FMax:
+  case RecurKind::SelectICmp:
+  case RecurKind::SelectFCmp:
     return true;
   default:
     return false;
Index: llvm/lib/Transforms/Utils/LoopUtils.cpp
===================================================================
--- llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -852,6 +852,23 @@
   return true;
 }
 
+Value *llvm::createSelectCmpOp(IRBuilderBase &Builder,
+                               Instruction *LoopExitInst, RecurKind RK,
+                               Value *Left, Value *Right) {
+  SelectInst *LSI = cast<SelectInst>(LoopExitInst);
+  Value *NewVal = nullptr;
+  if (isa<PHINode>(LSI->getOperand(1)))
+    NewVal = LSI->getOperand(2);
+  else
+    NewVal = LSI->getOperand(1);
+  if (isa<VectorType>(Left->getType()))
+    NewVal = Builder.CreateVectorSplat(
+        cast<VectorType>(Left->getType())->getElementCount(), NewVal);
+  Value *Cmp =
+      Builder.CreateCmp(CmpInst::ICMP_EQ, Left, NewVal, "rdx.select.cmp");
+  return Builder.CreateSelect(Cmp, Left, Right, "rdx.select");
+}
+
 Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
                             Value *Right) {
   CmpInst::Predicate Pred;
@@ -955,6 +972,43 @@
   return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
 }
 
+Value *llvm::createSelectCmpTargetReduction(IRBuilderBase &Builder,
+                                            const TargetTransformInfo *TTI,
+                                            Value *Src,
+                                            const RecurrenceDescriptor &Desc) {
+  assert(RecurrenceDescriptor::isSelectCmpRecurrenceKind(
+      Desc.getRecurrenceKind()));
+
+  // First we use the loop exit instruction to determine the new value we're
+  // trying to select from in the loop.
+  auto *SI = cast<SelectInst>(Desc.getLoopExitInstr());
+  Value *InitVal = nullptr, *NewVal = nullptr;
+  PHINode *PHI = nullptr;
+  if ((PHI = dyn_cast<PHINode>(SI->getTrueValue())))
+    NewVal = SI->getFalseValue();
+  else if ((PHI = dyn_cast<PHINode>(SI->getFalseValue())))
+    NewVal = SI->getTrueValue();
+  assert(PHI && "At least one select input value should be a PHI");
+
+  // The start value for the reduction must be the incoming PHI value that
+  // doesn't match the select.
+  if (SI != PHI->getIncomingValue(0))
+    InitVal = PHI->getIncomingValue(0);
+  else
+    InitVal = PHI->getIncomingValue(1);
+
+  // Create a splat vector with the start value and compare this to the
+  // vector we want to reduce.
+  ElementCount EC = cast<VectorType>(Src->getType())->getElementCount();
+  Value *Right = Builder.CreateVectorSplat(EC, InitVal);
+  Value *Cmp =
+      Builder.CreateCmp(CmpInst::ICMP_NE, Src, Right, "rdx.select.cmp");
+
+  // If any predicate is true it means that we want to select the new value.
+  Cmp = Builder.CreateOrReduce(Cmp);
+  return Builder.CreateSelect(Cmp, NewVal, InitVal, "rdx.select");
+}
+
 Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder,
                                          const TargetTransformInfo *TTI,
                                          Value *Src, RecurKind RdxKind,
@@ -1002,7 +1056,15 @@
   // descriptor.
IRBuilderBase::FastMathFlagGuard FMFGuard(B); B.setFastMathFlags(Desc.getFastMathFlags()); - return createSimpleTargetReduction(B, TTI, Src, Desc.getRecurrenceKind()); + + RecurKind RK = Desc.getRecurrenceKind(); + switch (RK) { + case RecurKind::SelectICmp: + case RecurKind::SelectFCmp: + return createSelectCmpTargetReduction(B, TTI, Src, Desc); + default: + return createSimpleTargetReduction(B, TTI, Src, RK); + } } Value *llvm::createOrderedReduction(IRBuilderBase &B, Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4415,9 +4415,11 @@ if (Op != Instruction::ICmp && Op != Instruction::FCmp) { ReducedPartRdx = Builder.CreateBinOp( (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); - } else { + } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) + ReducedPartRdx = createSelectCmpOp(Builder, LoopExitInst, RK, + ReducedPartRdx, RdxPart); + else ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); - } } } @@ -9242,6 +9244,8 @@ RecipeBuilder.recordRecipeOf(R); // For min/max reducitons, where we have a pair of icmp/select, we also // need to record the ICmp recipe, so it can be removed later. 
+ assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && + "Only min/max recurrences allowed for inloop reductions"); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) RecipeBuilder.recordRecipeOf(cast(R->getOperand(0))); } @@ -9561,6 +9565,8 @@ VPValue *ChainOp = Plan->getVPValue(Chain); unsigned FirstOpId; + assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && + "Only min/max recurrences allowed for inloop reductions"); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { assert(isa(WidenRecipe) && "Expected to replace a VPWidenSelectSC"); @@ -9733,10 +9739,10 @@ if (VPValue *Cond = getCondOp()) { Value *NewCond = State.get(Cond, Part); VectorType *VecTy = cast(NewVecOp->getType()); - Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( + Value *Iden = RdxDesc->getRecurrenceIdentity( Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); - Constant *IdenVec = - ConstantVector::getSplat(VecTy->getElementCount(), Iden); + Value *IdenVec = + State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); NewVecOp = Select; } Index: llvm/lib/Transforms/Vectorize/VPlan.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.cpp +++ llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1336,7 +1336,8 @@ Value *Iden = nullptr; RecurKind RK = RdxDesc.getRecurrenceKind(); - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) { + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) || + RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) { // MinMax reduction have the start value as their identify. 
if (ScalarPHI) { Iden = StartV; @@ -1347,12 +1348,11 @@ Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident"); } } else { - Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity( - RK, VecTy->getScalarType(), RdxDesc.getFastMathFlags()); - Iden = IdenC; + Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(), + RdxDesc.getFastMathFlags()); if (!ScalarPHI) { - Iden = ConstantVector::getSplat(State.VF, IdenC); + Iden = Builder.CreateVectorSplat(State.VF, Iden); IRBuilderBase::InsertPointGuard IPBuilder(Builder); Builder.SetInsertPoint(State.CFG.VectorPreHeader->getTerminator()); Constant *Zero = Builder.getInt32(0); Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll @@ -0,0 +1,148 @@ +; RUN: opt -loop-vectorize -scalable-vectorization=preferred -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC1 --check-prefix=CHECK +; RUN: opt -loop-vectorize -scalable-vectorization=preferred -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC4 --check-prefix=CHECK + +target triple = "aarch64-linux-gnu" + +define i32 @select_const_i32_from_icmp(i32* nocapture readonly %v, i64 %n) #0 { +; CHECK-LABEL: @select_const_i32_from_icmp +; CHECK-VF4IC1: vector.body: +; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] +; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load +; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq [[VEC_LOAD]], shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select [[VEC_ICMP]], [[VEC_PHI]], shufflevector ( insertelement ( poison, i32 7, i32 0), poison, zeroinitializer) +; CHECK-VF4IC1: middle.block: +; CHECK-VF4IC1-NEXT: 
[[FIN_ICMP:%.*]] = icmp ne [[VEC_SEL]], shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[FIN_ICMP]]) +; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3 + +; CHECK-VF4IC4: vector.body: +; CHECK-VF4IC4: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ] +; CHECK-VF4IC4: [[VEC_PHI2:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ] +; CHECK-VF4IC4: [[VEC_PHI3:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ] +; CHECK-VF4IC4: [[VEC_PHI4:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ] +; CHECK-VF4IC4: [[VEC_ICMP1:%.*]] = icmp eq {{.*}}, shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_ICMP2:%.*]] = icmp eq {{.*}}, shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_ICMP3:%.*]] = icmp eq {{.*}}, shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_ICMP4:%.*]] = icmp eq {{.*}}, shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_SEL1]] = select [[VEC_ICMP1]], [[VEC_PHI1]], shufflevector ( insertelement ( poison, i32 7, i32 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_SEL2]] = select [[VEC_ICMP2]], [[VEC_PHI2]], shufflevector ( insertelement ( poison, i32 7, i32 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_SEL3]] = select [[VEC_ICMP3]], [[VEC_PHI3]], shufflevector ( insertelement ( poison, i32 7, i32 0), poison, 
zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_SEL4]] = select [[VEC_ICMP4]], [[VEC_PHI4]], shufflevector ( insertelement ( poison, i32 7, i32 0), poison, zeroinitializer) +; CHECK-VF4IC4: middle.block: +; CHECK-VF4IC4-NEXT: [[VEC_ICMP5:%.*]] = icmp eq [[VEC_SEL1]], shufflevector ( insertelement ( poison, i32 7, i32 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_SEL5:%.*]] = select [[VEC_ICMP5]], [[VEC_SEL1]], [[VEC_SEL2]] +; CHECK-VF4IC4-NEXT: [[VEC_ICMP6:%.*]] = icmp eq [[VEC_SEL5]], shufflevector ( insertelement ( poison, i32 7, i32 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_SEL6:%.*]] = select [[VEC_ICMP6]], [[VEC_SEL5]], [[VEC_SEL3]] +; CHECK-VF4IC4-NEXT: [[VEC_ICMP7:%.*]] = icmp eq [[VEC_SEL6]], shufflevector ( insertelement ( poison, i32 7, i32 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_SEL7:%.*]] = select [[VEC_ICMP7]], [[VEC_SEL6]], [[VEC_SEL4]] +; CHECK-VF4IC4-NEXT: [[FIN_ICMP:%.*]] = icmp ne [[VEC_SEL7]], shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[FIN_ICMP]]) +; CHECK-VF4IC4-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3 +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %0 = phi i64 [ 0, %entry ], [ %6, %for.body ] + %1 = phi i32 [ 3, %entry ], [ %5, %for.body ] + %2 = getelementptr inbounds i32, i32* %v, i64 %0 + %3 = load i32, i32* %2, align 4 + %4 = icmp eq i32 %3, 3 + %5 = select i1 %4, i32 %1, i32 7 + %6 = add nuw nsw i64 %0, 1 + %7 = icmp eq i64 %6, %n + br i1 %7, label %exit, label %for.body, !llvm.loop !0 + +exit: ; preds = %for.body + ret i32 %5 +} + +define i32 @select_i32_from_icmp(i32* nocapture readonly %v, i32 %a, i32 %b, i64 %n) #0 { +; CHECK-LABEL: @select_i32_from_icmp +; CHECK-VF4IC1: vector.ph: +; CHECK-VF4IC1: [[TMP1:%.*]] = insertelement poison, i32 %a, i32 0 +; CHECK-VF4IC1-NEXT: [[SPLAT_OF_A:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer +; 
CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = insertelement poison, i32 %b, i32 0 +; CHECK-VF4IC1-NEXT: [[SPLAT_OF_B:%.*]] = shufflevector [[TMP2]], poison, zeroinitializer +; CHECK-VF4IC1: vector.body: +; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi [ [[SPLAT_OF_A]], %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] +; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load +; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq [[VEC_LOAD]], shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select [[VEC_ICMP]], [[VEC_PHI]], [[SPLAT_OF_B]] +; CHECK-VF4IC1: middle.block: +; CHECK-VF4IC1-NEXT: [[FIN_INS:%.*]] = insertelement poison, i32 %a, i32 0 +; CHECK-VF4IC1-NEXT: [[FIN_SPLAT:%.*]] = shufflevector [[FIN_INS]], poison, zeroinitializer +; CHECK-VF4IC1-NEXT: [[FIN_CMP:%.*]] = icmp ne [[VEC_SEL]], [[FIN_SPLAT]] +; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[FIN_CMP]]) +; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 %a +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %0 = phi i64 [ 0, %entry ], [ %6, %for.body ] + %1 = phi i32 [ %a, %entry ], [ %5, %for.body ] + %2 = getelementptr inbounds i32, i32* %v, i64 %0 + %3 = load i32, i32* %2, align 4 + %4 = icmp eq i32 %3, 3 + %5 = select i1 %4, i32 %1, i32 %b + %6 = add nuw nsw i64 %0, 1 + %7 = icmp eq i64 %6, %n + br i1 %7, label %exit, label %for.body, !llvm.loop !0 + +exit: ; preds = %for.body + ret i32 %5 +} + +define i32 @select_const_i32_from_fcmp(float* nocapture readonly %v, i64 %n) #0 { +; CHECK-LABEL: @select_const_i32_from_fcmp +; CHECK-VF4IC1: vector.body: +; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 2, i32 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] +; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load +; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = fcmp fast ueq [[VEC_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i32 0), poison, zeroinitializer) 
+; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select [[VEC_ICMP]], [[VEC_PHI]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-VF4IC1: middle.block: +; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne [[VEC_SEL]], shufflevector ( insertelement ( poison, i32 2, i32 0), poison, zeroinitializer) +; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[FIN_ICMP]]) +; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 1, i32 2 +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %0 = phi i64 [ 0, %entry ], [ %6, %for.body ] + %1 = phi i32 [ 2, %entry ], [ %5, %for.body ] + %2 = getelementptr inbounds float, float* %v, i64 %0 + %3 = load float, float* %2, align 4 + %4 = fcmp fast ueq float %3, 3.0 + %5 = select i1 %4, i32 %1, i32 1 + %6 = add nuw nsw i64 %0, 1 + %7 = icmp eq i64 %6, %n + br i1 %7, label %exit, label %for.body, !llvm.loop !0 + +exit: ; preds = %for.body + ret i32 %5 +} + +define float @select_const_f32_from_icmp(i32* nocapture readonly %v, i64 %n) #0 { +; CHECK: @select_const_f32_from_icmp +; CHECK-NOT: vector.body +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %0 = phi i64 [ 0, %entry ], [ %6, %for.body ] + %1 = phi fast float [ 3.0, %entry ], [ %5, %for.body ] + %2 = getelementptr inbounds i32, i32* %v, i64 %0 + %3 = load i32, i32* %2, align 4 + %4 = icmp eq i32 %3, 3 + %5 = select fast i1 %4, float %1, float 7.0 + %6 = add nuw nsw i64 %0, 1 + %7 = icmp eq i64 %6, %n + br i1 %7, label %exit, label %for.body, !llvm.loop !0 + +exit: ; preds = %for.body + ret float %5 +} + +attributes #0 = { "target-features"="+sve" } + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} Index: llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll +++ 
llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll
@@ -7,8 +7,8 @@
 ; CHECK-LABEL: @reduction_add_trunc(
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( zeroinitializer, i32 255, i32 0), %vector.ph ], [ [[TMP34:%.*]], %vector.body ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[TMP36:%.*]], %vector.body ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, i32 0, i32 0), poison, zeroinitializer), i32 255, i32 0), %vector.ph ], [ [[TMP34:%.*]], %vector.body ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 0, i32 0), poison, zeroinitializer), %vector.ph ], [ [[TMP36:%.*]], %vector.body ]
 ; CHECK: [[TMP14:%.*]] = and [[VEC_PHI]], shufflevector ( insertelement ( poison, i32 255, i32 0), poison, zeroinitializer)
 ; CHECK-NEXT: [[TMP15:%.*]] = and [[VEC_PHI1]], shufflevector ( insertelement ( poison, i32 255, i32 0), poison, zeroinitializer)
 ; CHECK: [[WIDE_LOAD:%.*]] = load , *
Index: llvm/test/Transforms/LoopVectorize/select-cmp.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/select-cmp.ll
@@ -0,0 +1,293 @@
+; RUN: opt -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC1 --check-prefix=CHECK
+; RUN: opt -loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC4 --check-prefix=CHECK
+; RUN: opt -loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK-VF1IC4 --check-prefix=CHECK
+
+; Integer compare selecting between the recurrence phi and the constant 7.
+define i32 @select_const_i32_from_icmp(i32* nocapture readonly %v, i64 %n) {
+; CHECK-LABEL: @select_const_i32_from_icmp
+; CHECK-VF4IC1: vector.body:
+; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x i32>
+; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> [[VEC_PHI]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+; CHECK-VF4IC1: middle.block:
+; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
+
+; CHECK-VF4IC4: vector.body:
+; CHECK-VF4IC4: [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
+; CHECK-VF4IC4: [[VEC_ICMP1:%.*]] = icmp eq <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC4-NEXT: [[VEC_ICMP2:%.*]] = icmp eq <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC4-NEXT: [[VEC_ICMP3:%.*]] = icmp eq <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC4-NEXT: [[VEC_ICMP4:%.*]] = icmp eq <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC4-NEXT: [[VEC_SEL1]] = select <4 x i1> [[VEC_ICMP1]], <4 x i32> [[VEC_PHI1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+; CHECK-VF4IC4-NEXT: [[VEC_SEL2]] = select <4 x i1> [[VEC_ICMP2]], <4 x i32> [[VEC_PHI2]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+; CHECK-VF4IC4-NEXT: [[VEC_SEL3]] = select <4 x i1> [[VEC_ICMP3]], <4 x i32> [[VEC_PHI3]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+; CHECK-VF4IC4-NEXT: [[VEC_SEL4]] = select <4 x i1> [[VEC_ICMP4]], <4 x i32> [[VEC_PHI4]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+; CHECK-VF4IC4: middle.block:
+; CHECK-VF4IC4-NEXT: [[VEC_ICMP5:%.*]] = icmp eq <4 x i32> [[VEC_SEL1]], <i32 7, i32 7, i32 7, i32 7>
+; CHECK-VF4IC4-NEXT: [[VEC_SEL5:%.*]] = select <4 x i1> [[VEC_ICMP5]], <4 x i32> [[VEC_SEL1]], <4 x i32> [[VEC_SEL2]]
+; CHECK-VF4IC4-NEXT: [[VEC_ICMP6:%.*]] = icmp eq <4 x i32> [[VEC_SEL5]], <i32 7, i32 7, i32 7, i32 7>
+; CHECK-VF4IC4-NEXT: [[VEC_SEL6:%.*]] = select <4 x i1> [[VEC_ICMP6]], <4 x i32> [[VEC_SEL5]], <4 x i32> [[VEC_SEL3]]
+; CHECK-VF4IC4-NEXT: [[VEC_ICMP7:%.*]] = icmp eq <4 x i32> [[VEC_SEL6]], <i32 7, i32 7, i32 7, i32 7>
+; CHECK-VF4IC4-NEXT: [[VEC_SEL_FIN:%.*]] = select <4 x i1> [[VEC_ICMP7]], <4 x i32> [[VEC_SEL6]], <4 x i32> [[VEC_SEL4]]
+; CHECK-VF4IC4-NEXT: [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL_FIN]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC4-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC4-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
+
+
+; CHECK-VF1IC4: vector.body:
+; CHECK-VF1IC4: [[VEC_PHI1:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI4:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
+; CHECK-VF1IC4: [[VEC_LOAD1:%.*]] = load i32
+; CHECK-VF1IC4-NEXT: [[VEC_LOAD2:%.*]] = load i32
+; CHECK-VF1IC4-NEXT: [[VEC_LOAD3:%.*]] = load i32
+; CHECK-VF1IC4-NEXT: [[VEC_LOAD4:%.*]] = load i32
+; CHECK-VF1IC4-NEXT: [[VEC_ICMP1:%.*]] = icmp eq i32 [[VEC_LOAD1]], 3
+; CHECK-VF1IC4-NEXT: [[VEC_ICMP2:%.*]] = icmp eq i32 [[VEC_LOAD2]], 3
+; CHECK-VF1IC4-NEXT: [[VEC_ICMP3:%.*]] = icmp eq i32 [[VEC_LOAD3]], 3
+; CHECK-VF1IC4-NEXT: [[VEC_ICMP4:%.*]] = icmp eq i32 [[VEC_LOAD4]], 3
+; CHECK-VF1IC4-NEXT: [[VEC_SEL1]] = select i1 [[VEC_ICMP1]], i32 [[VEC_PHI1]], i32 7
+; CHECK-VF1IC4-NEXT: [[VEC_SEL2]] = select i1 [[VEC_ICMP2]], i32 [[VEC_PHI2]], i32 7
+; CHECK-VF1IC4-NEXT: [[VEC_SEL3]] = select i1 [[VEC_ICMP3]], i32 [[VEC_PHI3]], i32 7
+; CHECK-VF1IC4-NEXT: [[VEC_SEL4]] = select i1 [[VEC_ICMP4]], i32 [[VEC_PHI4]], i32 7
+; CHECK-VF1IC4: middle.block:
+; CHECK-VF1IC4-NEXT: [[VEC_ICMP5:%.*]] = icmp eq i32 [[VEC_SEL1]], 7
+; CHECK-VF1IC4-NEXT: [[VEC_SEL5:%.*]] = select i1 [[VEC_ICMP5]], i32 [[VEC_SEL1]], i32 [[VEC_SEL2]]
+; CHECK-VF1IC4-NEXT: [[VEC_ICMP6:%.*]] = icmp eq i32 [[VEC_SEL5]], 7
+; CHECK-VF1IC4-NEXT: [[VEC_SEL6:%.*]] = select i1 [[VEC_ICMP6]], i32 [[VEC_SEL5]], i32 [[VEC_SEL3]]
+; CHECK-VF1IC4-NEXT: [[VEC_ICMP7:%.*]] = icmp eq i32 [[VEC_SEL6]], 7
+; CHECK-VF1IC4-NEXT: {{.*}} = select i1 [[VEC_ICMP7]], i32 [[VEC_SEL6]], i32 [[VEC_SEL4]]
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
+  %1 = phi i32 [ 3, %entry ], [ %5, %for.body ]
+  %2 = getelementptr inbounds i32, i32* %v, i64 %0
+  %3 = load i32, i32* %2, align 4
+  %4 = icmp eq i32 %3, 3
+  %5 = select i1 %4, i32 %1, i32 7
+  %6 = add nuw nsw i64 %0, 1
+  %7 = icmp eq i64 %6, %n
+  br i1 %7, label %exit, label %for.body
+
+exit:                                             ; preds = %for.body
+  ret i32 %5
+}
+
+
+; Same loop with the select operands swapped: constant first, phi second.
+define i32 @select_const_i32_from_icmp2(i32* nocapture readonly %v, i64 %n) {
+; CHECK-LABEL: @select_const_i32_from_icmp2
+; CHECK-VF4IC1: vector.body:
+; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x i32>
+; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1: middle.block:
+; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
+  %1 = phi i32 [ 3, %entry ], [ %5, %for.body ]
+  %2 = getelementptr inbounds i32, i32* %v, i64 %0
+  %3 = load i32, i32* %2, align 4
+  %4 = icmp eq i32 %3, 3
+  %5 = select i1 %4, i32 7, i32 %1
+  %6 = add nuw nsw i64 %0, 1
+  %7 = icmp eq i64 %6, %n
+  br i1 %7, label %exit, label %for.body
+
+exit:                                             ; preds = %for.body
+  ret i32 %5
+}
+
+
+; Start value %a and selected value %b are loop-invariant arguments.
+define i32 @select_i32_from_icmp(i32* nocapture readonly %v, i32 %a, i32 %b, i64 %n) {
+; CHECK-LABEL: @select_i32_from_icmp
+; CHECK-VF4IC1: vector.ph:
+; CHECK-VF4IC1: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 %a, i32 0
+; CHECK-VF4IC1-NEXT: [[SPLAT_OF_A:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 %b, i32 0
+; CHECK-VF4IC1-NEXT: [[SPLAT_OF_B:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1: vector.body:
+; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[SPLAT_OF_A]], %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x i32>
+; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> [[VEC_PHI]], <4 x i32> [[SPLAT_OF_B]]
+; CHECK-VF4IC1: middle.block:
+; CHECK-VF4IC1-NEXT: [[FIN_INS:%.*]] = insertelement <4 x i32> poison, i32 %a, i32 0
+; CHECK-VF4IC1-NEXT: [[FIN_SPLAT:%.*]] = shufflevector <4 x i32> [[FIN_INS]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[FIN_CMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], [[FIN_SPLAT]]
+; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_CMP]])
+; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 %a
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
+  %1 = phi i32 [ %a, %entry ], [ %5, %for.body ]
+  %2 = getelementptr inbounds i32, i32* %v, i64 %0
+  %3 = load i32, i32* %2, align 4
+  %4 = icmp eq i32 %3, 3
+  %5 = select i1 %4, i32 %1, i32 %b
+  %6 = add nuw nsw i64 %0, 1
+  %7 = icmp eq i64 %6, %n
+  br i1 %7, label %exit, label %for.body
+
+exit:                                             ; preds = %for.body
+  ret i32 %5
+}
+
+
+; The select condition is a floating-point compare carrying the fast flag.
+define i32 @select_const_i32_from_fcmp_fast(float* nocapture readonly %v, i64 %n) {
+; CHECK-LABEL: @select_const_i32_from_fcmp_fast
+; CHECK-VF4IC1: vector.body:
+; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2, i32 2, i32 2, i32 2>, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x float>
+; CHECK-VF4IC1-NEXT: [[VEC_FCMP:%.*]] = fcmp fast ueq <4 x float> [[VEC_LOAD]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
+; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <4 x i1> [[VEC_FCMP]], <4 x i32> [[VEC_PHI]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-VF4IC1: middle.block:
+; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 1, i32 2
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
+  %1 = phi i32 [ 2, %entry ], [ %5, %for.body ]
+  %2 = getelementptr inbounds float, float* %v, i64 %0
+  %3 = load float, float* %2, align 4
+  %4 = fcmp fast ueq float %3, 3.0
+  %5 = select i1 %4, i32 %1, i32 1
+  %6 = add nuw nsw i64 %0, 1
+  %7 = icmp eq i64 %6, %n
+  br i1 %7, label %exit, label %for.body
+
+exit:                                             ; preds = %for.body
+  ret i32 %5
+}
+
+
+; The select condition is a floating-point compare without fast-math flags.
+define i32 @select_const_i32_from_fcmp(float* nocapture readonly %v, i64 %n) {
+; CHECK-LABEL: @select_const_i32_from_fcmp
+; CHECK-VF4IC1: vector.body:
+; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2, i32 2, i32 2, i32 2>, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x float>
+; CHECK-VF4IC1-NEXT: [[VEC_FCMP:%.*]] = fcmp ueq <4 x float> [[VEC_LOAD]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
+; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <4 x i1> [[VEC_FCMP]], <4 x i32> [[VEC_PHI]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-VF4IC1: middle.block:
+; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 1, i32 2
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
+  %1 = phi i32 [ 2, %entry ], [ %5, %for.body ]
+  %2 = getelementptr inbounds float, float* %v, i64 %0
+  %3 = load float, float* %2, align 4
+  %4 = fcmp ueq float %3, 3.0
+  %5 = select i1 %4, i32 %1, i32 1
+  %6 = add nuw nsw i64 %0, 1
+  %7 = icmp eq i64 %6, %n
+  br i1 %7, label %exit, label %for.body
+
+exit:                                             ; preds = %for.body
+  ret i32 %5
+}
+
+
+; Negative tests
+
+
+; The recurrence is a float rather than an integer.
+define float @select_const_f32_from_icmp(i32* nocapture readonly %v, i64 %n) {
+; CHECK-LABEL: @select_const_f32_from_icmp
+; CHECK-NOT: vector.body
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
+  %1 = phi fast float [ 3.0, %entry ], [ %5, %for.body ]
+  %2 = getelementptr inbounds i32, i32* %v, i64 %0
+  %3 = load i32, i32* %2, align 4
+  %4 = icmp eq i32 %3, 3
+  %5 = select fast i1 %4, float %1, float 7.0
+  %6 = add nuw nsw i64 %0, 1
+  %7 = icmp eq i64 %6, %n
+  br i1 %7, label %exit, label %for.body
+
+exit:                                             ; preds = %for.body
+  ret float %5
+}
+
+
+; The icmp has a second user (the zext) besides the select.
+define i32 @select_const_i32_from_icmp_mul_use(i32* nocapture readonly %v1, i32* %v2, i64 %n) {
+; CHECK-LABEL: @select_const_i32_from_icmp_mul_use
+; CHECK-NOT: vector.body
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %0 = phi i64 [ 0, %entry ], [ %8, %for.body ]
+  %1 = phi i32 [ 3, %entry ], [ %6, %for.body ]
+  %2 = phi i32 [ 0, %entry ], [ %7, %for.body ]
+  %3 = getelementptr inbounds i32, i32* %v1, i64 %0
+  %4 = load i32, i32* %3, align 4
+  %5 = icmp eq i32 %4, 3
+  %6 = select i1 %5, i32 %1, i32 7
+  %7 = zext i1 %5 to i32
+  %8 = add nuw nsw i64 %0, 1
+  %9 = icmp eq i64 %8, %n
+  br i1 %9, label %exit, label %for.body
+
+exit:                                             ; preds = %for.body
+  store i32 %7, i32* %v2, align 4
+  ret i32 %6
+}
+
+
+; The selected value %5 is loaded inside the loop, so it is not invariant.
+define i32 @select_variant_i32_from_icmp(i32* nocapture readonly %v1, i32* nocapture readonly %v2, i64 %n) {
+; CHECK-LABEL: @select_variant_i32_from_icmp
+; CHECK-NOT: vector.body
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %0 = phi i64 [ 0, %entry ], [ %8, %for.body ]
+  %1 = phi i32 [ 3, %entry ], [ %7, %for.body ]
+  %2 = getelementptr inbounds i32, i32* %v1, i64 %0
+  %3 = load i32, i32* %2, align 4
+  %4 = getelementptr inbounds i32, i32* %v2, i64 %0
+  %5 = load i32, i32* %4, align 4
+  %6 = icmp eq i32 %3, 3
+  %7 = select i1 %6, i32 %1, i32 %5
+  %8 = add nuw nsw i64 %0, 1
+  %9 = icmp eq i64 %8, %n
+  br i1 %9, label %exit, label %for.body
+
+exit:                                             ; preds = %for.body
+  ret i32 %7
+}