diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -470,10 +470,11 @@ /// on, while the old loop will be used as the scalar remainder. Control flow /// is generated around the vectorized (and scalar epilogue) loops consisting /// of various checks and bypasses. Return the pre-header block of the new - /// loop. - /// In the case of epilogue vectorization, this function is overriden to - /// handle the more complex control flow around the loops. - virtual BasicBlock *createVectorizedLoopSkeleton(); + /// loop and the start value for the canonical induction, if it is != 0. The + /// latter is the case when vectorizing the epilogue loop. In the case of + /// epilogue vectorization, this function is overriden to handle the more + /// complex control flow around the loops. + virtual std::pair createVectorizedLoopSkeleton(); /// Widen a single call instruction within the innermost loop. void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, @@ -507,10 +508,11 @@ /// Widen an integer or floating-point induction variable \p IV. If \p Trunc /// is provided, the integer induction variable will first be truncated to - /// the corresponding type. + /// the corresponding type. \p CanonicalIV is the scalar value generated for + /// the canonical induction variable. void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID, Value *Start, TruncInst *Trunc, VPValue *Def, - VPTransformState &State); + VPTransformState &State, Value *CanonicalIV); /// Construct the vector value of a scalarized value \p V one lane at a time. void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, @@ -573,9 +575,10 @@ Value *CountRoundDown, Value *EndValue, BasicBlock *MiddleBlock); - /// Create a new induction variable inside L. - PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, - Value *Step, Instruction *DL); + /// Introduce a conditional branch (on true, condition to be set later) at the + /// end of the header=latch connecting it to itself (across the backedge) and + /// to the exit block of \p L. + void createHeaderBranch(Loop *L); /// Handle all cross-iteration phis in the header. void fixCrossIterationPHIs(VPTransformState &State); @@ -678,14 +681,13 @@ Loop *createVectorLoopSkeleton(StringRef Prefix); /// Create new phi nodes for the induction variables to resume iteration count - /// in the scalar epilogue, from where the vectorized loop left off (given by - /// \p VectorTripCount). + /// in the scalar epilogue, from where the vectorized loop left off. /// In cases where the loop skeleton is more complicated (eg. epilogue /// vectorization) and the resume values can come from an additional bypass /// block, the \p AdditionalBypass pair provides information about the bypass /// block and the end value on the edge from bypass to this loop. void createInductionResumeValues( - Loop *L, Value *VectorTripCount, + Loop *L, std::pair AdditionalBypass = {nullptr, nullptr}); /// Complete the loop skeleton by adding debug MDs, creating appropriate @@ -788,12 +790,6 @@ /// A list of all bypass blocks. The first block is the entry of the loop. SmallVector LoopBypassBlocks; - /// The new Induction variable which was added to the new block. - PHINode *Induction = nullptr; - - /// The induction variable of the old basic block. 
- PHINode *OldInduction = nullptr; - /// Store instructions that were predicated. SmallVector PredicatedInstructions; @@ -899,14 +895,16 @@ // Override this function to handle the more complex control flow around the // three loops. - BasicBlock *createVectorizedLoopSkeleton() final override { + std::pair + createVectorizedLoopSkeleton() final override { return createEpilogueVectorizedLoopSkeleton(); } /// The interface for creating a vectorized skeleton using one of two /// different strategies, each corresponding to one execution of the vplan /// as described above. - virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; + virtual std::pair + createEpilogueVectorizedLoopSkeleton() = 0; /// Holds and updates state information required to vectorize the main loop /// and its epilogue in two separate passes. This setup helps us avoid @@ -934,7 +932,8 @@ EPI, LVL, CM, BFI, PSI, Check) {} /// Implements the interface for creating a vectorized skeleton using the /// *main loop* strategy (ie the first pass of vplan execution). - BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; + std::pair + createEpilogueVectorizedLoopSkeleton() final override; protected: /// Emits an iteration count bypass check once for the main loop (when \p @@ -963,7 +962,8 @@ EPI, LVL, CM, BFI, PSI, Checks) {} /// Implements the interface for creating a vectorized skeleton using the /// *epilogue loop* strategy (ie the second pass of vplan execution). - BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; + std::pair + createEpilogueVectorizedLoopSkeleton() final override; protected: /// Emits an iteration count bypass check after the main vector loop has @@ -1059,16 +1059,16 @@ return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); } +namespace llvm { + /// Return a value for Step multiplied by VF. -static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, - int64_t Step) { +Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, + int64_t Step) { assert(Ty->isIntegerTy() && "Expected an integer step"); Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue()); return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; } -namespace llvm { - /// Return the runtime value for VF. Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) { Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); @@ -1153,7 +1153,8 @@ // will lead to gather/scatter instructions, which don't need to be // handled. if (isa(CurRec) || - isa(CurRec)) + isa(CurRec) || + isa(CurRec)) continue; // This recipe contributes to the address computation of a widen @@ -2467,8 +2468,7 @@ // placement of all induction updates. auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); auto *Br = cast(LoopVectorLatch->getTerminator()); - auto *ICmp = cast(Br->getCondition()); - LastInduction->moveBefore(ICmp); + LastInduction->moveBefore(Br); LastInduction->setName("vec.ind.next"); VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); @@ -2490,14 +2490,18 @@ return llvm::any_of(IV->users(), isScalarInst); } -void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, - const InductionDescriptor &ID, - Value *Start, TruncInst *Trunc, - VPValue *Def, - VPTransformState &State) { +/// Returns true if \p ID starts at 0 and has a step of 1. 
+static bool isCanonicalID(const InductionDescriptor &ID) { + if (!ID.getConstIntStepValue() || !ID.getConstIntStepValue()->isOne()) + return false; + auto *StartC = dyn_cast(ID.getStartValue()); + return StartC && StartC->isZero(); +} + +void InnerLoopVectorizer::widenIntOrFpInduction( + PHINode *IV, const InductionDescriptor &ID, Value *Start, TruncInst *Trunc, + VPValue *Def, VPTransformState &State, Value *CanonicalIV) { IRBuilder<> &Builder = State.Builder; - assert((IV->getType()->isIntegerTy() || IV != OldInduction) && - "Primary induction variable must have an integer type"); assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); assert(!State.VF.isZero() && "VF must be non-zero"); @@ -2525,11 +2529,11 @@ // induction variable and step. Otherwise, derive these values from the // induction descriptor. auto CreateScalarIV = [&](Value *&Step) -> Value * { - Value *ScalarIV = Induction; - if (IV != OldInduction) { + Value *ScalarIV = CanonicalIV; + if (!isCanonicalID(ID) || CanonicalIV->getType() != IV->getType()) { ScalarIV = IV->getType()->isIntegerTy() - ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) - : Builder.CreateCast(Instruction::SIToFP, Induction, + ? Builder.CreateSExtOrTrunc(ScalarIV, IV->getType()) + : Builder.CreateCast(Instruction::SIToFP, ScalarIV, IV->getType()); ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID, State.CFG.PrevBB); @@ -3003,43 +3007,21 @@ PredicatedInstructions.push_back(Cloned); } -PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, - Value *End, Value *Step, - Instruction *DL) { +void InnerLoopVectorizer::createHeaderBranch(Loop *L) { BasicBlock *Header = L->getHeader(); - BasicBlock *Latch = L->getLoopLatch(); - // As we're just creating this loop, it's possible no latch exists - // yet. If so, use the header as this will be a single block loop. - if (!Latch) - Latch = Header; - - IRBuilder<> B(&*Header->getFirstInsertionPt()); - Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); - setDebugLocFromInst(OldInst, &B); - auto *Induction = B.CreatePHI(Start->getType(), 2, "index"); + assert(!L->getLoopLatch() && "loop should not have a latch at this point"); - B.SetInsertPoint(Latch->getTerminator()); + IRBuilder<> B(Header->getTerminator()); + Instruction *OldInst = + getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); setDebugLocFromInst(OldInst, &B); - // Create i+1 and fill the PHINode. - // - // If the tail is not folded, we know that End - Start >= Step (either - // statically or through the minimum iteration checks). We also know that both - // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV + - // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned - // overflows and we can mark the induction increment as NUW. - Value *Next = B.CreateAdd(Induction, Step, "index.next", - /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false); - Induction->addIncoming(Start, L->getLoopPreheader()); - Induction->addIncoming(Next, Latch); - // Create the compare. - Value *ICmp = B.CreateICmpEQ(Next, End); - B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); + // Connect the header to the exit and header blocks and replace the old + // terminator. + B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header); // Now we have two terminators. Remove the old one from the block. 
- Latch->getTerminator()->eraseFromParent(); - - return Induction; + Header->getTerminator()->eraseFromParent(); } Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { @@ -3446,12 +3428,13 @@ } void InnerLoopVectorizer::createInductionResumeValues( - Loop *L, Value *VectorTripCount, - std::pair AdditionalBypass) { - assert(VectorTripCount && L && "Expected valid arguments"); + Loop *L, std::pair AdditionalBypass) { assert(((AdditionalBypass.first && AdditionalBypass.second) || (!AdditionalBypass.first && !AdditionalBypass.second)) && "Inconsistent information about additional bypass."); + + Value *VectorTripCount = getOrCreateVectorTripCount(L); + assert(VectorTripCount && L && "Expected valid arguments"); // We are going to resume the execution of the scalar loop. // Go over all of the induction variables that we found and fix the // PHIs that are left in the scalar version of the loop. @@ -3459,6 +3442,7 @@ // iteration in the vectorized loop. // If we come from a bypass edge then we need to start from the original // start value. + Instruction *OldInduction = Legal->getPrimaryInduction(); for (auto &InductionEntry : Legal->getInductionVars()) { PHINode *OrigPhi = InductionEntry.first; InductionDescriptor II = InductionEntry.second; @@ -3583,7 +3567,8 @@ return LoopVectorPreHeader; } -BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { +std::pair +InnerLoopVectorizer::createVectorizedLoopSkeleton() { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the @@ -3648,33 +3633,12 @@ // faster. emitMemRuntimeChecks(Lp, LoopScalarPreHeader); - // Some loops have a single integer induction variable, while other loops - // don't. One example is c++ iterators that often have multiple pointer - // induction variables. In the code below we also support a case where we - // don't have a single induction variable. - // - // We try to obtain an induction variable from the original loop as hard - // as possible. However if we don't find one that: - // - is an integer - // - counts from zero, stepping by one - // - is the size of the widest induction variable type - // then we create a new one. - OldInduction = Legal->getPrimaryInduction(); - Type *IdxTy = Legal->getWidestInductionType(); - Value *StartIdx = ConstantInt::get(IdxTy, 0); - // The loop step is equal to the vectorization factor (num of SIMD elements) - // times the unroll factor (num of SIMD instructions). - Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); - Value *Step = createStepForVF(Builder, IdxTy, VF, UF); - Value *CountRoundDown = getOrCreateVectorTripCount(Lp); - Induction = - createInductionVariable(Lp, StartIdx, CountRoundDown, Step, - getDebugLocFromInstOrOperands(OldInduction)); + createHeaderBranch(Lp); // Emit phis for the new starting index of the scalar loop. - createInductionResumeValues(Lp, CountRoundDown); + createInductionResumeValues(Lp); - return completeLoopSkeleton(Lp, OrigLoopID); + return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; } // Fix up external users of the induction variable. 
At this point, we are @@ -4567,6 +4531,9 @@ InductionDescriptor II = Legal->getInductionVars().lookup(P); const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); + auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV(); + PHINode *CanonicalIV = cast(State.get(IVR, 0)); + // FIXME: The newly created binary instructions should contain nsw/nuw flags, // which can be found from the original scalar operations. switch (II.getKind()) { @@ -4582,7 +4549,7 @@ if (Cost->isScalarAfterVectorization(P, State.VF)) { // This is the normalized GEP that starts counting at zero. Value *PtrInd = - Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); + Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType()); // Determine the number of scalars we need to generate for each unroll // iteration. If the instruction is uniform, we only need to generate the // first lane. Otherwise, we generate all VF values. @@ -4615,7 +4582,7 @@ Value *ScalarStartValue = II.getStartValue(); Type *ScStValueType = ScalarStartValue->getType(); PHINode *NewPointerPhi = - PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); + PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); // A pointer induction, performed by using a gep @@ -7962,8 +7929,9 @@ // 1. Create a new empty loop. Unlink the old loop and connect the new one. VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; - State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); - State.CanonicalIV = ILV.Induction; + Value *CanonicalIVStartValue; + std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = + ILV.createVectorizedLoopSkeleton(); ILV.collectPoisonGeneratingRecipes(State); ILV.printDebugTracesAtStart(); @@ -7977,7 +7945,9 @@ //===------------------------------------------------===// // 2. Copy and widen instructions from the old loop into the new loop. - BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), State); + BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), + ILV.getOrCreateVectorTripCount(nullptr), + CanonicalIVStartValue, State); BestVPlan.execute(&State); // 3. Fix the vectorized code: take care of header phi's, live-outs, @@ -8086,7 +8056,8 @@ /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. -BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { +std::pair +EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { MDNode *OrigLoopID = OrigLoop->getLoopID(); Loop *Lp = createVectorLoopSkeleton(""); @@ -8115,24 +8086,16 @@ emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); // Generate the induction variable. - OldInduction = Legal->getPrimaryInduction(); - Type *IdxTy = Legal->getWidestInductionType(); - Value *StartIdx = ConstantInt::get(IdxTy, 0); - - IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt()); - Value *Step = getRuntimeVF(B, IdxTy, VF * UF); Value *CountRoundDown = getOrCreateVectorTripCount(Lp); EPI.VectorTripCount = CountRoundDown; - Induction = - createInductionVariable(Lp, StartIdx, CountRoundDown, Step, - getDebugLocFromInstOrOperands(OldInduction)); + createHeaderBranch(Lp); // Skip induction resume value creation here because they will be created in // the second pass. 
If we created them here, they wouldn't be used anyway, // because the vplan in the second pass still contains the inductions from the // original loop. - return completeLoopSkeleton(Lp, OrigLoopID); + return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; } void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { @@ -8214,7 +8177,7 @@ /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. -BasicBlock * +std::pair EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { MDNode *OrigLoopID = OrigLoop->getLoopID(); Loop *Lp = createVectorLoopSkeleton("vec.epilog."); @@ -8280,13 +8243,7 @@ EPI.MainLoopIterationCountCheck); // Generate the induction variable. - OldInduction = Legal->getPrimaryInduction(); - Value *CountRoundDown = getOrCreateVectorTripCount(Lp); - Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); - Value *StartIdx = EPResumeVal; - Induction = - createInductionVariable(Lp, StartIdx, CountRoundDown, Step, - getDebugLocFromInstOrOperands(OldInduction)); + createHeaderBranch(Lp); // Generate induction resume values. These variables save the new starting // indexes for the scalar loop. They are used to test if there are any tail @@ -8295,12 +8252,11 @@ // check, then the resume value for the induction variable comes from // the trip count of the main vector loop, hence passing the AdditionalBypass // argument. - createInductionResumeValues(Lp, CountRoundDown, - {VecEpilogueIterationCountCheck, - EPI.VectorTripCount} /* AdditionalBypass */); + createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck, + EPI.VectorTripCount} /* AdditionalBypass */); AddRuntimeUnrollDisableMetaData(Lp); - return completeLoopSkeleton(Lp, OrigLoopID); + return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal}; } BasicBlock * @@ -8957,6 +8913,33 @@ } } +// Add a VPCanonicalIVPHIRecipe starting at 0 to the header and a +// CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF to the +// latch. +static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, + bool HasNUW, bool IsVPlanNative) { + Value *StartIdx = ConstantInt::get(IdxTy, 0); + auto *StartV = Plan.getOrAddVPValue(StartIdx); + + auto *CanonicalIV = new VPCanonicalIVPHIRecipe(StartV, DL); + VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); + VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); + if (IsVPlanNative) + Header = cast(Header->getSingleSuccessor()); + Header->insert(CanonicalIV, Header->begin()); + + auto *CanonicalIVIncrement = + new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW + : VPInstruction::CanonicalIVIncrement, + {CanonicalIV}, DL); + CanonicalIV->addOperand(CanonicalIVIncrement); + + VPBasicBlock *EB = TopRegion->getExitBasicBlock(); + if (IsVPlanNative) + EB = cast(EB->getSinglePredecessor()); + EB->appendRecipe(CanonicalIVIncrement); +} + VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VFRange &Range, SmallPtrSetImpl &DeadInstructions, const MapVector &SinkAfter) { @@ -9024,6 +9007,12 @@ auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); auto Plan = std::make_unique(TopRegion); + Instruction *DLInst = + getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); + addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), + DLInst ? 
DLInst->getDebugLoc() : DebugLoc(), + !CM.foldTailByMasking(), false); + // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. LoopBlocksDFS DFS(OrigLoop); @@ -9316,6 +9305,9 @@ OrigLoop, Plan, [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, DeadInstructions, *PSE.getSE()); + + addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), + true, true); return Plan; } @@ -9673,9 +9665,10 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); - State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(), - getStartValue()->getLiveInIRValue(), - getTruncInst(), getVPValue(0), State); + auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); + State.ILV->widenIntOrFpInduction( + IV, getInductionDescriptor(), getStartValue()->getLiveInIRValue(), + getTruncInst(), getVPValue(0), State, CanonicalIV); } void VPWidenPHIRecipe::execute(VPTransformState &State) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -69,6 +69,9 @@ /// vectors it is an expression determined at runtime. Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF); +/// Return a value for Step multiplied by VF. +Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, int64_t Step); + /// A range of powers-of-2 vectorization factors with fixed start and /// adjustable end. The range includes start and excludes end, e.g.,: /// [1, 9) = {1, 2, 4, 8} @@ -790,6 +793,8 @@ SLPLoad, SLPStore, ActiveLaneMask, + CanonicalIVIncrement, + CanonicalIVIncrementNUW, }; private: @@ -1074,14 +1079,18 @@ /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPRecipeBase *B) { - return B->getVPDefID() == VPRecipeBase::VPWidenPHISC || + return B->getVPDefID() == VPRecipeBase::VPCanonicalIVPHISC || B->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC || - B->getVPDefID() == VPRecipeBase::VPReductionPHISC; + B->getVPDefID() == VPRecipeBase::VPReductionPHISC || + B->getVPDefID() == VPRecipeBase::VPWidenIntOrFpInductionSC || + B->getVPDefID() == VPRecipeBase::VPWidenPHISC; } static inline bool classof(const VPValue *V) { - return V->getVPValueID() == VPValue::VPVWidenPHISC || + return V->getVPValueID() == VPValue::VPVCanonicalIVPHISC || V->getVPValueID() == VPValue::VPVFirstOrderRecurrencePHISC || - V->getVPValueID() == VPValue::VPVReductionPHISC; + V->getVPValueID() == VPValue::VPVReductionPHISC || + V->getVPValueID() == VPValue::VPVWidenIntOrFpInductionSC || + V->getVPValueID() == VPValue::VPVWidenPHISC; } /// Generate the phi nodes. 
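To make the new recipes concrete: after this change the scalar canonical induction of the generated vector loop comes from VPCanonicalIVPHIRecipe (the "index" phi), the CanonicalIVIncrement{NUW} VPInstruction (the VF * UF step and the "index.next" add built via createStepForVF), and the exit compare/branch that VPlan::execute now emits against the vector trip count. A rough sketch of the resulting IR, assuming a scalable VF where VF * UF is vscale * 32 as in the AArch64 test updates below; value names follow the tests and are illustrative only, not guaranteed output:

vector.body:
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; ... widened recipes, indexed by %index ...
  %vs = call i64 @llvm.vscale.i64()
  %step = mul i64 %vs, 32                  ; VF * UF, materialized by createStepForVF
  %index.next = add nuw i64 %index, %step  ; CanonicalIVIncrementNUW
  %exit.cond = icmp eq i64 %index.next, %n.vec
  br i1 %exit.cond, label %middle.block, label %vector.body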
@@ -1133,6 +1142,9 @@ static inline bool classof(const VPRecipeBase *B) { return B->getVPDefID() == VPRecipeBase::VPWidenPHISC; } + static inline bool classof(const VPHeaderPHIRecipe *R) { + return R->getVPDefID() == VPRecipeBase::VPWidenPHISC; + } static inline bool classof(const VPValue *V) { return V->getVPValueID() == VPValue::VPVWidenPHISC; } @@ -1171,8 +1183,8 @@ static inline bool classof(const VPRecipeBase *R) { return R->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC; } - static inline bool classof(const VPWidenPHIRecipe *D) { - return D->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC; + static inline bool classof(const VPHeaderPHIRecipe *R) { + return R->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC; } static inline bool classof(const VPValue *V) { return V->getVPValueID() == VPValue::VPVFirstOrderRecurrencePHISC; @@ -1217,12 +1229,12 @@ static inline bool classof(const VPRecipeBase *R) { return R->getVPDefID() == VPRecipeBase::VPReductionPHISC; } + static inline bool classof(const VPHeaderPHIRecipe *R) { + return R->getVPDefID() == VPRecipeBase::VPReductionPHISC; + } static inline bool classof(const VPValue *V) { return V->getVPValueID() == VPValue::VPVReductionPHISC; } - static inline bool classof(const VPWidenPHIRecipe *R) { - return R->getVPDefID() == VPRecipeBase::VPReductionPHISC; - } /// Generate the phi/select nodes. void execute(VPTransformState &State) override; @@ -1620,6 +1632,36 @@ #endif }; +/// Canonical scalar induction phi of the vector loop. Starting at the specified +/// start value (either 0 or the resume value when vectorizing the epilogue +/// loop). VPWidenCanonicalIVRecipe represents the vector version of the +/// canonical induction variable. +class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe { + DebugLoc DL; + +public: + VPCanonicalIVPHIRecipe(VPValue *StartV, DebugLoc DL) + : VPHeaderPHIRecipe(VPValue::VPVCanonicalIVPHISC, VPCanonicalIVPHISC, + nullptr, StartV), + DL(DL) {} + + ~VPCanonicalIVPHIRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPCanonicalIVPHISC; + } + + /// Generate the canonical scalar induction phi of the vector loop. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A Recipe for widening the canonical induction variable of the vector loop. class VPWidenCanonicalIVRecipe : public VPRecipeBase, public VPValue { public: @@ -2139,6 +2181,9 @@ /// the tail. It equals TripCount - 1. VPValue *BackedgeTakenCount = nullptr; + /// Represents the vector trip count. + VPValue VectorTripCount; + /// Holds a mapping between Values and their corresponding VPValue inside /// VPlan. Value2VPValueTy Value2VPValue; @@ -2179,7 +2224,8 @@ } /// Prepare the plan for execution, setting up the required live-in values. - void prepareToExecute(Value *TripCount, VPTransformState &State); + void prepareToExecute(Value *TripCount, Value *VectorTripCount, + Value *CanonicalIVStartValue, VPTransformState &State); /// Generate the IR code for this VPlan. void execute(struct VPTransformState *State); @@ -2207,6 +2253,9 @@ return BackedgeTakenCount; } + /// The vector trip count. 
+ VPValue &getVectorTripCount() { return VectorTripCount; } + /// Mark the plan to indicate that using Value2VPValue is not safe any /// longer, because it may be stale. void disableValue2VPValue() { Value2VPValueEnabled = false; } @@ -2299,6 +2348,21 @@ return !VPV->getDef() || (RepR && RepR->isUniform()); } + /// Returns the VPRegionBlock of the vector loop. + VPRegionBlock *getVectorLoopRegion() { + return cast(getEntry()); + } + + /// Returns the canonical induction recipe of the vector loop. + VPCanonicalIVPHIRecipe *getCanonicalIV() { + VPBasicBlock *EntryVPBB = getVectorLoopRegion()->getEntryBasicBlock(); + if (EntryVPBB->empty()) { + // VPlan native path. + EntryVPBB = cast(EntryVPBB->getSingleSuccessor()); + } + return cast(&*EntryVPBB->begin()); + } + private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -711,6 +711,24 @@ } break; } + + case VPInstruction::CanonicalIVIncrement: + case VPInstruction::CanonicalIVIncrementNUW: { + if (Part != 0) { + Value *Next = State.get(this, 0); + State.set(this, Next, Part); + break; + } + bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW; + auto *Phi = State.get(getOperand(0), 0); + // The loop step is equal to the vectorization factor (num of SIMD elements) + // times the unroll factor (num of SIMD instructions). + Value *Step = createStepForVF(Builder, Phi->getType(), State.VF, State.UF); + Value *Next = Builder.CreateAdd(Phi, Step, "index.next", IsNUW, false); + State.set(this, Next, Part); + + break; + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -758,6 +776,12 @@ case VPInstruction::FirstOrderRecurrenceSplice: O << "first-order splice"; break; + case VPInstruction::CanonicalIVIncrement: + O << "VF * UF + "; + break; + case VPInstruction::CanonicalIVIncrementNUW: + O << "VF * UF +(nuw) "; + break; default: O << Instruction::getOpcodeName(getOpcode()); } @@ -786,7 +810,9 @@ FMF = FMFNew; } -void VPlan::prepareToExecute(Value *TripCountV, VPTransformState &State) { +void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, + Value *CanonicalIVStartValue, + VPTransformState &State) { // Check if the trip count is needed, and if so build it. if (TripCount && TripCount->getNumUsers()) { for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) @@ -805,6 +831,18 @@ for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) State.set(BackedgeTakenCount, VTCMO, Part); } + + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) + State.set(&VectorTripCount, VectorTripCountV, Part); + + // When vectorizing the epilogue loop, the canonical induction start value + // needs to be changed from zero to the value after the main vector loop. + if (CanonicalIVStartValue) { + VPValue *VPV = new VPValue(CanonicalIVStartValue); + addExternalDef(VPV); + auto *IV = getCanonicalIV(); + IV->setOperand(0, VPV); + } } /// Generate the code inside the body of the vectorized loop. Assumes a single @@ -842,28 +880,6 @@ for (VPBlockBase *Block : depth_first(Entry)) Block->execute(State); - // Fix the latch value of reduction and first-order recurrences phis in the - // vector loop. 
- VPBasicBlock *Header = Entry->getEntryBasicBlock(); - for (VPRecipeBase &R : Header->phis()) { - auto *PhiR = dyn_cast(&R); - if (!PhiR || !(isa(&R) || - isa(&R))) - continue; - // For first-order recurrences and in-order reduction phis, only a single - // part is generated, which provides the last part from the previous - // iteration. Otherwise all UF parts are generated. - bool SinglePartNeeded = isa(&R) || - cast(&R)->isOrdered(); - unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF; - for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { - Value *VecPhi = State->get(PhiR, Part); - Value *Val = State->get(PhiR->getBackedgeValue(), - SinglePartNeeded ? State->UF - 1 : Part); - cast(VecPhi)->addIncoming(Val, VectorLatchBB); - } - } - // Setup branch terminator successors for VPBBs in VPBBsToFix based on // VPBB's successors. for (auto VPBB : State->CFG.VPBBsToFix) { @@ -899,6 +915,47 @@ assert(Merged && "Could not merge last basic block with latch."); VectorLatchBB = LastBB; + // Fix the latch value of canonical, reduction and first-order recurrences + // phis in the vector loop. + VPBasicBlock *Header = Entry->getEntryBasicBlock(); + if (Header->empty()) { + assert(EnableVPlanNativePath); + Header = cast(Header->getSingleSuccessor()); + } + for (VPRecipeBase &R : Header->phis()) { + // Skip phi-like recipes that generate their backedege values themselves. + // TODO: Model their backedge values explicitly. + if (isa(&R) || isa(&R)) + continue; + + auto *PhiR = cast(&R); + // For canonical IV, first-order recurrences and in-order reduction phis, + // only a single part is generated, which provides the last part from the + // previous iteration. For non-ordered reductions all UF parts are + // generated. + bool SinglePartNeeded = isa(PhiR) || + isa(PhiR) || + cast(PhiR)->isOrdered(); + unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF; + + for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { + Value *Phi = State->get(PhiR, Part); + Value *Val = State->get(PhiR->getBackedgeValue(), + SinglePartNeeded ? State->UF - 1 : Part); + cast(Phi)->addIncoming(Val, VectorLatchBB); + } + } + + // Add the loop exit condition and branch based on the canonical induction. + auto *CanonicalIV = getCanonicalIV(); + // TODO: Model compare and branch explicitly in VPlan as recipes. + auto *Next = State->get(CanonicalIV->getBackedgeValue(), 0); + auto *TermBr = cast(VectorLatchBB->getTerminator()); + State->Builder.SetInsertPoint(TermBr); + auto *ICmp = + State->Builder.CreateICmpEQ(Next, State->get(&getVectorTripCount(), 0)); + TermBr->setCondition(ICmp); + // We do not attempt to preserve DT for outer loop vectorization currently. 
if (!EnableVPlanNativePath) updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB, @@ -1272,8 +1329,27 @@ } #endif +void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { + Value *Start = getStartValue()->getLiveInIRValue(); + PHINode *EntryPart = PHINode::Create( + Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt()); + EntryPart->addIncoming(Start, State.CFG.VectorPreHeader); + EntryPart->setDebugLoc(DL); + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) + State.set(this, EntryPart, Part); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EMIT "; + getVPSingleValue()->printAsOperand(O, SlotTracker); + O << " = CANONICAL-INDUCTION"; +} +#endif + void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { - Value *CanonicalIV = State.CanonicalIV; + Value *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); Type *STy = CanonicalIV->getType(); IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); ElementCount VF = State.VF; diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -102,6 +102,7 @@ // Phi-like VPValues. Need to be kept together. VPVBlendSC, + VPVCanonicalIVPHISC, VPVFirstOrderRecurrencePHISC, VPVWidenPHISC, VPVWidenIntOrFpInductionSC, @@ -333,6 +334,7 @@ // Phi-like recipes. Need to be kept together. VPBlendSC, + VPCanonicalIVPHISC, VPFirstOrderRecurrencePHISC, VPWidenPHISC, VPWidenIntOrFpInductionSC, diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -163,6 +163,13 @@ errs() << "VPlan entry block is not a VPBasicBlock\n"; return false; } + + if (!isa(&*Entry->begin())) { + errs() << "VPlan vector loop header does not start with a " + "VPCanonicalIVPHIRecipe\n"; + return false; + } + const VPBasicBlock *Exit = dyn_cast(TopRegion->getExit()); if (!Exit) { errs() << "VPlan exit block is not a VPBasicBlock\n"; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll @@ -31,30 +31,30 @@ ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP5]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16 -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP6]] -; 
CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to * -; CHECK-NEXT: store shufflevector ( insertelement ( poison, i8 1, i32 0), poison, zeroinitializer), * [[TMP15]], align 1 -; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP17:%.*]] = mul i32 [[TMP16]], 16 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP18]] to * -; CHECK-NEXT: store shufflevector ( insertelement ( poison, i8 1, i32 0), poison, zeroinitializer), * [[TMP19]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to * +; CHECK-NEXT: store shufflevector ( insertelement ( poison, i8 1, i32 0), poison, zeroinitializer), * [[TMP13]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 16 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP10]], i32 [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to * +; CHECK-NEXT: store shufflevector ( insertelement ( poison, i8 1, i32 0), poison, zeroinitializer), * [[TMP17]], align 1 +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 32 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: @@ -68,18 +68,18 @@ ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX4]], 0 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP21]] ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[TMP22]], i32 0 ; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP23]] to <8 x i8>* ; CHECK-NEXT: store <8 x i8> , <8 x i8>* [[TMP24]], align 1 -; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 8 -; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 1024 +; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX4]], 8 +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 1024 ; CHECK-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], 
!llvm.loop [[LOOP2:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N3:%.*]] = icmp eq i64 1024, 1024 -; CHECK-NEXT: br i1 [[CMP_N3]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N1:%.*]] = icmp eq i64 1024, 1024 +; CHECK-NEXT: br i1 [[CMP_N1]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -94,6 +94,7 @@ ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret void +; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll @@ -10,6 +10,7 @@ ; CHECK: VPlan 'Initial VPlan for VF={vscale x 2},UF>=1' { ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: loop.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next ; CHECK-NEXT: WIDEN-PHI %ptr.iv.1 = phi %start.1, %ptr.iv.1.next ; CHECK-NEXT: WIDEN-PHI %ptr.iv.2 = phi %start.2, %ptr.iv.2.next @@ -18,6 +19,7 @@ ; CHECK-NEXT: WIDEN ir<%lv> = load ir<%ptr.iv.2> ; CHECK-NEXT: WIDEN ir<%add> = add ir<%lv>, ir<1> ; CHECK-NEXT: WIDEN store ir<%ptr.iv.2>, ir<%add> +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * UF +(nuw) vp<[[CAN_IV]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } @@ -71,8 +73,8 @@ ; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]] -; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -140,7 +142,7 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i8* [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX3:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 ; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 1 @@ -152,7 +154,7 @@ ; CHECK-NEXT: [[TMP11:%.*]] = add [[DOTSPLAT]], [[TMP10]] ; CHECK-NEXT: [[VECTOR_GEP:%.*]] = mul [[TMP11]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], [[VECTOR_GEP]] -; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX3]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement [[TMP12]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, i8* [[TMP14]], i32 0 ; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8* [[TMP15]] to * @@ -161,20 +163,20 @@ ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq [[TMP17]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP20:%.*]] = mul i64 
[[TMP19]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP20]] -; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX3]], [[TMP20]] ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[PTR_PHI:%.*]] = phi i8* [ [[PTR_PHI_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[INDEX_NXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP22:%.*]] = load i8, i8* [[PTR_PHI]], align 1 ; CHECK-NEXT: [[PTR_PHI_NEXT]] = getelementptr inbounds i8, i8* [[PTR_PHI]], i64 1 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll @@ -74,8 +74,8 @@ ; CHECK-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP34:%.*]] = shl nuw nsw i64 [[TMP33]], 3 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP34]] -; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 @@ -266,8 +266,8 @@ ; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[TMP9]]) @@ -343,8 +343,8 @@ ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[PTR_IND]] = getelementptr i16, i16* [[POINTER_PHI]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll @@ -128,8 +128,8 @@ ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP4]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP5]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP5]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: @@ -182,8 +182,8 @@ ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP4:%.*]] = and <4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i32> [[TMP4]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP5]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP5]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: @@ -235,9 +235,9 @@ ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6]] = or <4 x i32> [[VEC_PHI]], [[TMP5]] -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: @@ -289,9 +289,9 @@ ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6]] = xor <4 x i32> [[VEC_PHI]], [[TMP5]] -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP7]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: @@ -344,8 +344,8 @@ ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison) ; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <4 x float> [[TMP4]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP5]], <4 x float> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP5]], <4 x float> [[VEC_PHI]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: @@ -398,8 +398,8 @@ ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison) ; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[TMP4]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP5]], <4 x float> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP5]], <4 x float> [[VEC_PHI]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll @@ -276,8 +276,8 @@ ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: @@ -347,8 +347,8 @@ ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP4:%.*]] = and <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: @@ -418,8 +418,8 @@ ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: @@ -489,8 +489,8 @@ ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: @@ -560,8 +560,8 @@ ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>* ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison) ; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <4 x float> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP4]], <4 x float> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP4]], <4 x float> [[VEC_PHI]] ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: @@ -631,8 +631,8 @@ ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>* ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison) ; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <4 x float> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP4]], <4 x float> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP4]], <4 x float> [[VEC_PHI]] ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll b/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll --- a/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll @@ -116,8 +116,8 @@ ; 
CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[NEXT_GEP]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i32 12 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996 ; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: for.body: ; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi i32* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ] @@ -555,8 +555,8 @@ ; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[NEXT_GEP]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996 ; CHECK-NEXT: [[PTR_IND]] = getelementptr float, float* [[POINTER_PHI]], i32 12 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996 ; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: for.body: ; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi float* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ] @@ -769,8 +769,8 @@ ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP6]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 9992 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i32 48 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 9992 ; CHECK-NEXT: br i1 [[TMP7]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void @@ -853,8 +853,8 @@ ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP14]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 9984 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i32 96 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 9984 ; CHECK-NEXT: br i1 [[TMP15]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void @@ -924,9 +924,9 @@ ; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> [[TMP5]], <4 x i8*> [[TMP7]], i32 1, <4 x i1> ), !alias.scope !31, !noalias !28 ; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> [[TMP6]], <4 x i8*> [[TMP8]], i32 1, <4 x i1> ), !alias.scope !31, !noalias !28 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i32 12 ; CHECK-NEXT: [[PTR_IND6]] = getelementptr i8, i8* [[POINTER_PHI5]], i32 12 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP9]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] ; CHECK: for.body: ; CHECK-NEXT: [[X_ADDR_050:%.*]] = phi i8* [ [[INCDEC_PTR2:%.*]], [[FOR_BODY]] ], [ [[X]], [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll --- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll +++ 
b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll @@ -25,22 +25,22 @@ ; AVX512-NEXT: iter.check: ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: -; AVX512-NEXT: [[INDEX7:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX7]] +; AVX512-NEXT: [[INDEX8:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX8]] ; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4 ; AVX512-NEXT: [[TMP2:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD]], zeroinitializer -; AVX512-NEXT: [[TMP3:%.*]] = getelementptr i32, i32* [[INDEX:%.*]], i64 [[INDEX7]] +; AVX512-NEXT: [[TMP3:%.*]] = getelementptr i32, i32* [[INDEX:%.*]], i64 [[INDEX8]] ; AVX512-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP4]], i32 4, <16 x i1> [[TMP2]], <16 x i32> poison) ; AVX512-NEXT: [[TMP5:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD]] to <16 x i64> ; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <16 x i64> [[TMP5]] ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP6]], i32 4, <16 x i1> [[TMP2]], <16 x float> undef) ; AVX512-NEXT: [[TMP7:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER]], -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr float, float* [[OUT:%.*]], i64 [[INDEX7]] +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr float, float* [[OUT:%.*]], i64 [[INDEX8]] ; AVX512-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <16 x float>* ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP7]], <16 x float>* [[TMP9]], i32 4, <16 x i1> [[TMP2]]) -; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX7]], 16 +; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX8]], 16 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]] ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4 @@ -55,7 +55,7 @@ ; AVX512-NEXT: [[TMP18:%.*]] = getelementptr float, float* [[OUT]], i64 [[INDEX_NEXT]] ; AVX512-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <16 x float>* ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP17]], <16 x float>* [[TMP19]], i32 4, <16 x i1> [[TMP12]]) -; AVX512-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX7]], 32 +; AVX512-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX8]], 32 ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_1]] ; AVX512-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD_2:%.*]] = load <16 x i32>, <16 x i32>* [[TMP21]], align 4 @@ -70,7 +70,7 @@ ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr float, float* [[OUT]], i64 [[INDEX_NEXT_1]] ; AVX512-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <16 x float>* ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP27]], <16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP22]]) -; AVX512-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX7]], 48 +; AVX512-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX8]], 48 ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr 
inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_2]] ; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD_3:%.*]] = load <16 x i32>, <16 x i32>* [[TMP31]], align 4 @@ -85,7 +85,7 @@ ; AVX512-NEXT: [[TMP38:%.*]] = getelementptr float, float* [[OUT]], i64 [[INDEX_NEXT_2]] ; AVX512-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <16 x float>* ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP37]], <16 x float>* [[TMP39]], i32 4, <16 x i1> [[TMP32]]) -; AVX512-NEXT: [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX7]], 64 +; AVX512-NEXT: [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX8]], 64 ; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT_3]], 4096 ; AVX512-NEXT: br i1 [[TMP40]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX512: for.end: @@ -95,8 +95,8 @@ ; FVW2-NEXT: entry: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: -; FVW2-NEXT: [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX7]] +; FVW2-NEXT: [[INDEX17:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX17]] ; FVW2-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>* ; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 ; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 @@ -112,7 +112,7 @@ ; FVW2-NEXT: [[TMP9:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD8]], zeroinitializer ; FVW2-NEXT: [[TMP10:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD9]], zeroinitializer ; FVW2-NEXT: [[TMP11:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD10]], zeroinitializer -; FVW2-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[INDEX:%.*]], i64 [[INDEX7]] +; FVW2-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[INDEX:%.*]], i64 [[INDEX17]] ; FVW2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>* ; FVW2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP13]], i32 4, <2 x i1> [[TMP8]], <2 x i32> poison) ; FVW2-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i64 2 @@ -140,7 +140,7 @@ ; FVW2-NEXT: [[TMP29:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], ; FVW2-NEXT: [[TMP30:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], ; FVW2-NEXT: [[TMP31:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], -; FVW2-NEXT: [[TMP32:%.*]] = getelementptr float, float* [[OUT:%.*]], i64 [[INDEX7]] +; FVW2-NEXT: [[TMP32:%.*]] = getelementptr float, float* [[OUT:%.*]], i64 [[INDEX17]] ; FVW2-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <2 x float>* ; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP28]], <2 x float>* [[TMP33]], i32 4, <2 x i1> [[TMP8]]) ; FVW2-NEXT: [[TMP34:%.*]] = getelementptr float, float* [[TMP32]], i64 2 @@ -152,7 +152,7 @@ ; FVW2-NEXT: [[TMP38:%.*]] = getelementptr float, float* [[TMP32]], i64 6 ; FVW2-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <2 x float>* ; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP31]], <2 x float>* [[TMP39]], i32 4, <2 x i1> [[TMP11]]) -; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 8 +; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX17]], 8 ; FVW2-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 ; FVW2-NEXT: br i1 [[TMP40]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; FVW2: for.end: 
@@ -365,9 +365,9 @@ ; FVW2-NEXT: entry: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: -; FVW2-NEXT: [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ] +; FVW2-NEXT: [[INDEX10:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ] ; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ] -; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX7]], 4 +; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX10]], 4 ; FVW2-NEXT: [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16 ; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]] ; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] @@ -395,7 +395,7 @@ ; FVW2-NEXT: store float [[TMP15]], float* [[TMP14]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE9]] ; FVW2: pred.store.continue9: -; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 2 +; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX10]], 2 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; FVW2-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] @@ -841,9 +841,9 @@ ; FVW2-NEXT: entry: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: -; FVW2-NEXT: [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ] +; FVW2-NEXT: [[INDEX10:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ] ; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ] -; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX7]], 4 +; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX10]], 4 ; FVW2-NEXT: [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16 ; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]] ; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] @@ -871,7 +871,7 @@ ; FVW2-NEXT: store float [[TMP15]], float addrspace(1)* [[TMP14]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE9]] ; FVW2: pred.store.continue9: -; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 2 +; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX10]], 2 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; FVW2-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -1072,9 +1072,9 @@ ; FVW2-NEXT: entry: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: -; FVW2-NEXT: [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ] +; FVW2-NEXT: [[INDEX10:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ] ; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ] -; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX7]], 4 +; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX10]], 4 ; FVW2-NEXT: [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16 ; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]] ; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] @@ -1102,7 +1102,7 @@ ; FVW2-NEXT: store float [[TMP15]], float* [[TMP14]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE9]] ; FVW2: 
pred.store.continue9: -; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 2 +; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX10]], 2 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; FVW2-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] @@ -1303,9 +1303,9 @@ ; FVW2-NEXT: entry: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: -; FVW2-NEXT: [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ] +; FVW2-NEXT: [[INDEX10:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ] ; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ] -; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX7]], 4 +; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX10]], 4 ; FVW2-NEXT: [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16 ; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]] ; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] @@ -1333,7 +1333,7 @@ ; FVW2-NEXT: store float [[TMP15]], float addrspace(1)* [[TMP14]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE9]] ; FVW2: pred.store.continue9: -; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 2 +; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX10]], 2 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; FVW2-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll --- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll @@ -142,19 +142,19 @@ ; CHECK: vector.ph10: ; CHECK-NEXT: [[N_RND_UP11:%.*]] = add nuw nsw i64 [[TMP19]], 4 ; CHECK-NEXT: [[N_VEC13:%.*]] = and i64 [[N_RND_UP11]], 8589934588 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <4 x i64> poison, i64 [[TMP19]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT21:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT20]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <4 x i64> poison, i64 [[TMP19]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT19:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT18]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY9:%.*]] ; CHECK: vector.body9: -; CHECK-NEXT: [[INDEX14:%.*]] = phi i64 [ 0, [[VECTOR_PH10]] ], [ [[INDEX_NEXT15:%.*]], [[PRED_STORE_CONTINUE37:%.*]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[I_0_LCSSA]], [[INDEX14]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX14]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT28]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[VEC_IV:%.*]] = or <4 x i64> [[BROADCAST_SPLAT29]], -; CHECK-NEXT: [[TMP20:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT21]] +; CHECK-NEXT: [[INDEX38:%.*]] = phi i64 [ 0, [[VECTOR_PH10]] ], [ [[INDEX_NEXT37:%.*]], [[PRED_STORE_CONTINUE36:%.*]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[I_0_LCSSA]], [[INDEX38]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX38]], i64 0 +; CHECK-NEXT: 
[[BROADCAST_SPLAT28:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT27]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or <4 x i64> [[BROADCAST_SPLAT28]], +; CHECK-NEXT: [[TMP20:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT19]] ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP20]], i64 0 -; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]] -; CHECK: pred.store.if30: +; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]] +; CHECK: pred.store.if29: ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[OFFSET_IDX]] @@ -162,11 +162,11 @@ ; CHECK-NEXT: [[TMP26:%.*]] = and i32 [[TMP25]], [[TMP23]] ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[OFFSET_IDX]] ; CHECK-NEXT: store i32 [[TMP26]], i32* [[TMP27]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE31]] -; CHECK: pred.store.continue31: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE30]] +; CHECK: pred.store.continue30: ; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP20]], i64 1 -; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]] -; CHECK: pred.store.if32: +; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32:%.*]] +; CHECK: pred.store.if31: ; CHECK-NEXT: [[TMP29:%.*]] = add i64 [[OFFSET_IDX]], 1 ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP29]] ; CHECK-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 @@ -175,11 +175,11 @@ ; CHECK-NEXT: [[TMP34:%.*]] = and i32 [[TMP33]], [[TMP31]] ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP29]] ; CHECK-NEXT: store i32 [[TMP34]], i32* [[TMP35]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE33]] -; CHECK: pred.store.continue33: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE32]] +; CHECK: pred.store.continue32: ; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i1> [[TMP20]], i64 2 -; CHECK-NEXT: br i1 [[TMP36]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35:%.*]] -; CHECK: pred.store.if34: +; CHECK-NEXT: br i1 [[TMP36]], label [[PRED_STORE_IF33:%.*]], label [[PRED_STORE_CONTINUE34:%.*]] +; CHECK: pred.store.if33: ; CHECK-NEXT: [[TMP37:%.*]] = add i64 [[OFFSET_IDX]], 2 ; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP37]] ; CHECK-NEXT: [[TMP39:%.*]] = load i32, i32* [[TMP38]], align 4 @@ -188,11 +188,11 @@ ; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], [[TMP39]] ; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP37]] ; CHECK-NEXT: store i32 [[TMP42]], i32* [[TMP43]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE35]] -; CHECK: pred.store.continue35: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE34]] +; CHECK: pred.store.continue34: ; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x i1> [[TMP20]], i64 3 -; CHECK-NEXT: br i1 [[TMP44]], label [[PRED_STORE_IF36:%.*]], label [[PRED_STORE_CONTINUE37]] -; CHECK: pred.store.if36: +; CHECK-NEXT: br i1 [[TMP44]], label [[PRED_STORE_IF35:%.*]], label [[PRED_STORE_CONTINUE36]] +; CHECK: pred.store.if35: ; CHECK-NEXT: 
[[TMP45:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP45]] ; CHECK-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 @@ -201,10 +201,10 @@ ; CHECK-NEXT: [[TMP50:%.*]] = and i32 [[TMP49]], [[TMP47]] ; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP45]] ; CHECK-NEXT: store i32 [[TMP50]], i32* [[TMP51]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE37]] -; CHECK: pred.store.continue37: -; CHECK-NEXT: [[INDEX_NEXT15]] = add i64 [[INDEX14]], 4 -; CHECK-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC13]] +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE36]] +; CHECK: pred.store.continue36: +; CHECK-NEXT: [[INDEX_NEXT37]] = add i64 [[INDEX38]], 4 +; CHECK-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT37]], [[N_VEC13]] ; CHECK-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK7:%.*]], label [[VECTOR_BODY9]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block7: ; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH8]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll --- a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll @@ -188,8 +188,8 @@ ; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP10]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison) ; CHECK-NEXT: [[TMP11:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP12]] = add <8 x i32> [[TMP11]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP12]], <8 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP12]], <8 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/discriminator.ll b/llvm/test/Transforms/LoopVectorize/discriminator.ll --- a/llvm/test/Transforms/LoopVectorize/discriminator.ll +++ b/llvm/test/Transforms/LoopVectorize/discriminator.ll @@ -47,8 +47,8 @@ ;LOOPUNROLL_5: discriminator: 21 ; When unrolling after loop vectorize, both vec_body and remainder loop ; are unrolled. 
-;LOOPVEC_UNROLL: discriminator: 385 ;LOOPVEC_UNROLL: discriminator: 9 +;LOOPVEC_UNROLL: discriminator: 385 ;DBG_VALUE: ![[DBG]] = {{.*}}, scope: ![[TOP]] !llvm.dbg.cu = !{!0} diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -13,6 +13,7 @@ ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: loop: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%0> = phi ir<0>, ir<%conv> ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next ; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv> vp<[[BTC]]> @@ -61,6 +62,7 @@ ; CHECK-EMPTY: ; CHECK-NEXT: loop.1.split: ; CHECK-NEXT: WIDEN ir<%add> = add ir<%conv>, vp<[[PRED2]]> +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * UF + vp<[[CAN_IV]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: No successors @@ -92,6 +94,7 @@ ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: loop: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%recur> = phi ir<0>, ir<%recur.next> ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next ; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv> vp<[[BTC]]> @@ -125,6 +128,7 @@ ; CHECK-NEXT: Successor(s): loop.1 ; CHECK-EMPTY: ; CHECK-NEXT: loop.1: +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * UF + vp<[[CAN_IV]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: No successors @@ -156,6 +160,7 @@ ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: loop: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%recur> = phi ir<0>, ir<%recur.next> ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next ; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%and.red> = phi ir<1234>, ir<%and.red.next> @@ -186,6 +191,7 @@ ; CHECK-NEXT: loop.0.split: ; CHECK-NEXT: WIDEN ir<%add> = add vp<[[PRED]]>, ir<%recur.next> ; CHECK-NEXT: WIDEN ir<%and.red.next> = and ir<%and.red>, ir<%add> +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * UF + vp<[[CAN_IV]]> ; CHECK-NEXT: EMIT vp<[[SEL:%.+]]> = select vp<[[MASK]]> ir<%and.red.next> ir<%and.red> ; CHECK-NEXT: No successors ; CHECK-NEXT: } @@ -221,6 +227,7 @@ ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: loop: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%0> = phi ir<0>, ir<%conv> ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next ; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv> vp<[[BTC]]> @@ -275,6 +282,7 @@ ; CHECK-NEXT: WIDEN ir<%add.1> = add ir<%conv>, vp<[[PRED1]]> ; CHECK-NEXT: WIDEN ir<%conv.lv.2> = sext vp<[[PRED2]]> ; CHECK-NEXT: WIDEN ir<%add> = add ir<%add.1>, ir<%conv.lv.2> +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * UF + vp<[[CAN_IV]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: No successors @@ -310,6 +318,7 @@ ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: loop: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%recur> = phi ir<0>, ir<%recur.next> ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next ; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv> vp<[[BTC]]> @@ -361,6 +370,7 @@ ; CHECK-NEXT: Successor(s): loop.2 ; CHECK-EMPTY: ; CHECK-NEXT: loop.2: +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * 
UF + vp<[[CAN_IV]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: No successors diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -5509,9 +5509,9 @@ ; UNROLL-NO-IC-NEXT: [[TMP45:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> [[TMP43]], <4 x i32> ; UNROLL-NO-IC-NEXT: [[TMP46]] = add <4 x i32> [[VEC_PHI]], [[TMP44]] ; UNROLL-NO-IC-NEXT: [[TMP47]] = add <4 x i32> [[VEC_PHI2]], [[TMP45]] +; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[TMP48:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI]] ; UNROLL-NO-IC-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI2]] -; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], ; UNROLL-NO-IC-NEXT: [[TMP50:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF52:![0-9]+]], !llvm.loop [[LOOP53:![0-9]+]] @@ -5578,9 +5578,9 @@ ; UNROLL-NO-VF-NEXT: [[TMP7]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP6]], [[PRED_UDIV_IF4]] ] ; UNROLL-NO-VF-NEXT: [[TMP8]] = add i32 [[VEC_PHI]], [[VECTOR_RECUR]] ; UNROLL-NO-VF-NEXT: [[TMP9]] = add i32 [[VEC_PHI2]], [[TMP5]] +; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = select i1 [[TMP2]], i32 [[TMP8]], i32 [[VEC_PHI]] ; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = select i1 [[TMP3]], i32 [[TMP9]], i32 [[VEC_PHI2]] -; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF51:![0-9]+]], !llvm.loop [[LOOP52:![0-9]+]] ; UNROLL-NO-VF: middle.block: @@ -5670,8 +5670,8 @@ ; SINK-AFTER-NEXT: [[TMP22]] = phi <4 x i32> [ [[TMP17]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP21]], [[PRED_UDIV_IF7]] ] ; SINK-AFTER-NEXT: [[TMP23:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP22]], <4 x i32> ; SINK-AFTER-NEXT: [[TMP24]] = add <4 x i32> [[VEC_PHI]], [[TMP23]] -; SINK-AFTER-NEXT: [[TMP25:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]] ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; SINK-AFTER-NEXT: [[TMP25:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]] ; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], ; SINK-AFTER-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; SINK-AFTER-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF52:![0-9]+]], !llvm.loop [[LOOP53:![0-9]+]] @@ -5764,8 +5764,8 @@ ; NO-SINK-AFTER-NEXT: [[TMP22]] = phi <4 x i32> [ [[TMP17]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP21]], [[PRED_UDIV_IF7]] ] ; NO-SINK-AFTER-NEXT: [[TMP23:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP22]], <4 x i32> ; NO-SINK-AFTER-NEXT: [[TMP24]] = add <4 x i32> [[VEC_PHI]], [[TMP23]] -; NO-SINK-AFTER-NEXT: [[TMP25:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]] ; NO-SINK-AFTER-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; NO-SINK-AFTER-NEXT: [[TMP25:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]] ; NO-SINK-AFTER-NEXT: 
[[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], ; NO-SINK-AFTER-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; NO-SINK-AFTER-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF52:![0-9]+]], !llvm.loop [[LOOP53:![0-9]+]] @@ -6277,9 +6277,9 @@ ; UNROLL-NO-IC-NEXT: store i32 [[TMP9]], i32* [[TMP71]], align 4 ; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE35]] ; UNROLL-NO-IC: pred.store.continue35: +; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[TMP72:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI]] ; UNROLL-NO-IC-NEXT: [[TMP73:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI7]] -; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT6]] = add <4 x i32> [[STEP_ADD4]], ; UNROLL-NO-IC-NEXT: [[TMP74:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -6364,9 +6364,9 @@ ; UNROLL-NO-VF-NEXT: store i32 [[INDUCTION2]], i32* [[TMP11]], align 4 ; UNROLL-NO-VF-NEXT: br label [[PRED_STORE_CONTINUE9]] ; UNROLL-NO-VF: pred.store.continue9: +; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = select i1 [[TMP2]], i32 [[TMP8]], i32 [[VEC_PHI]] ; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = select i1 [[TMP3]], i32 [[TMP9]], i32 [[VEC_PHI5]] -; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF51]], !llvm.loop [[LOOP55:![0-9]+]] ; UNROLL-NO-VF: middle.block: @@ -6491,8 +6491,8 @@ ; SINK-AFTER-NEXT: store i32 [[TMP5]], i32* [[TMP36]], align 4 ; SINK-AFTER-NEXT: br label [[PRED_STORE_CONTINUE15]] ; SINK-AFTER: pred.store.continue15: -; SINK-AFTER-NEXT: [[TMP37:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]] ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; SINK-AFTER-NEXT: [[TMP37:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]] ; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], ; SINK-AFTER-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], ; SINK-AFTER-NEXT: [[TMP38:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -6621,8 +6621,8 @@ ; NO-SINK-AFTER-NEXT: store i32 [[TMP5]], i32* [[TMP36]], align 4 ; NO-SINK-AFTER-NEXT: br label [[PRED_STORE_CONTINUE15]] ; NO-SINK-AFTER: pred.store.continue15: -; NO-SINK-AFTER-NEXT: [[TMP37:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]] ; NO-SINK-AFTER-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; NO-SINK-AFTER-NEXT: [[TMP37:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]] ; NO-SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], ; NO-SINK-AFTER-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], ; NO-SINK-AFTER-NEXT: [[TMP38:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll b/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll --- a/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll +++ b/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll @@ -39,9 +39,11 @@ ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: loop: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next ; CHECK-NEXT: WIDEN ir<%cond0> = icmp 
ir<%iv>, ir<13> ; CHECK-NEXT: WIDEN-SELECT ir<%s> = select ir<%cond0>, ir<10>, ir<20> +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * UF + vp<[[CAN_IV]]> ; CHECK-NEXT: No successor ; CHECK-NEXT: } define void @test() { diff --git a/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll b/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll --- a/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll +++ b/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll @@ -43,8 +43,8 @@ ; CHECK: loop.1.latch5: ; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[TMP8]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP9]], i32 0 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP9]], i32 0 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll --- a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll @@ -160,8 +160,8 @@ ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>* ; CHECK-NEXT: store <4 x i8> [[TMP9]], <4 x i8>* [[TMP10]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i64 4 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll --- a/llvm/test/Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll +++ b/llvm/test/Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll @@ -58,8 +58,8 @@ ; CHECK-NEXT: [[TMP23:%.*]] = phi <2 x i16> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP21]], [[PRED_LOAD_IF3]] ] ; CHECK-NEXT: [[TMP24:%.*]] = add nsw <2 x i16> [[TMP22]], [[TMP23]] ; CHECK-NEXT: [[TMP25]] = add <2 x i16> [[VEC_PHI]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = select <2 x i1> [[TMP1]], <2 x i16> [[TMP25]], <2 x i16> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP26:%.*]] = select <2 x i1> [[TMP1]], <2 x i16> [[TMP25]], <2 x i16> [[VEC_PHI]] ; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i32 [[INDEX_NEXT]], 42 ; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/reduction-order.ll b/llvm/test/Transforms/LoopVectorize/reduction-order.ll --- a/llvm/test/Transforms/LoopVectorize/reduction-order.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-order.ll @@ -10,7 +10,7 @@ ; CHECK: %[[VAR1:.*]] = add <4 x i32> , %vec.phi1 ; CHECK-NEXT: %[[VAR2:.*]] = add <4 x i32> %vec.phi, ; CHECK-NEXT: icmp ule <4 x i64> -; CHECK-NEXT: select <4 x i1> {{.*}}, <4 x i32> %[[VAR2]], <4 x i32> +; CHECK: select <4 x i1> {{.*}}, <4 x i32> %[[VAR2]], <4 x i32> 
; CHECK-NEXT: select <4 x i1> {{.*}}, <4 x i32> %[[VAR1]], <4 x i32> ; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body ; diff --git a/llvm/test/Transforms/LoopVectorize/select-reduction.ll b/llvm/test/Transforms/LoopVectorize/select-reduction.ll --- a/llvm/test/Transforms/LoopVectorize/select-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/select-reduction.ll @@ -36,8 +36,8 @@ ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[VEC_PHI]], ; CHECK-NEXT: [[TMP3]] = select <4 x i1> [[TMP2]], <4 x i32> [[VEC_PHI]], <4 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll --- a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll @@ -12,12 +12,14 @@ ; CHECK-NEXT: label="\ vector loop" ; CHECK-NEXT: N1 [label = ; CHECK-NEXT: "for.body:\l" + +; CHECK-NEXT: " EMIT vp\<[[CAN_IV:%.+]]\> = CANONICAL-INDUCTION\l" + ; CHECK-NEXT: " WIDEN-INDUCTION %iv = phi %iv.next, 0\l" + ; CHECK-NEXT: " CLONE ir\<%arrayidx\> = getelementptr ir\<%y\>, ir\<%iv\>\l" + ; CHECK-NEXT: " WIDEN ir\<%lv\> = load ir\<%arrayidx\>\l" + ; CHECK-NEXT: " WIDEN-CALL ir\<%call\> = call @llvm.sqrt.f32(ir\<%lv\>)\l" + ; CHECK-NEXT: " CLONE ir\<%arrayidx2\> = getelementptr ir\<%x\>, ir\<%iv\>\l" + ; CHECK-NEXT: " WIDEN store ir\<%arrayidx2\>, ir\<%call\>\l" + +; CHECK-NEXT: " EMIT vp\<{{.+}}\> = VF * UF +(nuw) vp\<[[CAN_IV]]\>\l" + ; CHECK-NEXT: "No successors\l" ; CHECK-NEXT: ] ; diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -11,12 +11,14 @@ ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: for.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %iv.next, 0 ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%y>, ir<%iv> ; CHECK-NEXT: WIDEN ir<%lv> = load ir<%arrayidx> ; CHECK-NEXT: WIDEN-CALL ir<%call> = call @llvm.sqrt.f32(ir<%lv>) ; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr ir<%x>, ir<%iv> ; CHECK-NEXT: WIDEN store ir<%arrayidx2>, ir<%call> +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * UF +(nuw) vp<[[CAN_IV]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: No successors @@ -46,6 +48,7 @@ ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: for.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %iv.next, 0 ; CHECK-NEXT: WIDEN-GEP Inv[Var] ir<%arrayidx> = getelementptr ir<%y>, ir<%iv> ; CHECK-NEXT: WIDEN ir<%lv> = load ir<%arrayidx> @@ -54,6 +57,7 @@ ; CHECK-NEXT: WIDEN ir<%add> = fadd ir<%lv>, ir<%sel> ; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr ir<%x>, ir<%iv> ; CHECK-NEXT: WIDEN store ir<%arrayidx2>, ir<%add> +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * UF 
+(nuw) vp<[[CAN_IV]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: No successors @@ -85,11 +89,13 @@ ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: for.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %iv.next, 0 ; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0.000000e+00>, ir<%red.next> ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%y>, ir<%iv> ; CHECK-NEXT: WIDEN ir<%lv> = load ir<%arrayidx> ; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + fast reduce.fadd (ir<%lv>) +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * UF +(nuw) vp<[[CAN_IV]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: No successors @@ -117,6 +123,7 @@ ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: for.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: WIDEN-INDUCTION %i = phi 0, %i.next ; CHECK-NEXT: WIDEN ir<%cmp> = icmp ir<%i>, ir<5> ; CHECK-NEXT: Successor(s): if.then @@ -148,6 +155,7 @@ ; CHECK-NEXT: BLEND %d = ir<0>/vp<[[NOT]]> vp<[[PRED]]>/ir<%cmp> ; CHECK-NEXT: CLONE ir<%idx> = getelementptr ir<%x>, ir<%i> ; CHECK-NEXT: WIDEN store ir<%idx>, ir<%d> +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * UF +(nuw) vp<[[CAN_IV]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: No successors @@ -185,6 +193,7 @@ ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: for.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next ; CHECK-NEXT: CLONE ir<%gep.AB.0> = getelementptr ir<@AB>, ir<0>, ir<%iv> ; CHECK-NEXT: INTERLEAVE-GROUP with factor 4 at %AB.0, ir<%gep.AB.0> @@ -206,6 +215,7 @@ ; CHECK-NEXT: store ir<1> to index 1 ; CHECK-NEXT: store ir<2> to index 2 ; CHECK-NEXT: store ir<%AB.3> to index 3 +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * UF +(nuw) vp<[[CAN_IV]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: No successors @@ -247,6 +257,7 @@ ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: for.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next ; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%sum.07> = phi ir<0.000000e+00>, ir<%muladd> ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%a>, ir<%iv> @@ -255,6 +266,7 @@ ; CHECK-NEXT: WIDEN ir<%l.b> = load ir<%arrayidx2> ; CHECK-NEXT: EMIT vp<[[FMUL:%.]]> = fmul nnan ninf nsz ir<%l.a> ir<%l.b> ; CHECK-NEXT: REDUCE ir<[[MULADD:%.+]]> = ir<%sum.07> + nnan ninf nsz reduce.fadd (vp<[[FMUL]]>) +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * UF +(nuw) vp<[[CAN_IV]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } @@ -282,6 +294,7 @@ ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: loop: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next ; CHECK-NEXT: CLONE ir<%isd> = getelementptr ir<%asd>, ir<%iv> ; CHECK-NEXT: WIDEN ir<%lsd> = load ir<%isd> @@ -323,6 +336,7 @@ ; CHECK-NEXT: EMIT vp<[[SEL2:%.+]]> = select vp<[[NOT1]]> vp<[[NOT2]]> ir ; CHECK-NEXT: BLEND %ysd.0 = vp<[[PHI]]>/vp<[[OR1]]> ir<%psd>/vp<[[SEL2]]> ; CHECK-NEXT: WIDEN store ir<%isd>, ir<%ysd.0> +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * UF +(nuw) vp<[[CAN_IV]]> ; CHECK-NEXT: No successors ; CHECK-NEXT:} ; CHECK-NEXT:No successors diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll 
b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll @@ -10,6 +10,7 @@ ; CHECK: VPlan 'Initial VPlan for VF={1},UF>=1' { ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: for.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: WIDEN-INDUCTION %tmp0 = phi %tmp6, 0 ; CHECK-NEXT: WIDEN-INDUCTION %tmp1 = phi %tmp7, 0 ; CHECK-NEXT: CLONE ir<%tmp2> = getelementptr ir<%ptr>, ir<%tmp0> @@ -40,6 +41,7 @@ ; CHECK-NEXT: Successor(s): for.inc ; CHECK: for.inc: +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * UF +(nuw) vp<[[CAN_IV]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: No successors diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -15,6 +15,7 @@ ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: loop: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next ; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv> vp<[[BTC]]> ; CHECK-NEXT: Successor(s): loop.0 @@ -45,6 +46,7 @@ ; CHECK: loop.1: ; CHECK-NEXT: CLONE ir<%large> = icmp ir<%iv>, ir<8> ; CHECK-NEXT: CLONE ir<%exitcond> = icmp ir<%iv>, ir<%k> +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * UF + vp<[[CAN_IV]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -76,6 +78,7 @@ ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: loop: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next ; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv> vp<[[BTC]]> ; CHECK-NEXT: Successor(s): pred.load @@ -119,6 +122,7 @@ ; CHECK: loop.1: ; CHECK-NEXT: CLONE ir<%large> = icmp ir<%iv>, ir<8> ; CHECK-NEXT: CLONE ir<%exitcond> = icmp ir<%iv>, ir<%k> +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * UF + vp<[[CAN_IV]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -150,6 +154,7 @@ ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: loop: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next ; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv> vp<[[BTC]]> ; CHECK-NEXT: Successor(s): pred.load @@ -193,6 +198,7 @@ ; CHECK: loop.1: ; CHECK-NEXT: CLONE ir<%large> = icmp ir<%iv>, ir<8> ; CHECK-NEXT: CLONE ir<%exitcond> = icmp ir<%iv>, ir<%k> +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * UF + vp<[[CAN_IV]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -226,9 +232,10 @@ ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: loop: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 21, %iv.next -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = WIDEN-CANONICAL-INDUCTION -; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule vp<[[CAN_IV]]> vp<[[BTC]]> +; CHECK-NEXT: EMIT vp<[[WIDE_CAN_IV:%.+]]> = WIDEN-CANONICAL-INDUCTION +; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule vp<[[WIDE_CAN_IV]]> vp<[[BTC]]> ; CHECK-NEXT: CLONE ir<%gep.A.uniform> = getelementptr ir<%A>, ir<0> ; CHECK-NEXT: Successor(s): pred.load ; CHECK-EMPTY: @@ -277,6 +284,7 @@ ; CHECK-NEXT: Successor(s): loop.latch ; CHECK-EMPTY: ; CHECK-NEXT: loop.latch: +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * UF + vp<[[CAN_IV]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ 
-311,6 +319,7 @@ ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: loop: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next ; CHECK-NEXT: WIDEN ir<%c.1> = icmp ir<%iv>, ir<%j> ; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%iv>, ir<10> @@ -367,6 +376,7 @@ ; CHECK-NEXT: next.0.0: ; CHECK-NEXT: CLONE ir<%large> = icmp ir<%iv>, ir<8> ; CHECK-NEXT: CLONE ir<%exitcond> = icmp ir<%iv>, ir<%k> +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * UF + vp<[[CAN_IV]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -407,6 +417,7 @@ ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: loop: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next ; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%iv>, ir<10> ; CHECK-NEXT: WIDEN ir<%c.0> = icmp ir<%iv>, ir<%j> @@ -471,6 +482,7 @@ ; CHECK-NEXT: next.1: ; CHECK-NEXT: CLONE ir<%large> = icmp ir<%iv>, ir<8> ; CHECK-NEXT: CLONE ir<%exitcond> = icmp ir<%iv>, ir<%k> +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * UF + vp<[[CAN_IV]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -518,6 +530,7 @@ ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: loop: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next ; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%iv>, ir<10> ; CHECK-NEXT: WIDEN ir<%c.0> = icmp ir<%iv>, ir<%j> @@ -580,6 +593,7 @@ ; CHECK-NEXT: next.1: ; CHECK-NEXT: CLONE ir<%large> = icmp ir<%iv>, ir<8> ; CHECK-NEXT: CLONE ir<%exitcond> = icmp ir<%iv>, ir<%k> +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * UF + vp<[[CAN_IV]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -623,6 +637,7 @@ ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: loop: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next ; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv> vp<[[BTC]]> ; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%iv> @@ -690,6 +705,7 @@ ; CHECK-NEXT: latch: ; CHECK-NEXT: CLONE ir<%large> = icmp ir<%iv>, ir<8> ; CHECK-NEXT: CLONE ir<%exitcond> = icmp ir<%iv>, ir<%k> +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * UF + vp<[[CAN_IV]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -733,6 +749,7 @@ ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: loop: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next ; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv> vp<[[BTC]]> ; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%iv> @@ -766,6 +783,7 @@ ; CHECK-NEXT: loop.2: ; CHECK-NEXT: CLONE ir<%large> = icmp ir<%iv>, ir<8> ; CHECK-NEXT: CLONE ir<%exitcond> = icmp ir<%iv>, ir<%k> +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * UF + vp<[[CAN_IV]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -795,6 +813,7 @@ ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: loop: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next ; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%for> = phi ir<0>, ir<%lv.a> ; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv> vp<[[BTC]]> @@ -844,6 +863,7 @@ ; CHECK-NEXT: loop.2: ; CHECK-NEXT: CLONE ir<%large> = icmp ir<%iv>, ir<8> ; CHECK-NEXT: CLONE ir<%exitcond> = icmp ir<%iv>, ir<%k> +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * UF + vp<[[CAN_IV]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -872,6 +892,7 @@ ; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { ; CHECK-NEXT: vector loop: { ; 
CHECK-NEXT: loop.header: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next ; CHECK-NEXT: Successor(s): loop.then ; CHECK-EMPTY: @@ -906,6 +927,7 @@ ; CHECK-NEXT: Successor(s): loop.latch ; CHECK-EMPTY: ; CHECK-NEXT: loop.latch: +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * UF +(nuw) vp<[[CAN_IV]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -939,6 +961,7 @@ ; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: loop.header: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next ; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%addr>, ir<%iv> ; CHECK-NEXT: Successor(s): loop.body @@ -972,6 +995,7 @@ ; CHECK-NEXT: Successor(s): loop.latch ; CHECK-EMPTY: ; CHECK-NEXT: loop.latch: +; CHECK-NEXT: EMIT vp<{{.+}}> = VF * UF +(nuw) vp<[[CAN_IV]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; diff --git a/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll b/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll --- a/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll @@ -42,8 +42,8 @@ ; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> %[[REDUCTION]], <4 x double*> %[[C_PTR]], i32 8, <4 x i1> ) ; CHECK-NEXT: %[[VEC_INDEX_NEXT:.*]] = add nuw nsw <4 x i64> %[[VEC_INDEX]], ; CHECK-NEXT: %[[VEC_PTR:.*]] = icmp eq <4 x i64> %[[VEC_INDEX_NEXT]], -; CHECK-NEXT: %{{.*}} = extractelement <4 x i1> %[[VEC_PTR]], i32 0 ; CHECK-NEXT: %[[FOR1_INDEX_NEXT:.*]] = add nuw i64 %[[FOR1_INDEX]], 4 +; CHECK-NEXT: %{{.*}} = extractelement <4 x i1> %[[VEC_PTR]], i32 0 ; CHECK-NEXT: %{{.*}} = add <4 x i64> %[[VEC_INDEX]], ; CHECK-NEXT: %[[EXIT_COND:.*]] = icmp eq i64 %[[FOR1_INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 %[[EXIT_COND]], label %{{.*}}, label %vector.body diff --git a/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll b/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll --- a/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll @@ -32,8 +32,8 @@ ; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> %[[REDUCTION]], <4 x double*> %[[C_PTR]], i32 8, <4 x i1> ) ; CHECK-NEXT: %[[VEC_INDEX_NEXT:.*]] = add nuw nsw <4 x i64> %[[VEC_INDEX]], ; CHECK-NEXT: %[[VEC_PTR:.*]] = icmp eq <4 x i64> %[[VEC_INDEX_NEXT]], -; CHECK-NEXT: %{{.*}} = extractelement <4 x i1> %[[VEC_PTR]], i32 0 ; CHECK-NEXT: %[[FOR1_INDEX_NEXT:.*]] = add nuw i64 %[[FOR1_INDEX]], 4 +; CHECK-NEXT: %{{.*}} = extractelement <4 x i1> %[[VEC_PTR]], i32 0 ; CHECK-NEXT: %{{.*}} = add <4 x i64> %[[VEC_INDEX]], ; CHECK-NEXT: %[[EXIT_COND:.*]] = icmp eq i64 %[[FOR1_INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 %[[EXIT_COND]], label %{{.*}}, label %vector.body