diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -605,8 +605,15 @@ /// represented as. void truncateToMinimalBitwidths(VPTransformState &State); - /// Returns (and creates if needed) the original loop trip count. - Value *getOrCreateTripCount(BasicBlock *InsertBlock); + /// Returns the original loop trip count. + Value *getTripCount() const { + assert(TripCount && "trip count must be set before being accessed"); + return TripCount; + } + + /// Used to set the trip count after ILV's construction and after the + /// preheader block has been executed. + void setTripCount(Value *TC) { TripCount = TC; } /// Returns (and creates if needed) the trip count of the widened loop. Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); @@ -2869,41 +2876,12 @@ PredicatedInstructions.push_back(Cloned); } -Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) { - if (TripCount) - return TripCount; - - assert(InsertBlock); - IRBuilder<> Builder(InsertBlock->getTerminator()); - // Find the loop boundaries. - Type *IdxTy = Legal->getWidestInductionType(); - assert(IdxTy && "No type for induction"); - const SCEV *ExitCount = createTripCountSCEV(IdxTy, PSE, OrigLoop); - - const DataLayout &DL = InsertBlock->getModule()->getDataLayout(); - - // Expand the trip count and place the new instructions in the preheader. - // Notice that the pre-header does not change, only the loop body. - SCEVExpander Exp(*PSE.getSE(), DL, "induction"); - - // Count holds the overall loop count (N). 
- TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), - InsertBlock->getTerminator()); - - if (TripCount->getType()->isPointerTy()) - TripCount = - CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", - InsertBlock->getTerminator()); - - return TripCount; -} - Value * InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { if (VectorTripCount) return VectorTripCount; - Value *TC = getOrCreateTripCount(InsertBlock); + Value *TC = getTripCount(); IRBuilder<> Builder(InsertBlock->getTerminator()); Type *Ty = TC->getType(); @@ -2981,7 +2959,7 @@ } void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { - Value *Count = getOrCreateTripCount(LoopVectorPreHeader); + Value *Count = getTripCount(); // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. BasicBlock *const TCCheckBlock = LoopVectorPreHeader; @@ -3241,7 +3219,7 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { // The trip counts should be cached by now. - Value *Count = getOrCreateTripCount(LoopVectorPreHeader); + Value *Count = getTripCount(); Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); @@ -3281,8 +3259,8 @@ the vectorized instructions while the old loop will continue to run the scalar remainder. - [ ] <-- loop iteration number check. - / | + [ ] <-- old preheader - loop iteration number check and SCEVs in Plans + / | preheader are expanded here. / v | [ ] <-- vector loop bypass (may consist of multiple blocks). | / | @@ -7702,23 +7680,23 @@ LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF << '\n'); - // Workaround! Compute the trip count of the original loop and cache it - // before we start modifying the CFG. 
This code has a systemic problem - // wherein it tries to run analysis over partially constructed IR; this is - // wrong, and not simply for SCEV. The trip count of the original loop - // simply happens to be prone to hitting this in practice. In theory, we - // can hit the same issue for any SCEV, or ValueTracking query done during - // mutation. See PR49900. - ILV.getOrCreateTripCount(OrigLoop->getLoopPreheader()); - if (!IsEpilogueVectorization) VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); // Perform the actual loop transformation. + VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; + + // 0. Generate SCEV-dependent code into the preheader, including TripCount, + // before making any changes to the CFG. + if (!BestVPlan.getPreheader()->empty()) { + State.CFG.PrevBB = OrigLoop->getLoopPreheader(); + State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator()); + BestVPlan.getPreheader()->execute(&State); + ILV.setTripCount(State.get(BestVPlan.getTripCount(), 0)); + } // 1. Set up the skeleton for vectorization, including vector pre-header and // middle block. The vector loop is created during VPlan execution. - VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; Value *CanonicalIVStartValue; std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = ILV.createVectorizedLoopSkeleton(); @@ -7754,10 +7732,9 @@ //===------------------------------------------------===// // 2. Copy and widen instructions from the old loop into the new loop. 
- BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), - ILV.getOrCreateVectorTripCount(nullptr), - CanonicalIVStartValue, State, - IsEpilogueVectorization); + BestVPlan.prepareToExecute( + ILV.getTripCount(), ILV.getOrCreateVectorTripCount(nullptr), + CanonicalIVStartValue, State, IsEpilogueVectorization); BestVPlan.execute(&State); @@ -7872,7 +7849,7 @@ assert(Bypass && "Expected valid bypass basic block."); ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; - Value *Count = getOrCreateTripCount(LoopVectorPreHeader); + Value *Count = getTripCount(); // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. BasicBlock *const TCCheckBlock = LoopVectorPreHeader; @@ -8191,7 +8168,7 @@ VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); if (useActiveLaneMask(TFStyle)) { - VPValue *TC = Plan.getOrCreateTripCount(); + VPValue *TC = Plan.getTripCount(); BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}, nullptr, "active.lane.mask"); } else { @@ -8274,8 +8251,8 @@ assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && "step must be loop invariant"); - VPValue *Step = - vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); + VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, Plan.getEntry(), + IndDesc.getStep(), SE); if (auto *TruncI = dyn_cast(PhiOrTrunc)) { return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI); } @@ -8294,8 +8271,8 @@ // Check if this is pointer induction. If so, build the recipe for it. 
if (auto *II = Legal->getPointerInductionDescriptor(Phi)) { - VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(), - *PSE.getSE()); + VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr( + Plan, Plan.getEntry(), II->getStep(), *PSE.getSE()); return new VPWidenPointerInductionRecipe( Phi, Operands[0], Step, *II, LoopVectorizationPlanner::getDecisionAndClampRange( @@ -8768,7 +8745,7 @@ VecPreheader->appendRecipe(CanonicalIVIncrementParts); // Create the ActiveLaneMask instruction using the correct start values. - VPValue *TC = Plan.getOrCreateTripCount(); + VPValue *TC = Plan.getTripCount(); VPValue *TripCount, *IncrementValue; if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { @@ -8857,6 +8834,20 @@ } } +static VPlanPtr createInitialVPlan(Type *InductionTy, + PredicatedScalarEvolution &PSE, Loop *L) { + VPBasicBlock *Preheader = new VPBasicBlock("ph"); + assert(InductionTy && "No type for induction"); + + const SCEV *ExitCount = createTripCountSCEV(InductionTy, PSE, L); + VPExpandSCEVRecipe *TripCount = + new VPExpandSCEVRecipe(ExitCount, *PSE.getSE()); + Preheader->appendRecipe(TripCount); + + VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph"); + return std::make_unique(Preheader, TripCount, VecPreheader); +} + std::optional LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( VFRange &Range, SmallPtrSetImpl &DeadInstructions) { @@ -8910,17 +8901,18 @@ // visit each basic block after having visited its predecessor basic blocks. // --------------------------------------------------------------------------- - // Create initial VPlan skeleton, starting with a block for the pre-header, - // followed by a region for the vector loop, followed by the middle block. The - // skeleton vector loop region contains a header and latch block. 
- VPBasicBlock *Preheader = new VPBasicBlock("vector.ph"); - auto Plan = std::make_unique(Preheader); - + // Create initial VPlan skeleton, having a basic block for the pre-header + // which contains SCEV expansions that need to happen before the CFG is + // modified; a basic block for the vector pre-header, followed by a region for + // the vector loop, followed by the middle basic block. The skeleton vector + // loop region contains a header and latch basic blocks. + VPlanPtr Plan = + createInitialVPlan(Legal->getWidestInductionType(), PSE, OrigLoop); VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); - VPBlockUtils::insertBlockAfter(TopRegion, Preheader); + VPBlockUtils::insertBlockAfter(TopRegion, Plan->getEntry()); VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); @@ -9108,7 +9100,8 @@ assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); // Create new empty VPlan - auto Plan = std::make_unique(); + auto Plan = + createInitialVPlan(Legal->getWidestInductionType(), PSE, OrigLoop); // Build hierarchical CFG VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); @@ -10415,6 +10408,13 @@ Checks); VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); + // When vectorizing the epilogue loop, re-use the cached trip count in + // ILV from the main vector loop. 
+ for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) { + assert(R.getVPSingleValue()->getNumUsers() == 0 && + "values not expected to be used in epilogue plan"); + R.eraseFromParent(); + } VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); Header->setName("vec.epilog.vector.body"); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2218,6 +2218,10 @@ /// preheader of the vector loop. VPBasicBlock *Entry; + /// VPBasicBlock corresponding to the original preheader. Used to place + /// VPExpandSCEV recipes for expressions used during skeleton creation. + VPBasicBlock *Preheader; + /// Holds the VFs applicable to this VPlan. SmallSetVector VFs; @@ -2260,9 +2264,15 @@ DenseMap SCEVToExpansion; public: - VPlan(VPBasicBlock *Entry = nullptr) : Entry(Entry) { - if (Entry) - Entry->setPlan(this); + /// Construct an empty VPlan with null entry and preheader blocks. + VPlan() : Entry(nullptr), Preheader(nullptr) {} + + /// Construct a VPlan with original preheader \p Preheader, \p TripCount and + /// \p Entry to the plan. + VPlan(VPBasicBlock *Preheader, VPValue *TripCount, VPBasicBlock *Entry) + : Entry(Entry), Preheader(Preheader), TripCount(TripCount) { + Entry->setPlan(this); + Preheader->setPlan(this); } ~VPlan(); @@ -2285,9 +2295,8 @@ } /// The trip count of the original loop. - VPValue *getOrCreateTripCount() { - if (!TripCount) - TripCount = new VPValue(); + VPValue *getTripCount() const { + assert(TripCount && "trip count needs to be set before accessing it"); return TripCount; } @@ -2435,6 +2444,10 @@ SCEVToExpansion[S] = V; } + /// \return The block corresponding to the original preheader. 
+ VPBasicBlock *getPreheader() { return Preheader; } + const VPBasicBlock *getPreheader() const { return Preheader; } + private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. @@ -2743,8 +2756,8 @@ /// value. Otherwise return a VPExpandSCEVRecipe to expand \p Expr. If \p Plan's /// pre-header already contains a recipe expanding \p Expr, return it. If not, /// create a new one. -VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, - ScalarEvolution &SE); +VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, VPBasicBlock *Block, + const SCEV *Expr, ScalarEvolution &SE); /// Returns true if \p VPV is uniform after vectorization. inline bool isUniformAfterVectorization(VPValue *VPV) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -163,8 +163,9 @@ } void VPBlockBase::setPlan(VPlan *ParentPlan) { - assert(ParentPlan->getEntry() == this && - "Can only set plan on its entry block."); + assert( + (ParentPlan->getEntry() == this || ParentPlan->getPreheader() == this) && + "Can only set plan on its entry or preheader block."); Plan = ParentPlan; } @@ -593,8 +594,6 @@ } for (VPValue *VPV : VPLiveInsToFree) delete VPV; - if (TripCount) - delete TripCount; if (BackedgeTakenCount) delete BackedgeTakenCount; } @@ -612,13 +611,6 @@ Value *CanonicalIVStartValue, VPTransformState &State, bool IsEpilogueVectorization) { - - // Check if the trip count is needed, and if so build it. - if (TripCount && TripCount->getNumUsers()) { - for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) - State.set(TripCount, TripCountV, Part); - } - // Check if the backedge taken count is needed, and if so build it. 
if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) { IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); @@ -1106,11 +1098,13 @@ } void VPSlotTracker::assignSlots(const VPlan &Plan) { + for (const VPRecipeBase &Recipe : *Plan.getPreheader()) + for (VPValue *Def : Recipe.definedValues()) + assignSlot(Def); + assignSlot(&Plan.VectorTripCount); if (Plan.BackedgeTakenCount) assignSlot(Plan.BackedgeTakenCount); - if (Plan.TripCount) - assignSlot(Plan.TripCount); ReversePostOrderTraversal> RPOT(VPBlockDeepTraversalWrapper(Plan.getEntry())); @@ -1126,7 +1120,9 @@ [Def](VPUser *U) { return U->onlyFirstLaneUsed(Def); }); } -VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, +VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, + VPBasicBlock *Block, + const SCEV *Expr, ScalarEvolution &SE) { if (auto *Expanded = Plan.getSCEVExpansion(Expr)) return Expanded; @@ -1136,9 +1132,8 @@ else if (auto *E = dyn_cast(Expr)) Expanded = Plan.getVPValueOrAddLiveIn(E->getValue()); else { - VPBasicBlock *Preheader = Plan.getEntry(); Expanded = new VPExpandSCEVRecipe(Expr, SE); - Preheader->appendRecipe(Expanded->getDefiningRecipe()); + Block->appendRecipe(Expanded->getDefiningRecipe()); } Plan.addSCEVExpansion(Expr, Expanded); return Expanded; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -53,8 +53,8 @@ auto *Phi = cast(VPPhi->getUnderlyingValue()); if (const auto *II = GetIntOrFpInductionDescriptor(Phi)) { VPValue *Start = Plan->getVPValueOrAddLiveIn(II->getStartValue()); - VPValue *Step = - vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE); + VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr( + *Plan, Plan->getEntry(), II->getStep(), SE); NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II); } else { Plan->addVPValue(Phi, 
VPPhi); @@ -521,8 +521,8 @@ if (Instruction *TruncI = WideIV->getTruncInst()) ResultTy = TruncI->getType(); const InductionDescriptor &ID = WideIV->getInductionDescriptor(); - VPValue *Step = - vputils::getOrCreateVPValueForSCEVExpr(Plan, ID.getStep(), SE); + VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr( + Plan, Plan.getEntry(), ID.getStep(), SE); VPValue *BaseIV = CanonicalIV; if (!CanonicalIV->isCanonical(ID.getKind(), WideIV->getStartValue(), Step, ResultTy)) { diff --git a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll --- a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll @@ -87,7 +87,7 @@ ; CHECK-NEXT: [[INDEX_NEXT16]] = add nuw i64 [[INDEX9]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT14]] = add <8 x i32> [[VEC_IND13]], ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT16]], [[N_VEC4]] -; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[CMP_N8:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC4]] ; CHECK-NEXT: br i1 [[CMP_N8]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -895,15 +895,13 @@ ; UNROLL-NO-IC-NEXT: entry: ; UNROLL-NO-IC-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] ; UNROLL-NO-IC: for.cond1.preheader: -; UNROLL-NO-IC-NEXT: [[INDVAR:%.*]] = phi i32 [ [[INDVAR_NEXT:%.*]], [[FOR_COND_CLEANUP3:%.*]] ], [ 0, [[ENTRY:%.*]] ] -; 
UNROLL-NO-IC-NEXT: [[I_016:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[INC:%.*]], [[FOR_COND_CLEANUP3]] ] +; UNROLL-NO-IC-NEXT: [[I_016:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] ; UNROLL-NO-IC-NEXT: [[E_015:%.*]] = phi i32 [ poison, [[ENTRY]] ], [ [[E_1_LCSSA:%.*]], [[FOR_COND_CLEANUP3]] ] -; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add i32 [[INDVAR]], 1 -; UNROLL-NO-IC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 8 +; UNROLL-NO-IC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[I_016]], 8 ; UNROLL-NO-IC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; UNROLL-NO-IC: vector.ph: -; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 8 -; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]] +; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[I_016]], 8 +; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]] ; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[E_015]], i32 3 ; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[I_016]], i64 0 @@ -915,14 +913,14 @@ ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[STEP_ADD:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[STEP_ADD]] = add <4 x i32> [[VEC_IND]], -; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> -; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[VEC_IND]], <4 x i32> [[STEP_ADD]], <4 x i32> +; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> +; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VEC_IND]], <4 x i32> [[STEP_ADD]], <4 x i32> ; UNROLL-NO-IC-NEXT: 
[[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], -; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-IC-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; UNROLL-NO-IC: middle.block: -; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 3 ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 2 ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]] @@ -943,38 +941,35 @@ ; UNROLL-NO-IC-NEXT: [[E_1_LCSSA]] = phi i32 [ [[SCALAR_RECUR]], [[FOR_COND1]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: [[INC]] = add nuw nsw i32 [[I_016]], 1 ; UNROLL-NO-IC-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49 -; UNROLL-NO-IC-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1 ; UNROLL-NO-IC-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]] ; ; UNROLL-NO-VF-LABEL: @PR27246( ; UNROLL-NO-VF-NEXT: entry: ; UNROLL-NO-VF-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] ; UNROLL-NO-VF: for.cond1.preheader: -; UNROLL-NO-VF-NEXT: [[INDVAR:%.*]] = phi i32 [ [[INDVAR_NEXT:%.*]], [[FOR_COND_CLEANUP3:%.*]] ], [ 0, [[ENTRY:%.*]] ] -; UNROLL-NO-VF-NEXT: [[I_016:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[INC:%.*]], [[FOR_COND_CLEANUP3]] ] +; UNROLL-NO-VF-NEXT: [[I_016:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] ; UNROLL-NO-VF-NEXT: [[E_015:%.*]] = phi i32 [ poison, [[ENTRY]] ], [ [[E_1_LCSSA:%.*]], 
[[FOR_COND_CLEANUP3]] ] -; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i32 [[INDVAR]], 1 -; UNROLL-NO-VF-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 2 +; UNROLL-NO-VF-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[I_016]], 2 ; UNROLL-NO-VF-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; UNROLL-NO-VF: vector.ph: -; UNROLL-NO-VF-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 2 -; UNROLL-NO-VF-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]] +; UNROLL-NO-VF-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[I_016]], 2 +; UNROLL-NO-VF-NEXT: [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]] ; UNROLL-NO-VF-NEXT: [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: ; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ [[E_015]], [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ [[E_015]], [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[I_016]], [[INDEX]] -; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 0 -; UNROLL-NO-VF-NEXT: [[TMP2]] = add i32 [[OFFSET_IDX]], -1 +; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP1]] = add i32 [[OFFSET_IDX]], -1 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; UNROLL-NO-VF: middle.block: -; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: 
[[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[E_015]], [[FOR_COND1_PREHEADER]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[E_015]], [[FOR_COND1_PREHEADER]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ] ; UNROLL-NO-VF-NEXT: br label [[FOR_COND1:%.*]] ; UNROLL-NO-VF: for.cond.cleanup: @@ -987,25 +982,22 @@ ; UNROLL-NO-VF-NEXT: [[DEC]] = add nsw i32 [[K_0]], -1 ; UNROLL-NO-VF-NEXT: br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]] ; UNROLL-NO-VF: for.cond.cleanup3: -; UNROLL-NO-VF-NEXT: [[E_1_LCSSA]] = phi i32 [ [[SCALAR_RECUR]], [[FOR_COND1]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[E_1_LCSSA]] = phi i32 [ [[SCALAR_RECUR]], [[FOR_COND1]] ], [ [[TMP0]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: [[INC]] = add nuw nsw i32 [[I_016]], 1 ; UNROLL-NO-VF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49 -; UNROLL-NO-VF-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1 ; UNROLL-NO-VF-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]] ; ; SINK-AFTER-LABEL: @PR27246( ; SINK-AFTER-NEXT: entry: ; SINK-AFTER-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] ; SINK-AFTER: for.cond1.preheader: -; SINK-AFTER-NEXT: [[INDVAR:%.*]] = phi i32 [ [[INDVAR_NEXT:%.*]], [[FOR_COND_CLEANUP3:%.*]] ], [ 0, [[ENTRY:%.*]] ] -; SINK-AFTER-NEXT: [[I_016:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[INC:%.*]], [[FOR_COND_CLEANUP3]] ] +; SINK-AFTER-NEXT: [[I_016:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] ; SINK-AFTER-NEXT: [[E_015:%.*]] = phi i32 [ poison, [[ENTRY]] ], [ [[E_1_LCSSA:%.*]], [[FOR_COND_CLEANUP3]] ] -; SINK-AFTER-NEXT: [[TMP0:%.*]] = 
add i32 [[INDVAR]], 1 -; SINK-AFTER-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 4 +; SINK-AFTER-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[I_016]], 4 ; SINK-AFTER-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SINK-AFTER: vector.ph: -; SINK-AFTER-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 4 -; SINK-AFTER-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]] +; SINK-AFTER-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[I_016]], 4 +; SINK-AFTER-NEXT: [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]] ; SINK-AFTER-NEXT: [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]] ; SINK-AFTER-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[E_015]], i32 3 ; SINK-AFTER-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[I_016]], i64 0 @@ -1016,13 +1008,13 @@ ; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[VEC_IND:%.*]], [[VECTOR_BODY]] ] ; SINK-AFTER-NEXT: [[VEC_IND]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; SINK-AFTER-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> +; SINK-AFTER-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], -; SINK-AFTER-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; SINK-AFTER-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SINK-AFTER-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; SINK-AFTER-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; SINK-AFTER: middle.block: -; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i32 
[[TMP0]], [[N_VEC]] +; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]] ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3 ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 2 ; SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]] @@ -1043,7 +1035,6 @@ ; SINK-AFTER-NEXT: [[E_1_LCSSA]] = phi i32 [ [[SCALAR_RECUR]], [[FOR_COND1]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] ; SINK-AFTER-NEXT: [[INC]] = add nuw nsw i32 [[I_016]], 1 ; SINK-AFTER-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49 -; SINK-AFTER-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1 ; SINK-AFTER-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]] ; entry: