diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3228,33 +3228,6 @@ } BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { - // The trip counts should be cached by now. - Value *Count = getTripCount(); - Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); - - auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); - - // Add a check in the middle block to see if we have completed - // all of the iterations in the first vector loop. Three cases: - // 1) If we require a scalar epilogue, there is no conditional branch as - // we unconditionally branch to the scalar preheader. Do nothing. - // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. - // Thus if tail is to be folded, we know we don't need to run the - // remainder and we can use the previous value for the condition (true). - // 3) Otherwise, construct a runtime check. - if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { - Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, - Count, VectorTripCount, "cmp.n", - LoopMiddleBlock->getTerminator()); - - // Here we use the same DebugLoc as the scalar loop latch terminator instead - // of the corresponding compare because they may have ended up with - // different line numbers and we want to avoid awkward line stepping while - // debugging. Eg. if the compare has got a line number inside the loop. - CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); - cast(LoopMiddleBlock->getTerminator())->setCondition(CmpN); - } - #ifdef EXPENSIVE_CHECKS assert(DT->verify(DominatorTree::VerificationLevel::Fast)); #endif @@ -3740,7 +3713,7 @@ // in the exit block, so update the builder. State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI()); for (const auto &KV : Plan.getLiveOuts()) - KV.second->fixPhi(Plan, State); + KV.second->execute(Plan, State); for (Instruction *PI : PredicatedInstructions) sinkScalarOperands(&*PI); @@ -3857,7 +3830,7 @@ "recurrence phi must have a single user: FirstOrderRecurrenceSplice"); SmallVector LiveOuts; for (VPUser *U : RecurSplice->users()) - if (auto *LiveOut = dyn_cast(U)) + if (auto *LiveOut = dyn_cast(U)) LiveOuts.push_back(LiveOut); if (!LiveOuts.empty()) { @@ -3882,7 +3855,7 @@ for (VPLiveOut *LiveOut : LiveOuts) { assert(!Cost->requiresScalarEpilogue(VF)); - PHINode *LCSSAPhi = LiveOut->getPhi(); + PHINode *LCSSAPhi = cast(LiveOut)->getPhi(); LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); State.Plan->removeLiveOut(LCSSAPhi); } @@ -8848,7 +8821,7 @@ Value *IncomingValue = ExitPhi.getIncomingValueForBlock(ExitingBB); VPValue *V = Plan.getVPValueOrAddLiveIn(IncomingValue); - Plan.addLiveOut(&ExitPhi, V); + Plan.addLiveOut(&ExitPhi, new VPLiveOutPhi(&ExitPhi, V)); } } @@ -9091,6 +9064,34 @@ VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan); VPlanTransforms::mergeBlocksIntoPredecessors(*Plan); + // Add a check in the middle block to see if we have completed + // all of the iterations in the first vector loop. Three cases: + // 1) If we require a scalar epilogue, there is no conditional branch as + // we unconditionally branch to the scalar preheader. Do nothing. + // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. + // Thus if tail is to be folded, we know we don't need to run the + // remainder and we can use the previous value for the condition (true). + // 3) Otherwise, construct a runtime check. + if (LoopVectorizationPlanner::getDecisionAndClampRange( + [this](ElementCount VF) { return !CM.requiresScalarEpilogue(VF); }, + Range) && + !CM.foldTailByMasking()) { + auto *VPBB = + cast(Plan->getVectorLoopRegion()->getSingleSuccessor()); + + auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); + // Here we use the same DebugLoc as the scalar loop latch terminator instead + // of the corresponding compare because they may have ended up with + // different line numbers and we want to avoid awkward line stepping while + // debugging. Eg. if the compare has got a line number inside the loop. + auto *Cmp = + new VPInstruction(VPInstruction::ICmpEQ, + {Plan->getTripCount(), &Plan->getVectorTripCount()}, + ScalarLatchTerm->getDebugLoc()); + VPBB->appendRecipe(Cmp); + Plan->addLiveOut(nullptr, new VPLiveOutBr(Cmp)); + } + assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); return std::make_optional(std::move(Plan)); } @@ -9128,6 +9129,14 @@ addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), CM.getTailFoldingStyle()); + + auto *Cmp = + new VPInstruction(VPInstruction::ICmpEQ, + {Plan->getTripCount(), &Plan->getVectorTripCount()}); + auto *VPBB = + cast(Plan->getVectorLoopRegion()->getSingleSuccessor()); + VPBB->appendRecipe(Cmp); + Plan->addLiveOut(nullptr, new VPLiveOutBr(Cmp)); return Plan; } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -662,25 +662,19 @@ #endif }; -/// A value that is used outside the VPlan. The operand of the user needs to be -/// added to the associated LCSSA phi node. +/// A value that is used outside the VPlan. class VPLiveOut : public VPUser { - PHINode *Phi; +protected: + VPLiveOut(VPValue *Op, VPUser::VPUserID ID) : VPUser({Op}, ID) {} public: - VPLiveOut(PHINode *Phi, VPValue *Op) - : VPUser({Op}, VPUser::VPUserID::LiveOut), Phi(Phi) {} + VPLiveOut(VPValue *Op) : VPLiveOut(Op, VPUser::VPUserID::LiveOut) {} static inline bool classof(const VPUser *U) { return U->getVPUserID() == VPUser::VPUserID::LiveOut; } - /// Fixup the wrapped LCSSA phi node in the unique exit block. This simply - /// means we need to add the appropriate incoming value from the middle - /// block as exiting edges from the scalar epilogue loop (if present) are - /// already in place, and we exit the vector loop exclusively to the middle - /// block. - void fixPhi(VPlan &Plan, VPTransformState &State); + virtual void execute(VPlan &Plan, VPTransformState &State) = 0; /// Returns true if the VPLiveOut uses scalars of operand \p Op. bool usesScalars(const VPValue *Op) const override { @@ -689,10 +683,44 @@ return true; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + virtual void print(raw_ostream &O, VPSlotTracker &SlotTracker) const = 0; +#endif +}; + +/// A value that is used outside the VPlan. The operand of the user needs to be +/// added to the associated LCSSA phi node. +class VPLiveOutPhi : public VPLiveOut { + PHINode *Phi; + +public: + VPLiveOutPhi(PHINode *Phi, VPValue *Op) + : VPLiveOut(Op, VPUser::VPUserID::LiveOutPhi), Phi(Phi) {} + + static inline bool classof(const VPUser *U) { + return U->getVPUserID() == VPUser::VPUserID::LiveOutPhi; + } + + /// Fixup the wrapped LCSSA phi node in the unique exit block. This simply + /// means we need to add the appropriate incoming value from the middle + /// block as exiting edges from the scalar epilogue loop (if present) are + /// already in place, and we exit the vector loop exclusively to the middle + /// block. + void execute(VPlan &Plan, VPTransformState &State) override; + PHINode *getPhi() const { return Phi; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the VPLiveOut to \p O. + virtual void print(raw_ostream &O, VPSlotTracker &SlotTracker) const override; +#endif +}; + +struct VPLiveOutBr : public VPLiveOut { + VPLiveOutBr(VPValue *Op) : VPLiveOut(Op) {} + + void execute(VPlan &Plan, VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void print(raw_ostream &O, VPSlotTracker &SlotTracker) const override; #endif }; @@ -828,6 +856,7 @@ // values of a first-order recurrence. Not, ICmpULE, + ICmpEQ, SLPLoad, SLPStore, ActiveLaneMask, @@ -2618,7 +2647,7 @@ /// be only one at most. If there isn't one, then return nullptr. VPActiveLaneMaskPHIRecipe *getActiveLaneMaskPhi(); - void addLiveOut(PHINode *PN, VPValue *V); + void addLiveOut(PHINode *PN, VPLiveOut *V); void removeLiveOut(PHINode *PN) { delete LiveOuts[PN]; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -354,9 +354,8 @@ assert(PredVPB->getSingleSuccessor() == this && "predecessor must have the current block as only successor"); BasicBlock *ExitingBB = State->CFG.VPBB2IRBB[ExitingVPBB]; - // The Exit block of a loop is always set to be successor 0 of the Exiting - // block. cast(ExitingBB->getTerminator())->setSuccessor(0, NewBB); + State->Builder.SetInsertPoint(NewBB->getTerminator()); } else if (PrevVPBB && /* A */ !((SingleHPred = getSingleHierarchicalPredecessor()) && SingleHPred->getExitingBasicBlock() == PrevVPBB && @@ -830,9 +829,9 @@ void VPlan::dump() const { print(dbgs()); } #endif -void VPlan::addLiveOut(PHINode *PN, VPValue *V) { +void VPlan::addLiveOut(PHINode *PN, VPLiveOut *LiveOut) { assert(LiveOuts.count(PN) == 0 && "an exit value for PN already exists"); - LiveOuts.insert({PN, new VPLiveOut(PN, V)}); + LiveOuts.insert({PN, LiveOut}); } void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopHeaderBB, diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -160,7 +160,7 @@ } } -void VPLiveOut::execute(VPlan &Plan, VPTransformState &State) { +void VPLiveOutPhi::execute(VPlan &Plan, VPTransformState &State) { auto Lane = VPLane::getLastLaneForVF(State.VF); VPValue *ExitValue = getOperand(0); if (vputils::isUniformAfterVectorization(ExitValue)) @@ -169,7 +169,7 @@ State.Builder.GetInsertBlock()); } -void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const { +void VPLiveOutPhi::print(raw_ostream &O, VPSlotTracker &SlotTracker) const { O << "Live-out "; getPhi()->printAsOperand(O); O << " = "; @@ -177,6 +177,17 @@ O << "\n"; } +void VPLiveOutBr::execute(VPlan &Plan, VPTransformState &State) { + cast(State.CFG.PrevBB->getTerminator()) + ->setCondition(State.get(getOperand(0), 0)); +} + +void VPLiveOutBr::print(raw_ostream &O, VPSlotTracker &SlotTracker) const { + O << "Live-out exit branch "; + getOperand(0)->printAsOperand(O, SlotTracker); + O << "\n"; +} + void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { assert(!Parent && "Recipe already in some VPBasicBlock"); assert(InsertPos->getParent() && @@ -251,6 +262,19 @@ State.set(this, V, Part); break; } + case VPInstruction::ICmpEQ: { + if (Part != 0) + break; + Value *IV = State.get(getOperand(0), {Part, 0}); + Value *TC = State.get(getOperand(1), Part); + Instruction *Cmp = + CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IV, TC, "", + &*Builder.GetInsertBlock()->getTerminator()); + Cmp->setDebugLoc(DL); + State.set(this, Cmp, Part); + break; + } + case Instruction::Select: { Value *Cond = State.get(getOperand(0), Part); Value *Op1 = State.get(getOperand(1), Part); diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -207,6 +207,7 @@ enum class VPUserID { Recipe, LiveOut, + LiveOutPhi, }; private: diff --git a/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll b/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll --- a/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll +++ b/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll @@ -57,7 +57,7 @@ ; CHECK-NEXT: [[N_MOD_VF8:%.*]] = urem i64 [[TMP3]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF8]] ; CHECK-NEXT: br label [[VECTOR_BODY11:%.*]] -; CHECK: vector.body11: +; CHECK: vector.body10: ; CHECK-NEXT: [[INDEX12:%.*]] = phi i64 [ 0, [[VECTOR_PH7]] ], [ [[INDEX_NEXT13:%.*]], [[VECTOR_BODY11]] ] ; CHECK-NEXT: store i32 0, ptr [[TMP1]], align 4, !alias.scope !4, !noalias !7 ; CHECK-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX12]], 4 @@ -87,7 +87,7 @@ ; CHECK-NEXT: [[N_MOD_VF24:%.*]] = urem i64 [[TMP3]], 4 ; CHECK-NEXT: [[N_VEC25:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF24]] ; CHECK-NEXT: br label [[VECTOR_BODY28:%.*]] -; CHECK: vector.body28: +; CHECK: vector.body27: ; CHECK-NEXT: [[INDEX29:%.*]] = phi i64 [ 0, [[VECTOR_PH23]] ], [ [[INDEX_NEXT30:%.*]], [[VECTOR_BODY28]] ] ; CHECK-NEXT: store i32 0, ptr [[TMP1]], align 4, !alias.scope !10, !noalias !13 ; CHECK-NEXT: [[INDEX_NEXT30]] = add nuw i64 [[INDEX29]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll --- a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll @@ -24,7 +24,10 @@ ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<[[EC:%.+]]> = icmp eq ir<1000> vp<%0> ; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: Live-out exit branch vp<[[EC]]> ; CHECK-NEXT: } ; entry: