diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -500,7 +500,8 @@ /// latter is the case when vectorizing the epilogue loop. In the case of /// epilogue vectorization, this function is overriden to handle the more /// complex control flow around the loops. - virtual std::pair createVectorizedLoopSkeleton(); + virtual std::pair + createVectorizedLoopSkeleton(VPlan &Plan, VPTransformState &State); /// Fix the vectorized code, taking care of header phi's, live-outs, and more. void fixVectorizedLoop(VPTransformState &State, VPlan &Plan); @@ -608,7 +609,7 @@ void truncateToMinimalBitwidths(VPTransformState &State); /// Returns (and creates if needed) the original loop trip count. - Value *getOrCreateTripCount(BasicBlock *InsertBlock); + Value *getTripCount(BasicBlock *InsertBlock); /// Returns (and creates if needed) the trip count of the widened loop. Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); @@ -832,15 +833,17 @@ // Override this function to handle the more complex control flow around the // three loops. - std::pair createVectorizedLoopSkeleton() final { - return createEpilogueVectorizedLoopSkeleton(); + std::pair + createVectorizedLoopSkeleton(VPlan &Plan, VPTransformState &State) final { + return createEpilogueVectorizedLoopSkeleton(Plan, State); } /// The interface for creating a vectorized skeleton using one of two /// different strategies, each corresponding to one execution of the vplan /// as described above. virtual std::pair - createEpilogueVectorizedLoopSkeleton() = 0; + createEpilogueVectorizedLoopSkeleton(VPlan &Plan, + VPTransformState &State) = 0; /// Holds and updates state information required to vectorize the main loop /// and its epilogue in two separate passes. This setup helps us avoid @@ -868,7 +871,9 @@ EPI, LVL, CM, BFI, PSI, Check) {} /// Implements the interface for creating a vectorized skeleton using the /// *main loop* strategy (ie the first pass of vplan execution). - std::pair createEpilogueVectorizedLoopSkeleton() final; + std::pair + createEpilogueVectorizedLoopSkeleton(VPlan &Plan, + VPTransformState &State) final; protected: /// Emits an iteration count bypass check once for the main loop (when \p @@ -898,7 +903,9 @@ } /// Implements the interface for creating a vectorized skeleton using the /// *epilogue loop* strategy (ie the second pass of vplan execution). - std::pair createEpilogueVectorizedLoopSkeleton() final; + std::pair + createEpilogueVectorizedLoopSkeleton(VPlan &Plan, + VPTransformState &State) final; protected: /// Emits an iteration count bypass check after the main vector loop has @@ -2883,32 +2890,8 @@ PredicatedInstructions.push_back(Cloned); } -Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) { - if (TripCount) - return TripCount; - - assert(InsertBlock); - IRBuilder<> Builder(InsertBlock->getTerminator()); - // Find the loop boundaries. - Type *IdxTy = Legal->getWidestInductionType(); - assert(IdxTy && "No type for induction"); - const SCEV *ExitCount = createTripCountSCEV(IdxTy, PSE); - - const DataLayout &DL = InsertBlock->getModule()->getDataLayout(); - - // Expand the trip count and place the new instructions in the preheader. - // Notice that the pre-header does not change, only the loop body. - SCEVExpander Exp(*PSE.getSE(), DL, "induction"); - - // Count holds the overall loop count (N). - TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), - InsertBlock->getTerminator()); - - if (TripCount->getType()->isPointerTy()) - TripCount = - CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", - InsertBlock->getTerminator()); - +Value *InnerLoopVectorizer::getTripCount(BasicBlock *InsertBlock) { + assert(TripCount); return TripCount; } @@ -2917,7 +2900,7 @@ if (VectorTripCount) return VectorTripCount; - Value *TC = getOrCreateTripCount(InsertBlock); + Value *TC = getTripCount(InsertBlock); IRBuilder<> Builder(InsertBlock->getTerminator()); Type *Ty = TC->getType(); @@ -2995,7 +2978,7 @@ } void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { - Value *Count = getOrCreateTripCount(LoopVectorPreHeader); + Value *Count = getTripCount(LoopVectorPreHeader); // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. BasicBlock *const TCCheckBlock = LoopVectorPreHeader; @@ -3255,7 +3238,7 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { // The trip counts should be cached by now. - Value *Count = getOrCreateTripCount(LoopVectorPreHeader); + Value *Count = getTripCount(LoopVectorPreHeader); Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); @@ -3289,7 +3272,8 @@ } std::pair -InnerLoopVectorizer::createVectorizedLoopSkeleton() { +InnerLoopVectorizer::createVectorizedLoopSkeleton(VPlan &Plan, + VPTransformState &State) { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the @@ -3322,6 +3306,8 @@ ... */ + TripCount = State.get(Plan.getTripCount(), 0); + // Create an empty vector loop, and prepare basic blocks for the runtime // checks. createVectorLoopSkeleton(""); @@ -7706,15 +7692,6 @@ LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF << '\n'); - // Workaround! Compute the trip count of the original loop and cache it - // before we start modifying the CFG. This code has a systemic problem - // wherein it tries to run analysis over partially constructed IR; this is - // wrong, and not simply for SCEV. The trip count of the original loop - // simply happens to be prone to hitting this in practice. In theory, we - // can hit the same issue for any SCEV, or ValueTracking query done during - // mutation. See PR49900. - ILV.getOrCreateTripCount(OrigLoop->getLoopPreheader()); - if (!IsEpilogueVectorization) VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); @@ -7723,9 +7700,14 @@ // 1. Set up the skeleton for vectorization, including vector pre-header and // middle block. The vector loop is created during VPlan execution. VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; + + State.CFG.PrevBB = OrigLoop->getLoopPreheader(); + State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator()); + BestVPlan.getEntry()->execute(&State); + Value *CanonicalIVStartValue; std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = - ILV.createVectorizedLoopSkeleton(); + ILV.createVectorizedLoopSkeleton(BestVPlan, State); // Only use noalias metadata when using memory checks guaranteeing no overlap // across all iterations. @@ -7756,10 +7738,9 @@ //===------------------------------------------------===// // 2. Copy and widen instructions from the old loop into the new loop. - BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), - ILV.getOrCreateVectorTripCount(nullptr), - CanonicalIVStartValue, State, - IsEpilogueVectorization); + BestVPlan.prepareToExecute( + ILV.getTripCount(nullptr), ILV.getOrCreateVectorTripCount(nullptr), + CanonicalIVStartValue, State, IsEpilogueVectorization); BestVPlan.execute(&State); @@ -7813,7 +7794,10 @@ /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. std::pair -EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { +EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton( + VPlan &Plan, VPTransformState &State) { + TripCount = State.get(Plan.getTripCount(), 0); + createVectorLoopSkeleton(""); // Generate the code to check the minimum iteration count of the vector @@ -7874,7 +7858,7 @@ assert(Bypass && "Expected valid bypass basic block."); ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; - Value *Count = getOrCreateTripCount(LoopVectorPreHeader); + Value *Count = getTripCount(LoopVectorPreHeader); // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. BasicBlock *const TCCheckBlock = LoopVectorPreHeader; @@ -7931,7 +7915,8 @@ /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. std::pair -EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { +EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( + VPlan &Plan, VPTransformState &State) { createVectorLoopSkeleton("vec.epilog."); // Now, compare the remaining count and if there aren't enough iterations to @@ -8193,7 +8178,7 @@ VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); if (useActiveLaneMask(TFStyle)) { - VPValue *TC = Plan.getOrCreateTripCount(); + VPValue *TC = Plan.getTripCount(); BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}, nullptr, "active.lane.mask"); } else { @@ -8769,7 +8754,8 @@ VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); if (useActiveLaneMaskForControlFlow(Style)) { // Create the active lane mask instruction in the vplan preheader. - VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock(); + VPBasicBlock *Preheader = + cast(Plan.getVectorLoopRegion()->getSinglePredecessor()); // We can't use StartV directly in the ActiveLaneMask VPInstruction, since // we have to take unrolling into account. Each part needs to start at @@ -8781,7 +8767,7 @@ Preheader->appendRecipe(CanonicalIVIncrementParts); // Create the ActiveLaneMask instruction using the correct start values. - VPValue *TC = Plan.getOrCreateTripCount(); + VPValue *TC = Plan.getTripCount(); VPValue *TripCount, *IncrementValue; if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { @@ -8926,14 +8912,23 @@ // Create initial VPlan skeleton, starting with a block for the pre-header, // followed by a region for the vector loop, followed by the middle block. The // skeleton vector loop region contains a header and latch block. - VPBasicBlock *Preheader = new VPBasicBlock("vector.ph"); - auto Plan = std::make_unique(Preheader); + VPBasicBlock *Preheader = new VPBasicBlock("ph"); + Type *IdxTy = Legal->getWidestInductionType(); + assert(IdxTy && "No type for induction"); + const SCEV *ExitCount = createTripCountSCEV(IdxTy, PSE); + VPExpandSCEVRecipe *TripCount = + new VPExpandSCEVRecipe(ExitCount, *PSE.getSE()); + Preheader->appendRecipe(TripCount); + + auto Plan = std::make_unique(Preheader, TripCount); + VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph"); + VPBlockUtils::insertBlockAfter(VecPreheader, Preheader); VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); - VPBlockUtils::insertBlockAfter(TopRegion, Preheader); + VPBlockUtils::insertBlockAfter(TopRegion, VecPreheader); VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); @@ -9126,6 +9121,20 @@ VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); HCFGBuilder.buildHierarchicalCFG(); + VPBasicBlock *Preheader = new VPBasicBlock("ph"); + Type *IdxTy = Legal->getWidestInductionType(); + assert(IdxTy && "No type for induction"); + const SCEV *ExitCount = createTripCountSCEV(IdxTy, PSE); + VPExpandSCEVRecipe *TripCount = + new VPExpandSCEVRecipe(ExitCount, *PSE.getSE()); + Preheader->appendRecipe(TripCount); + + VPBlockBase *OldEntry = Plan->getEntry(); + VPBlockUtils::connectBlocks(Preheader, OldEntry); + Plan->setEntry(Preheader); + Preheader->setPlan(&*Plan); + Plan->setTripCount(TripCount); + for (ElementCount VF : Range) Plan->addVF(VF); @@ -9143,6 +9152,7 @@ addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), CM.getTailFoldingStyle()); + return Plan; } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2261,7 +2261,8 @@ MapVector LiveOuts; public: - VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) { + VPlan(VPBlockBase *Entry = nullptr, VPValue *TripCount = nullptr) + : Entry(Entry), TripCount(TripCount) { if (Entry) Entry->setPlan(this); } @@ -2286,12 +2287,17 @@ } /// The trip count of the original loop. - VPValue *getOrCreateTripCount() { + VPValue *getTripCount() { if (!TripCount) TripCount = new VPValue(); return TripCount; } + void setTripCount(VPValue *V) { + assert(!TripCount); + TripCount = V; + } + /// The backedge taken count of the original loop. VPValue *getOrCreateBackedgeTakenCount() { if (!BackedgeTakenCount) @@ -2407,10 +2413,16 @@ /// Returns the VPRegionBlock of the vector loop. VPRegionBlock *getVectorLoopRegion() { - return cast(getEntry()->getSingleSuccessor()); + if (auto *R = dyn_cast(getEntry()->getSingleSuccessor())) + return R; + return cast( + getEntry()->getSingleSuccessor()->getSingleSuccessor()); } const VPRegionBlock *getVectorLoopRegion() const { - return cast(getEntry()->getSingleSuccessor()); + if (auto *R = dyn_cast(getEntry()->getSingleSuccessor())) + return R; + return cast( + getEntry()->getSingleSuccessor()->getSingleSuccessor()); } /// Returns the canonical induction recipe of the vector loop. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -593,8 +593,6 @@ } for (VPValue *VPV : VPValuesToFree) delete VPV; - if (TripCount) - delete TripCount; if (BackedgeTakenCount) delete BackedgeTakenCount; for (auto &P : VPExternalDefs) @@ -675,7 +673,7 @@ State->Builder.SetInsertPoint(VectorPreHeader->getTerminator()); // Generate code in the loop pre-header and body. - for (VPBlockBase *Block : vp_depth_first_shallow(Entry)) + for (VPBlockBase *Block : vp_depth_first_shallow(Entry->getSingleSuccessor())) Block->execute(State); VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock(); @@ -1111,8 +1109,6 @@ assignSlot(&Plan.VectorTripCount); if (Plan.BackedgeTakenCount) assignSlot(Plan.BackedgeTakenCount); - if (Plan.TripCount) - assignSlot(Plan.TripCount); ReversePostOrderTraversal> RPOT(VPBlockDeepTraversalWrapper(Plan.getEntry())); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -31,7 +31,7 @@ const TargetLibraryInfo &TLI) { ReversePostOrderTraversal> RPOT( - Plan->getEntry()); + Plan->getEntry()->getSingleSuccessor()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { VPRecipeBase *Term = VPBB->getTerminator(); auto EndIter = Term ? Term->getIterator() : VPBB->end(); @@ -395,7 +395,8 @@ vp_depth_first_deep(Plan.getEntry()))) { auto *PredVPBB = dyn_cast_or_null(VPBB->getSinglePredecessor()); - if (PredVPBB && PredVPBB->getNumSuccessors() == 1) + if (PredVPBB && PredVPBB->getNumSuccessors() == 1 && + PredVPBB != Plan.getEntry()) WorkList.push_back(VPBB); }