diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -304,7 +304,7 @@ /// Build a VPlan according to the information gathered by Legal. \return a /// VPlan for vectorization factors \p Range.Start and up to \p Range.End /// exclusive, possibly decreasing \p Range.End. - VPlanPtr buildVPlan(VFRange &Range); + VPlanPtr buildVPlan(const VFRange &Range); /// Build a VPlan using VPRecipes according to the information gather by /// Legal. This method is only used for the legacy inner loop vectorizer. @@ -312,10 +312,17 @@ VFRange &Range, SmallPtrSetImpl &DeadInstructions, const DenseMap &SinkAfter); + /// Returns a new VPlan based on \p OriginalPlan, but with all VPInstruction + /// in \p OriginalPlan lowered to widened recipes. + VPlanPtr + convertToVPRecipes(VPlan &OriginalPlan, VFRange &Range, + const DenseMap &SinkAfter); + /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. This method creates VPlans using VPRecipes. - void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF); + void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF, + VPlan &OriginalPlan); /// Adjust the recipes for any inloop reductions. The chain of instructions /// leading from the loop exit instr to the phi need to be converted to diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7775,7 +7775,8 @@ // profitable to scalarize. CM.selectUserVectorizationFactor(VF); CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(VF, VF); + auto InitialPlan = buildVPlan({VF, VF}); + buildVPlansWithVPRecipes(VF, VF, *InitialPlan); LLVM_DEBUG(printPlans(dbgs())); return {{VF, 0}}; } @@ -7796,7 +7797,8 @@ CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); + auto InitialPlan = buildVPlan({ElementCount::getFixed(1), MaxVF}); + buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF, *InitialPlan); LLVM_DEBUG(printPlans(dbgs())); if (MaxVF.isScalar()) return VectorizationFactor::Disabled(); @@ -8290,10 +8292,31 @@ /// buildVPlan(). void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, ElementCount MaxVF) { + // Outer loop handling: They may require CFG and instruction level + // transformations before even evaluating whether vectorization is profitable. + // Since we cannot modify the incoming IR, we need to build VPlan upfront in + // the vectorization pipeline. + assert(!OrigLoop->isInnermost()); + assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); + auto MaxVFPlusOne = MaxVF.getWithIncrement(1); for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { VFRange SubRange = {VF, MaxVFPlusOne}; - VPlans.push_back(buildVPlan(SubRange)); + auto Plan = buildVPlan(SubRange); + + if (EnableVPlanPredication) { + VPlanPredicator VPP(*Plan); + VPP.predicate(); + // Avoid running transformation to recipes until masked code generation in + // VPlan-native path is in place. + } else { + SmallPtrSet DeadInstructions; + VPlanTransforms::VPInstructionsToVPRecipes( + OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions, + *PSE.getSE()); + } + + VPlans.emplace_back(std::move(Plan)); VF = SubRange.End; } } @@ -8761,9 +8784,31 @@ } void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, - ElementCount MaxVF) { + ElementCount MaxVF, + VPlan &OriginalPlan) { assert(OrigLoop->isInnermost() && "Inner loop expected."); + // Remove the loop exit block and pre-header/entry block, to simplify the code + // dealing with VPlans for inner loops, as those blocks are not needed at the + // moment. + VPBasicBlock *Exit = + cast(OriginalPlan.getEntry()->getExitBasicBlock()); + SmallVector Preds(Exit->getPredecessors().begin(), + Exit->getPredecessors().end()); + for (VPBlockBase *Pred : Preds) + VPBlockUtils::disconnectBlocks(Pred, Exit); + delete Exit; + + auto *TopRegion = cast(OriginalPlan.getEntry()); + VPBasicBlock *PreEntry = cast(TopRegion->getEntryBasicBlock()); + assert(PreEntry->empty() && "Expecting empty pre-entry block."); + auto *NewEntry = PreEntry->getSingleSuccessor(); + VPBlockUtils::disconnectBlocks(PreEntry, NewEntry); + assert(NewEntry->getNumPredecessors() == 1); + VPBlockUtils::disconnectBlocks(NewEntry->getPredecessors()[0], NewEntry); + TopRegion->setEntry(NewEntry); + delete PreEntry; + // Collect instructions from the original loop that will become trivially dead // in the vectorized loop. We don't need to vectorize these instructions. For // example, original induction update instructions can become dead because we @@ -8785,17 +8830,36 @@ for (Instruction *I : DeadInstructions) SinkAfter.erase(I); + // Remove dead instructions from original VPlan. + auto *DummyVal = new VPValue(); + OriginalPlan.addExternalDef(DummyVal); + for (VPBlockBase *Base : + depth_first(TopRegion->getEntryBasicBlock())) { + VPBasicBlock *OriginalVPBB = Base->getEntryBasicBlock(); + for (VPRecipeBase &Ingredient : + make_early_inc_range(reverse(*OriginalVPBB))) { + VPInstruction *VPInst = dyn_cast(&Ingredient); + if (!VPInst) + continue; + Instruction *Instr = VPInst->getUnderlyingInstr(); + + if (DeadInstructions.contains(Instr) || isa(Instr)) { + Ingredient.getVPValue()->replaceAllUsesWith(DummyVal); + Ingredient.eraseFromParent(); + } + } + } + auto MaxVFPlusOne = MaxVF.getWithIncrement(1); for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { VFRange SubRange = {VF, MaxVFPlusOne}; - VPlans.push_back( - buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); + VPlans.push_back(convertToVPRecipes(OriginalPlan, SubRange, SinkAfter)); VF = SubRange.End; } } -VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl &DeadInstructions, +VPlanPtr LoopVectorizationPlanner::convertToVPRecipes( + VPlan &OriginalPlan, VFRange &Range, const DenseMap &SinkAfter) { SmallPtrSet *, 1> InterleaveGroups; @@ -8853,31 +8917,38 @@ // Create a dummy pre-entry VPBasicBlock to start building the VPlan. auto Plan = std::make_unique(); - VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); - Plan->setEntry(VPBB); - // Scan the body of the loop in a topological order to visit each basic block - // after having visited its predecessor basic blocks. - LoopBlocksDFS DFS(OrigLoop); - DFS.perform(LI); + ReversePostOrderTraversal RPOT( + OriginalPlan.getEntry()->getEntryBasicBlock()); + + VPBasicBlock *VPBB = nullptr; + for (VPBlockBase *Base : RPOT) { + VPBasicBlock *OriginalVPBB = Base->getEntryBasicBlock(); - for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { // Relevant instructions from basic block BB will be grouped into VPRecipe // ingredients and fill a new VPBasicBlock. unsigned VPBBsForBB = 0; - auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); - VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); + auto *FirstVPBBForBB = new VPBasicBlock(OriginalVPBB->getName()); + if (VPBB) + VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); + else + Plan->setEntry(FirstVPBBForBB); VPBB = FirstVPBBForBB; Builder.setInsertPoint(VPBB); // Introduce each ingredient into VPlan. - // TODO: Model and preserve debug instrinsics in VPlan. - for (Instruction &I : BB->instructionsWithoutDebug()) { - Instruction *Instr = &I; + for (auto I = OriginalVPBB->begin(), E = OriginalVPBB->end(); I != E;) { + VPRecipeBase *Ingredient = &*I++; + VPInstruction *VPInst = dyn_cast(Ingredient); + Instruction *Instr = + VPInst + ? VPInst->getUnderlyingInstr() + : cast( + cast(Ingredient)->getUnderlyingInstr()); // First filter out irrelevant instructions, to ensure no recipes are // built for them. - if (isa(Instr) || DeadInstructions.count(Instr)) + if (isa(Instr)) continue; SmallVector Operands; @@ -8914,21 +8985,11 @@ RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); if (NextVPBB != VPBB) { VPBB = NextVPBB; - VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) - : ""); + VPBB->setName(OriginalVPBB->getName() + "." + Twine(VPBBsForBB++)); } } } - // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks - // may also be empty, such as the last one VPBB, reflecting original - // basic-blocks with no recipes. - VPBasicBlock *PreEntry = cast(Plan->getEntry()); - assert(PreEntry->empty() && "Expecting empty pre-entry block."); - VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); - VPBlockUtils::disconnectBlocks(PreEntry, Entry); - delete PreEntry; - // --------------------------------------------------------------------------- // Transform initial VPlan: Apply previously taken decisions, in order, to // bring the VPlan to its final state. @@ -9015,14 +9076,7 @@ return Plan; } -VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { - // Outer loop handling: They may require CFG and instruction level - // transformations before even evaluating whether vectorization is profitable. - // Since we cannot modify the incoming IR, we need to build VPlan upfront in - // the vectorization pipeline. - assert(!OrigLoop->isInnermost()); - assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); - +VPlanPtr LoopVectorizationPlanner::buildVPlan(const VFRange &Range) { // Create new empty VPlan auto Plan = std::make_unique(); @@ -9034,19 +9088,6 @@ VF *= 2) Plan->addVF(VF); - if (EnableVPlanPredication) { - VPlanPredicator VPP(*Plan); - VPP.predicate(); - - // Avoid running transformation to recipes until masked code generation in - // VPlan-native path is in place. - return Plan; - } - - SmallPtrSet DeadInstructions; - VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan, - Legal->getInductionVars(), - DeadInstructions, *PSE.getSE()); return Plan; }