diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -293,9 +293,6 @@ /// Return the fixed-order recurrences found in the loop. RecurrenceSet &getFixedOrderRecurrences() { return FixedOrderRecurrences; } - /// Return the set of instructions to sink to handle fixed-order recurrences. - MapVector &getSinkAfter() { return SinkAfter; } - /// Returns the widest induction type. Type *getWidestInductionType() { return WidestIndTy; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -350,9 +350,9 @@ /// Build a VPlan using VPRecipes according to the information gather by /// Legal. This method is only used for the legacy inner loop vectorizer. - VPlanPtr buildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl &DeadInstructions, - const MapVector &SinkAfter); + VPlanPtr + buildVPlanWithVPRecipes(VFRange &Range, + SmallPtrSetImpl &DeadInstructions); /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8687,34 +8687,10 @@ auto &ConditionalAssumes = Legal->getConditionalAssumes(); DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); - MapVector &SinkAfter = Legal->getSinkAfter(); - // Dead instructions do not need sinking. Remove them from SinkAfter. - for (Instruction *I : DeadInstructions) - SinkAfter.erase(I); - - // Cannot sink instructions after dead instructions (there won't be any - // recipes for them). Instead, find the first non-dead previous instruction. - for (auto &P : Legal->getSinkAfter()) { - Instruction *SinkTarget = P.second; - Instruction *FirstInst = &*SinkTarget->getParent()->begin(); - (void)FirstInst; - while (DeadInstructions.contains(SinkTarget)) { - assert( - SinkTarget != FirstInst && - "Must find a live instruction (at least the one feeding the " - "fixed-order recurrence PHI) before reaching beginning of the block"); - SinkTarget = SinkTarget->getPrevNode(); - assert(SinkTarget != P.first && - "sink source equals target, no sinking required"); - } - P.second = SinkTarget; - } - auto MaxVFPlusOne = MaxVF.getWithIncrement(1); for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { VFRange SubRange = {VF, MaxVFPlusOne}; - VPlans.push_back( - buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); + VPlans.push_back(buildVPlanWithVPRecipes(SubRange, DeadInstructions)); VF = SubRange.End; } } @@ -8820,8 +8796,7 @@ } VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl &DeadInstructions, - const MapVector &SinkAfter) { + VFRange &Range, SmallPtrSetImpl &DeadInstructions) { SmallPtrSet *, 1> InterleaveGroups; @@ -8832,12 +8807,6 @@ // process after constructing the initial VPlan. // --------------------------------------------------------------------------- - // Mark instructions we'll need to sink later and their targets as - // ingredients whose recipe we'll need to record. - for (const auto &Entry : SinkAfter) { - RecipeBuilder.recordRecipeOf(Entry.first); - RecipeBuilder.recordRecipeOf(Entry.second); - } for (const auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; RecurKind Kind = @@ -8905,7 +8874,6 @@ DFS.perform(LI); VPBasicBlock *VPBB = HeaderVPBB; - SmallVector InductionsToMove; for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { // Relevant instructions from basic block BB will be grouped into VPRecipe // ingredients and fill a new VPBasicBlock. @@ -8960,19 +8928,15 @@ Plan->addVPValue(UV, Def); } + RecipeBuilder.setRecipe(Instr, Recipe); if (isa(Recipe) && HeaderVPBB->getFirstNonPhi() != VPBB->end()) { - // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section - // of the header block. That can happen for truncates of induction - // variables. Those recipes are moved to the phi section of the header - // block after applying SinkAfter, which relies on the original - // position of the trunc. + // Move VPWidenIntOrFpInductionRecipes for optimized truncates to the + // phi section of HeaderVPBB. assert(isa(Instr)); - InductionsToMove.push_back( - cast(Recipe)); - } - RecipeBuilder.setRecipe(Instr, Recipe); - VPBB->appendRecipe(Recipe); + Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); + } else + VPBB->appendRecipe(Recipe); continue; } @@ -9007,115 +8971,14 @@ // bring the VPlan to its final state. // --------------------------------------------------------------------------- - // Apply Sink-After legal constraints. - auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { - auto *Region = dyn_cast_or_null(R->getParent()->getParent()); - if (Region && Region->isReplicator()) { - assert(Region->getNumSuccessors() == 1 && - Region->getNumPredecessors() == 1 && "Expected SESE region!"); - assert(R->getParent()->size() == 1 && - "A recipe in an original replicator region must be the only " - "recipe in its block"); - return Region; - } - return nullptr; - }; - for (const auto &Entry : SinkAfter) { - VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); - VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); - - auto *TargetRegion = GetReplicateRegion(Target); - auto *SinkRegion = GetReplicateRegion(Sink); - if (!SinkRegion) { - // If the sink source is not a replicate region, sink the recipe directly. - if (TargetRegion) { - // The target is in a replication region, make sure to move Sink to - // the block after it, not into the replication region itself. - VPBasicBlock *NextBlock = - cast(TargetRegion->getSuccessors().front()); - Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); - } else - Sink->moveAfter(Target); - continue; - } - - // The sink source is in a replicate region. Unhook the region from the CFG. - auto *SinkPred = SinkRegion->getSinglePredecessor(); - auto *SinkSucc = SinkRegion->getSingleSuccessor(); - VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); - VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); - VPBlockUtils::connectBlocks(SinkPred, SinkSucc); - - if (TargetRegion) { - // The target recipe is also in a replicate region, move the sink region - // after the target region. - auto *TargetSucc = TargetRegion->getSingleSuccessor(); - VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); - VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); - VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); - } else { - // The sink source is in a replicate region, we need to move the whole - // replicate region, which should only contain a single recipe in the - // main block. - auto *SplitBlock = - Target->getParent()->splitAt(std::next(Target->getIterator())); - - auto *SplitPred = SplitBlock->getSinglePredecessor(); - - VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); - VPBlockUtils::connectBlocks(SplitPred, SinkRegion); - VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); - } - } - VPlanTransforms::removeRedundantCanonicalIVs(*Plan); VPlanTransforms::removeRedundantInductionCasts(*Plan); - // Now that sink-after is done, move induction recipes for optimized truncates - // to the phi section of the header block. - for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) - Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); - // Adjust the recipes for any inloop reductions. adjustRecipesForReductions(cast(TopRegion->getExiting()), Plan, RecipeBuilder, Range.Start); - // Introduce a recipe to combine the incoming and previous values of a - // fixed-order recurrence. - for (VPRecipeBase &R : - Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { - auto *RecurPhi = dyn_cast(&R); - if (!RecurPhi) - continue; - - VPRecipeBase *PrevRecipe = &RecurPhi->getBackedgeRecipe(); - // Fixed-order recurrences do not contain cycles, so this loop is guaranteed - // to terminate. - while (auto *PrevPhi = - dyn_cast(PrevRecipe)) - PrevRecipe = &PrevPhi->getBackedgeRecipe(); - VPBasicBlock *InsertBlock = PrevRecipe->getParent(); - auto *Region = GetReplicateRegion(PrevRecipe); - if (Region) - InsertBlock = dyn_cast(Region->getSingleSuccessor()); - if (!InsertBlock) { - InsertBlock = new VPBasicBlock(Region->getName() + ".succ"); - VPBlockUtils::insertBlockAfter(InsertBlock, Region); - } - if (Region || PrevRecipe->isPhi()) - Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); - else - Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); - - auto *RecurSplice = cast( - Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, - {RecurPhi, RecurPhi->getBackedgeValue()})); - - RecurPhi->replaceAllUsesWith(RecurSplice); - // Set the first operand of RecurSplice to RecurPhi again, after replacing - // all users. - RecurSplice->setOperand(0, RecurPhi); - } + VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder); // Interleave memory: for each Interleave Group we marked earlier as relevant // for this VPlan, replace the Recipes widening its memory instructions with a diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -25,6 +25,7 @@ class Loop; class PredicatedScalarEvolution; class TargetLibraryInfo; +class VPBuilder; struct VPlanTransforms { /// Replaces the VPInstructions in \p Plan with corresponding @@ -76,6 +77,8 @@ static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE); + + static void adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -12,6 +12,8 @@ //===----------------------------------------------------------------------===// #include "VPlanTransforms.h" +#include "VPlanDominatorTree.h" +#include "VPRecipeBuilder.h" #include "VPlanCFG.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" @@ -533,3 +535,186 @@ // 1. Replace inductions with constants. // 2. Replace vector loop region with VPBasicBlock. } + +static VPRegionBlock *GetReplicateRegion(VPRecipeBase *R) { + auto *Region = dyn_cast_or_null(R->getParent()->getParent()); + if (Region && Region->isReplicator()) { + assert(Region->getNumSuccessors() == 1 && + Region->getNumPredecessors() == 1 && "Expected SESE region!"); + assert(R->getParent()->size() == 1 && + "A recipe in an original replicator region must be the only " + "recipe in its block"); + return Region; + } + return nullptr; +} + +static bool dominates(const VPRecipeBase *A, const VPRecipeBase *B, + VPDominatorTree &VPDT) { + auto LocalComesBefore = [](const VPRecipeBase *A, const VPRecipeBase *B) { + for (auto &R : *A->getParent()) { + if (&R == A) + return true; + if (&R == B) + return false; + } + llvm_unreachable("recipe not found"); + }; + const VPBlockBase *ParentA = A->getParent(); + const VPBlockBase *ParentB = B->getParent(); + if (ParentA == ParentB) + return LocalComesBefore(A, B); + + const VPRegionBlock *RegionA = + GetReplicateRegion(const_cast(A)); + const VPRegionBlock *RegionB = + GetReplicateRegion(const_cast(B)); + if (RegionA) + ParentA = RegionA->getExiting(); + if (RegionB) + ParentB = RegionB->getExiting(); + return VPDT.dominates(ParentA, ParentB); +} + +// Sink users of \p FOR after the recipe defining the previous value \p Previous +// of the recurrence. +static void +sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, + VPRecipeBase *Previous, + VPDominatorTree &VPDT) { + // Collect recipes that need sinking. + SmallVector WorkList; + SmallPtrSet Seen; + Seen.insert(Previous); + auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) { + if (isa(SinkCandidate) || + !Seen.insert(SinkCandidate).second || + dominates(Previous, SinkCandidate, VPDT)) + return; + + WorkList.push_back(SinkCandidate); + }; + + // Recursively sink users of FOR after Previous. + WorkList.push_back(FOR); + for (unsigned I = 0; I != WorkList.size(); ++I) { + VPRecipeBase *Current = WorkList[I]; + assert(Current->getNumDefinedValues() == 1 && + "only recipes with a single defined value expected"); + for (VPUser *User : Current->getVPSingleValue()->users()) { + if (auto *R = dyn_cast(User)) + TryToPushSinkCandidate(R); + } + } + + // Keep recipes to sink ordered by dominance so earlier instructions are + // processed first. + sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) { + return dominates(A, B, VPDT); + }); + + for (VPRecipeBase *SinkCandidate : WorkList) { + // VPPredInstPHIRecipes don't need sinking, because they will be sunk when + // sinking the containing replicate region. + if (isa(SinkCandidate) || SinkCandidate == FOR) + continue; + + VPRecipeBase *Target = Previous; + Previous = SinkCandidate; + auto *TargetRegion = GetReplicateRegion(Target); + auto *SinkRegion = GetReplicateRegion(SinkCandidate); + if (!SinkRegion) { + // If the sink source is not a replicate region, sink the recipe + // directly. + if (TargetRegion) { + // The target is in a replication region, make sure to move Sink to + // the block after it, not into the replication region itself. + VPBasicBlock *NextBlock = + cast(TargetRegion->getSuccessors().front()); + SinkCandidate->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); + } else + SinkCandidate->moveAfter(Target); + continue; + } + // The sink source is in a replicate region. Unhook the region from the + // CFG. + auto *SinkPred = SinkRegion->getSinglePredecessor(); + auto *SinkSucc = SinkRegion->getSingleSuccessor(); + VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); + VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); + VPBlockUtils::connectBlocks(SinkPred, SinkSucc); + + if (TargetRegion) { + // The target recipe is also in a replicate region, move the sink + // region after the target region. + auto *TargetSucc = TargetRegion->getSingleSuccessor(); + VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); + VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); + VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); + } else { + // The sink source is in a replicate region, we need to move the whole + // replicate region, which should only contain a single recipe in the + // main block. + auto *SplitBlock = + Target->getParent()->splitAt(std::next(Target->getIterator())); + + auto *SplitPred = SplitBlock->getSinglePredecessor(); + + VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); + VPBlockUtils::connectBlocks(SplitPred, SinkRegion); + VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); + } + // We modified the CFG, update dominator tree. + VPDT.recalculate(*SinkRegion->getPlan()); + } +} + +void VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan, + VPBuilder &Builder) { + VPDominatorTree VPDT; + VPDT.recalculate(Plan); + + for (VPRecipeBase &R : + Plan.getVectorLoopRegion()->getEntry()->getEntryBasicBlock()->phis()) { + auto *FOR = dyn_cast(&R); + if (!FOR) + continue; + + SmallPtrSet SeenPhis; + VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe(); + // Fixed-order recurrences do not contain cycles, so this loop is guaranteed + // to terminate. + while (auto *PrevPhi = + dyn_cast_or_null(Previous)) { + assert(PrevPhi->getParent() == FOR->getParent()); + assert(SeenPhis.insert(PrevPhi).second); + Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe(); + } + + sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT); + + // Introduce a recipe to combine the incoming and previous values of a + // fixed-order recurrence. + VPBasicBlock *InsertBlock = Previous->getParent(); + auto *Region = GetReplicateRegion(Previous); + if (Region) + InsertBlock = dyn_cast(Region->getSingleSuccessor()); + if (!InsertBlock) { + InsertBlock = new VPBasicBlock(Region->getName() + ".succ"); + VPBlockUtils::insertBlockAfter(InsertBlock, Region); + } + if (Region || isa(Previous)) + Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); + else + Builder.setInsertPoint(InsertBlock, std::next(Previous->getIterator())); + + auto *RecurSplice = cast( + Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, + {FOR, FOR->getBackedgeValue()})); + + FOR->replaceAllUsesWith(RecurSplice); + // Set the first operand of RecurSplice to FOR again, after replacing + // all users. + RecurSplice->setOperand(0, FOR); + } +} diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll @@ -57,7 +57,7 @@ ; CHECK-NEXT: store i32 [[ADD_2]], ptr [[IDX_2]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 2000 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -1074,9 +1074,9 @@ ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x double>, ptr [[TMP2]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR1]], <4 x double> [[WIDE_LOAD]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[WIDE_LOAD]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x double> , [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x double> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x double> , [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[WIDE_LOAD]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x double> [[TMP4]], [[TMP5]] ; CHECK-NEXT: store <4 x double> [[TMP6]], ptr [[TMP2]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996