diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -293,9 +293,6 @@ /// Return the fixed-order recurrences found in the loop. RecurrenceSet &getFixedOrderRecurrences() { return FixedOrderRecurrences; } - /// Return the set of instructions to sink to handle fixed-order recurrences. - MapVector &getSinkAfter() { return SinkAfter; } - /// Returns the widest induction type. Type *getWidestInductionType() { return WidestIndTy; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -350,9 +350,9 @@ /// Build a VPlan using VPRecipes according to the information gather by /// Legal. This method is only used for the legacy inner loop vectorizer. - VPlanPtr buildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl &DeadInstructions, - const MapVector &SinkAfter); + VPlanPtr + buildVPlanWithVPRecipes(VFRange &Range, + SmallPtrSetImpl &DeadInstructions); /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8677,34 +8677,10 @@ auto &ConditionalAssumes = Legal->getConditionalAssumes(); DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); - MapVector &SinkAfter = Legal->getSinkAfter(); - // Dead instructions do not need sinking. Remove them from SinkAfter. - for (Instruction *I : DeadInstructions) - SinkAfter.erase(I); - - // Cannot sink instructions after dead instructions (there won't be any - // recipes for them). Instead, find the first non-dead previous instruction. - for (auto &P : Legal->getSinkAfter()) { - Instruction *SinkTarget = P.second; - Instruction *FirstInst = &*SinkTarget->getParent()->begin(); - (void)FirstInst; - while (DeadInstructions.contains(SinkTarget)) { - assert( - SinkTarget != FirstInst && - "Must find a live instruction (at least the one feeding the " - "fixed-order recurrence PHI) before reaching beginning of the block"); - SinkTarget = SinkTarget->getPrevNode(); - assert(SinkTarget != P.first && - "sink source equals target, no sinking required"); - } - P.second = SinkTarget; - } - auto MaxVFPlusOne = MaxVF.getWithIncrement(1); for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { VFRange SubRange = {VF, MaxVFPlusOne}; - VPlans.push_back( - buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); + VPlans.push_back(buildVPlanWithVPRecipes(SubRange, DeadInstructions)); VF = SubRange.End; } } @@ -8810,8 +8786,7 @@ } VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl &DeadInstructions, - const MapVector &SinkAfter) { + VFRange &Range, SmallPtrSetImpl &DeadInstructions) { SmallPtrSet *, 1> InterleaveGroups; @@ -8822,12 +8797,6 @@ // process after constructing the initial VPlan. // --------------------------------------------------------------------------- - // Mark instructions we'll need to sink later and their targets as - // ingredients whose recipe we'll need to record. - for (const auto &Entry : SinkAfter) { - RecipeBuilder.recordRecipeOf(Entry.first); - RecipeBuilder.recordRecipeOf(Entry.second); - } for (const auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; RecurKind Kind = @@ -8896,7 +8865,6 @@ DFS.perform(LI); VPBasicBlock *VPBB = HeaderVPBB; - SmallVector InductionsToMove; for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { // Relevant instructions from basic block BB will be grouped into VPRecipe // ingredients and fill a new VPBasicBlock. @@ -8951,6 +8919,7 @@ Plan->addVPValue(UV, Def); } + RecipeBuilder.setRecipe(Instr, Recipe); if (isa(Recipe) && HeaderVPBB->getFirstNonPhi() != VPBB->end()) { // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section @@ -8959,11 +8928,9 @@ // block after applying SinkAfter, which relies on the original // position of the trunc. assert(isa(Instr)); - InductionsToMove.push_back( - cast(Recipe)); - } - RecipeBuilder.setRecipe(Instr, Recipe); - VPBB->appendRecipe(Recipe); + Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); + } else + VPBB->appendRecipe(Recipe); continue; } @@ -9011,62 +8978,145 @@ } return nullptr; }; - for (const auto &Entry : SinkAfter) { - VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); - VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); - - auto *TargetRegion = GetReplicateRegion(Target); - auto *SinkRegion = GetReplicateRegion(Sink); - if (!SinkRegion) { - // If the sink source is not a replicate region, sink the recipe directly. - if (TargetRegion) { - // The target is in a replication region, make sure to move Sink to - // the block after it, not into the replication region itself. - VPBasicBlock *NextBlock = - cast(TargetRegion->getSuccessors().front()); - Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); - } else - Sink->moveAfter(Target); + + VPDominatorTree VPDT; + VPDT.recalculate(*Plan); + + for (VPRecipeBase &R : HeaderVPBB->phis()) { + SmallVector WorkList; + SmallPtrSet Seen; + auto *FOR = dyn_cast(&R); + if (!FOR) continue; + + auto *Previous = FOR->getPreviousValue()->getDefiningRecipe(); + + SmallPtrSet SeenPhis; + while (auto *PrevPhi = + dyn_cast_or_null(Previous)) { + assert(PrevPhi->getParent() == FOR->getParent()); + assert(SeenPhis.insert(PrevPhi).second); + Previous = PrevPhi->getPreviousValue()->getDefiningRecipe(); } - // The sink source is in a replicate region. Unhook the region from the CFG. - auto *SinkPred = SinkRegion->getSinglePredecessor(); - auto *SinkSucc = SinkRegion->getSingleSuccessor(); - VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); - VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); - VPBlockUtils::connectBlocks(SinkPred, SinkSucc); - - if (TargetRegion) { - // The target recipe is also in a replicate region, move the sink region - // after the target region. - auto *TargetSucc = TargetRegion->getSingleSuccessor(); - VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); - VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); - VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); - } else { - // The sink source is in a replicate region, we need to move the whole - // replicate region, which should only contain a single recipe in the - // main block. - auto *SplitBlock = - Target->getParent()->splitAt(std::next(Target->getIterator())); + auto CompareByComesBefore = [&](const VPRecipeBase *A, + const VPRecipeBase *B) { + auto ComesBefore = [](const VPRecipeBase *A, const VPRecipeBase *B) { + for (auto &R : *A->getParent()) { + if (&R == A) + return true; + if (&R == B) + return false; + } + llvm_unreachable("recipe not found"); + }; + if (A->getParent() == B->getParent()) + return ComesBefore(A, B); + return VPDT.dominates(A->getParent(), B->getParent()); + }; + std::set InstrsToSink( + CompareByComesBefore); + + Seen.insert(Previous); + auto OrigPrevious = Previous; + auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) { + if (!Seen.insert(SinkCandidate).second) + return; + + if (CompareByComesBefore(OrigPrevious, SinkCandidate)) + return; + + if (isa(SinkCandidate)) + return; + WorkList.push_back(SinkCandidate); + InstrsToSink.insert(SinkCandidate); + }; - auto *SplitPred = SplitBlock->getSinglePredecessor(); + WorkList.push_back(FOR); + // Try to recursively sink instructions and their users after Previous. + for (unsigned I = 0; I != WorkList.size(); ++I) { + VPRecipeBase *Current = WorkList[I]; + for (VPUser *User : Current->getVPSingleValue()->users()) { + if (auto *R = dyn_cast(User)) + TryToPushSinkCandidate(R); + } + } + for (VPRecipeBase *SinkCandidate : InstrsToSink) { + if (!Previous) { + for (VPValue *Op : SinkCandidate->operands()) { + VPRecipeBase *Curr = Op->getDefiningRecipe(); + if (!Curr) + continue; + if (!Previous) { + if (Curr == FOR) { + Previous = OrigPrevious; + } else + Previous = Curr; + continue; + } - VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); - VPBlockUtils::connectBlocks(SplitPred, SinkRegion); - VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); + if (CompareByComesBefore(Previous, Curr)) + Previous = Curr; + } + } + + auto DoSink = [&]() { + auto *TargetRegion = GetReplicateRegion(Previous); + auto *SinkRegion = GetReplicateRegion(SinkCandidate); + if (!SinkRegion) { + // If the sink source is not a replicate region, sink the recipe + // directly. + if (TargetRegion) { + // The target is in a replication region, make sure to move Sink to + // the block after it, not into the replication region itself. + VPBasicBlock *NextBlock = + cast(TargetRegion->getSuccessors().front()); + SinkCandidate->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); + } else + SinkCandidate->moveAfter(Previous); + return; + } + // The sink source is in a replicate region. Unhook the region from the + // CFG. + auto *SinkPred = SinkRegion->getSinglePredecessor(); + auto *SinkSucc = SinkRegion->getSingleSuccessor(); + VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); + VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); + VPBlockUtils::connectBlocks(SinkPred, SinkSucc); + + if (TargetRegion) { + // The target recipe is also in a replicate region, move the sink + // region after the target region. + auto *TargetSucc = TargetRegion->getSingleSuccessor(); + VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); + VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); + VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); + } else { + // The sink source is in a replicate region, we need to move the whole + // replicate region, which should only contain a single recipe in the + // main block. + auto *SplitBlock = Previous->getParent()->splitAt( + std::next(Previous->getIterator())); + + auto *SplitPred = SplitBlock->getSinglePredecessor(); + + VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); + VPBlockUtils::connectBlocks(SplitPred, SinkRegion); + VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); + } + + VPDT.recalculate(*Plan); + }; + DoSink(); + + Previous = nullptr; + WorkList.push_back(SinkCandidate); } } VPlanTransforms::removeRedundantCanonicalIVs(*Plan); VPlanTransforms::removeRedundantInductionCasts(*Plan); - // Now that sink-after is done, move induction recipes for optimized truncates - // to the phi section of the header block. - for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) - Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); - // Adjust the recipes for any inloop reductions. adjustRecipesForReductions(cast(TopRegion->getExiting()), Plan, RecipeBuilder, Range.Start); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1259,6 +1259,8 @@ void execute(VPTransformState &State) override; + VPValue *getPreviousValue() { return getOperand(1); } + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll @@ -57,7 +57,7 @@ ; CHECK-NEXT: store i32 [[ADD_2]], ptr [[IDX_2]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 2000 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -460,18 +460,18 @@ ; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT3]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[VECTOR_RECUR]], <4 x float> [[BROADCAST_SPLAT3]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[TMP4]], +; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <4 x float> [[TMP4]], ; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <4 x float> [[TMP4]], [[TMP2]] -; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[TMP4]], +; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <4 x float> [[TMP4]], ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[DST_1:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 ; CHECK-NEXT: store <4 x float> [[TMP6]], ptr [[TMP9]], align 4 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[DST_2:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0 -; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP11]], align 4 +; CHECK-NEXT: store <4 x float> [[TMP7]], ptr [[TMP11]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[DST_3:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 -; CHECK-NEXT: store <4 x float> [[TMP7]], ptr [[TMP13]], align 4 +; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP13]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -555,19 +555,19 @@ ; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT3]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[VECTOR_RECUR]], <4 x float> [[BROADCAST_SPLAT3]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[TMP4]], -; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <4 x float> [[TMP5]], -; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP2]] -; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <4 x float> [[TMP4]], +; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <4 x float> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <4 x float> [[TMP4]], +; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <4 x float> [[TMP6]], +; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x float> [[TMP7]], [[TMP2]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[DST_1:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0 -; CHECK-NEXT: store <4 x float> [[TMP7]], ptr [[TMP10]], align 4 +; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[TMP10]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[DST_2:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0 -; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP12]], align 4 +; CHECK-NEXT: store <4 x float> [[TMP6]], ptr [[TMP12]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[DST_3:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 0 -; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[TMP14]], align 4 +; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP14]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] @@ -950,10 +950,10 @@ ; CHECK-NEXT: [[TMP5]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP5]], <4 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = trunc <4 x i64> [[TMP6]] to <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP7]], -; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP8]], -; CHECK-NEXT: [[TMP10:%.*]] = icmp slt <4 x i32> [[TMP7]], -; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP7]], <4 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp slt <4 x i32> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP7]], +; CHECK-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP7]], <4 x i32> [[TMP10]] ; CHECK-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP4]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]