diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -597,8 +597,7 @@ /// this phi node. void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State); - /// Fix a reduction cross-iteration phi. This is the second phase of - /// vectorizing this phi node. + /// Create code for the loop exit value of the reduction. void fixReduction(VPWidenPHIRecipe *Phi, VPTransformState &State); /// Clear NSW/NUW flags from reduction instructions if necessary. @@ -4309,7 +4308,8 @@ TrackingVH ReductionStartValue = RdxDesc.getRecurrenceStartValue(); Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); setDebugLocFromInst(Builder, ReductionStartValue); - bool IsInLoopReductionPhi = PhiR->isInLoopReduction(); + VPReductionRecipe *InLoopRed = PhiR->getInLoopReduction(); + bool IsInLoopReductionPhi = InLoopRed != nullptr; VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst); // This is the vector-clone of the value that leaves the loop. @@ -4318,26 +4318,6 @@ // Wrap flags are in general invalid after vectorization, clear them. clearReductionWrapFlags(RdxDesc, State); - // Fix the vector-loop phi. - - // Reductions do not have to start at zero. They can start with - // any loop invariant values. - BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); - - bool IsOrdered = State.VF.isVector() && IsInLoopReductionPhi && - useOrderedReductions(RdxDesc); - - for (unsigned Part = 0; Part < UF; ++Part) { - if (IsOrdered && Part > 0) - break; - Value *VecRdxPhi = State.get(PhiR->getVPSingleValue(), Part); - Value *Val = State.get(PhiR->getBackedgeValue(), Part); - if (IsOrdered) - Val = State.get(PhiR->getBackedgeValue(), UF - 1); - - cast(VecRdxPhi)->addIncoming(Val, VectorLoopLatch); - } - // Before each round, move the insertion point right between // the PHIs and the values we are going to write. // This allows us to write both PHINodes and the extractelement @@ -4424,6 +4404,7 @@ // terminate on this line. This is the easiest way to ensure we don't // accidentally cause an extra step back into the loop while debugging. setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); + bool IsOrdered = State.VF.isVector() && InLoopRed && InLoopRed->isOrdered(); if (IsOrdered) ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); else { @@ -4723,7 +4704,8 @@ // this value when we vectorize all of the instructions that use the PHI. if (RdxDesc || Legal->isFirstOrderRecurrence(P)) { Value *Iden = nullptr; - bool IsInLoopReduction = PhiR->isInLoopReduction(); + VPReductionRecipe *InLoopRed = PhiR->getInLoopReduction(); + bool IsInLoopReduction = InLoopRed != nullptr; bool ScalarPHI = State.VF.isScalar() || IsInLoopReduction; Type *VecTy = ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF); @@ -4758,8 +4740,8 @@ } } - bool IsOrdered = State.VF.isVector() && IsInLoopReduction && - useOrderedReductions(*RdxDesc); + bool IsOrdered = + State.VF.isVector() && IsInLoopReduction && InLoopRed->isOrdered(); for (unsigned Part = 0; Part < State.UF; ++Part) { // This is phase one of vectorizing PHIs. @@ -9434,7 +9416,6 @@ Value *PrevInChain = State.get(getChainOp(), 0); for (unsigned Part = 0; Part < State.UF; ++Part) { RecurKind Kind = RdxDesc->getRecurrenceKind(); - bool IsOrdered = useOrderedReductions(*RdxDesc); Value *NewVecOp = State.get(getVecOp(), Part); if (VPValue *Cond = getCondOp()) { Value *NewCond = State.get(Cond, Part); @@ -9448,7 +9429,7 @@ } Value *NewRed; Value *NextInChain; - if (IsOrdered) { + if (isOrdered()) { NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, PrevInChain); PrevInChain = NewRed; @@ -9460,7 +9441,7 @@ NextInChain = createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), NewRed, PrevInChain); - } else if (IsOrdered) + } else if (isOrdered()) NextInChain = NewRed; else { NextInChain = State.Builder.CreateBinOp( diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -38,6 +38,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/ADT/ilist.h" #include "llvm/ADT/ilist_node.h" +#include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/IRBuilder.h" #include "llvm/Support/InstructionCost.h" @@ -54,12 +55,12 @@ class InnerLoopVectorizer; class LoopInfo; class raw_ostream; -class RecurrenceDescriptor; class Value; class VPBasicBlock; class VPRegionBlock; class VPlan; class VPlanSlp; +class VPReductionRecipe; /// Returns a calculation for the total number of elements for a given \p VF. /// For fixed width vectors this value is a constant, whereas for scalable @@ -1069,7 +1070,9 @@ RecurrenceDescriptor *getRecurrenceDescriptor() { return RdxDesc; } /// Returns true if this phi is part of an in-loop reduction. - bool isInLoopReduction() const; + bool isInLoopReduction() { return getInLoopReduction(); } + + VPReductionRecipe *getInLoopReduction(); }; /// A recipe for vectorizing a phi-node as a sequence of mask-based select @@ -1229,6 +1232,8 @@ VPValue *getCondOp() const { return getNumOperands() > 2 ? getOperand(2) : nullptr; } + + bool isOrdered() const { return RdxDesc->isOrdered(); } }; /// VPReplicateRecipe replicates a given instruction producing multiple scalar @@ -1953,6 +1958,8 @@ /// Holds the VPLoopInfo analysis for this VPlan. VPLoopInfo VPLInfo; + void fixReduction(VPWidenPHIRecipe &PhiR, VPTransformState &State); + public: VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) { if (Entry) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -700,6 +700,24 @@ } #endif +void VPlan::fixReduction(VPWidenPHIRecipe &PhiR, VPTransformState &State) { + VPReductionRecipe *InLoopRed = PhiR.getInLoopReduction(); + bool IsOrdered = State.VF.isVector() && InLoopRed && InLoopRed->isOrdered(); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + if (IsOrdered && Part > 0) + break; + auto *VecRdxPhi = cast(State.get(PhiR.getVPSingleValue(), Part)); + BasicBlock *VectorLoopLatch = + State.LI->getLoopFor(VecRdxPhi->getParent())->getLoopLatch(); + Value *Val = State.get(PhiR.getBackedgeValue(), Part); + if (IsOrdered) + Val = State.get(PhiR.getBackedgeValue(), State.UF - 1); + + VecRdxPhi->addIncoming(Val, VectorLoopLatch); + } +} + /// Generate the code inside the body of the vectorized loop. Assumes a single /// LoopVectorBody basic-block was created for this. Introduce additional /// basic-blocks as needed, and fill them all. @@ -786,6 +804,16 @@ if (!EnableVPlanNativePath) updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB, L->getExitBlock()); + + // Fixup reduction PHI nodes in the vectorized loop header. + VPBasicBlock *Header = getEntry()->getEntryBasicBlock(); + for (VPRecipeBase &R : Header->phis()) { + VPWidenPHIRecipe *PhiR = dyn_cast(&R); + if (!PhiR) + continue; + if (PhiR->getRecurrenceDescriptor()) + fixReduction(*PhiR, *State); + } } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1073,20 +1101,20 @@ printOperands(O, SlotTracker); } -bool VPWidenPHIRecipe::isInLoopReduction() const { +VPReductionRecipe *VPWidenPHIRecipe::getInLoopReduction() { if (!RdxDesc) - return false; + return nullptr; - SmallVector WorkList; + SmallVector WorkList; // Walk backwards the backedge value until we either reach a VPReductionRecipe // or a phi. WorkList.push_back(cast(getBackedgeValue()->getDef())); while (!WorkList.empty()) { - const VPRecipeBase *Current = WorkList.pop_back_val(); + VPRecipeBase *Current = WorkList.pop_back_val(); if (Current->isPhi()) continue; - if (isa(Current)) - return true; + if (auto *RedR = dyn_cast(Current)) + return RedR; for (VPValue *Op : Current->operands()) { if (!Op->getDef()) continue; @@ -1094,7 +1122,7 @@ WorkList.push_back(R); } } - return false; + return nullptr; } void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,