diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -500,7 +500,7 @@ bool InvariantCond, VPTransformState &State); /// Fix the vectorized code, taking care of header phi's, live-outs, and more. - void fixVectorizedLoop(); + void fixVectorizedLoop(VPTransformState &State, VPlan &Plan); // Return true if any runtime check is added. bool areSafetyChecksAdded() { return AddedSafetyChecks; } @@ -565,6 +565,10 @@ VectorLoopValueMap.setVectorValue(Scalar, Part, Vector); } + void resetVectorValue(Value *Scalar, unsigned Part, Value *Vector) { + VectorLoopValueMap.resetVectorValue(Scalar, Part, Vector); + } + void setScalarValue(Value *Scalar, const VPIteration &Instance, Value *V) { VectorLoopValueMap.setScalarValue(Scalar, Instance, V); } @@ -604,7 +608,7 @@ void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); /// Fix the non-induction PHIs in the OrigPHIsToFix vector. - void fixNonInductionPHIs(void); + void fixNonInductionPHIs(VPTransformState &State, VPlan &Plan); /// Create a broadcast instruction. This method generates a broadcast /// instruction (shuffle) for loop invariant values and for the induction @@ -635,15 +639,16 @@ Value *Step, Instruction *DL); /// Handle all cross-iteration phis in the header. - void fixCrossIterationPHIs(); + void fixCrossIterationPHIs(VPTransformState &State, VPlan &Plan); /// Fix a first-order recurrence. This is the second phase of vectorizing /// this phi node. - void fixFirstOrderRecurrence(PHINode *Phi); + void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State, + VPlan &Plan); /// Fix a reduction cross-iteration phi. This is the second phase of /// vectorizing this phi node. - void fixReduction(PHINode *Phi); + void fixReduction(PHINode *Phi, VPTransformState &State, VPlan &Plan); /// Clear NSW/NUW flags from reduction instructions if necessary. void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc); @@ -653,7 +658,7 @@ /// block as exiting edges from the scalar epilogue loop (if present) are /// already in place, and we exit the vector loop exclusively to the middle /// block. - void fixLCSSAPHIs(); + void fixLCSSAPHIs(VPTransformState &State, VPlan &Plan); /// Iteratively sink the scalarized operands of a predicated instruction into /// the block that was created for it. @@ -3958,7 +3963,8 @@ } } -void InnerLoopVectorizer::fixVectorizedLoop() { +void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, + VPlan &Plan) { // Insert truncates and extends for any truncated instructions as hints to // InstCombine. if (VF.isVector()) @@ -3968,14 +3974,14 @@ if (OrigPHIsToFix.size()) { assert(EnableVPlanNativePath && "Unexpected non-induction PHIs for fixup in non VPlan-native path"); - fixNonInductionPHIs(); + fixNonInductionPHIs(State, Plan); } // At this point every instruction in the original loop is widened to a // vector form. Now we need to fix the recurrences in the loop. These PHI // nodes are currently empty because we did not want to introduce cycles. // This is the second stage of vectorizing recurrences. - fixCrossIterationPHIs(); + fixCrossIterationPHIs(State, Plan); // Forget the original basic block. PSE.getSE()->forgetLoop(OrigLoop); @@ -3986,7 +3992,7 @@ getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), IVEndValues[Entry.first], LoopMiddleBlock); - fixLCSSAPHIs(); + fixLCSSAPHIs(State, Plan); for (Instruction *PI : PredicatedInstructions) sinkScalarOperands(&*PI); @@ -4011,7 +4017,8 @@ LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); } -void InnerLoopVectorizer::fixCrossIterationPHIs() { +void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State, + VPlan &Plan) { // In order to support recurrences we need to be able to vectorize Phi nodes. // Phi nodes have cycles, so we need to vectorize them in two stages. This is // stage #2: We now need to fix the recurrences by adding incoming edges to @@ -4021,13 +4028,15 @@ for (PHINode &Phi : OrigLoop->getHeader()->phis()) { // Handle first-order recurrences and reductions that need to be fixed. if (Legal->isFirstOrderRecurrence(&Phi)) - fixFirstOrderRecurrence(&Phi); + fixFirstOrderRecurrence(&Phi, State, Plan); else if (Legal->isReductionVariable(&Phi)) - fixReduction(&Phi); + fixReduction(&Phi, State, Plan); } } -void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { +void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi, + VPTransformState &State, + VPlan &Plan) { // This is the second phase of vectorizing first-order recurrences. An // overview of the transformation is described below. Suppose we have the // following loop. @@ -4095,10 +4104,11 @@ Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); } + VPValue *PhiDef = Plan.getVPValue(Phi); + VPValue *PreviousDef = Plan.getVPValue(Previous); // We constructed a temporary phi node in the first phase of vectorization. // This phi node will eventually be deleted. - Builder.SetInsertPoint( - cast(VectorLoopValueMap.getVectorValue(Phi, 0))); + Builder.SetInsertPoint(cast(State.get(PhiDef, 0))); // Create a phi node for the new recurrence. The current value will either be // the initial value inserted into a vector or loop-varying vector value. @@ -4107,7 +4117,7 @@ // Get the vectorized previous value of the last part UF - 1. It appears last // among all unrolled iterations, due to the order of their construction. - Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); + Value *PreviousLastPart = State.get(PreviousDef, UF - 1); // Find and set the insertion point after the previous value if it is an // instruction. @@ -4145,15 +4155,15 @@ // Shuffle the current and previous vector and update the vector parts. for (unsigned Part = 0; Part < UF; ++Part) { - Value *PreviousPart = getOrCreateVectorValue(Previous, Part); - Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); + Value *PreviousPart = State.get(PreviousDef, Part); + Value *PhiPart = State.get(PhiDef, Part); auto *Shuffle = VF.isVector() ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) : Incoming; PhiPart->replaceAllUsesWith(Shuffle); cast(PhiPart)->eraseFromParent(); - VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); + State.reset(PhiDef, Phi, Shuffle, Part); Incoming = PreviousPart; } @@ -4184,7 +4194,7 @@ // `Incoming`. This is analogous to the vectorized case above: extracting the // second last element when VF > 1. else if (UF > 1) - ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); + ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); // Fix the initial value of the original recurrence in the scalar loop. Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); @@ -4212,7 +4222,8 @@ LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); } -void InnerLoopVectorizer::fixReduction(PHINode *Phi) { +void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State, + VPlan &Plan) { // Get it's reduction variable descriptor. assert(Legal->isReductionVariable(Phi) && "Unable to find the reduction variable"); @@ -4224,8 +4235,9 @@ setDebugLocFromInst(Builder, ReductionStartValue); bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi); + VPValue *LoopExitInstDef = Plan.getVPValue(LoopExitInst); // This is the vector-clone of the value that leaves the loop. - Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); + Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); // Wrap flags are in general invalid after vectorization, clear them. clearReductionWrapFlags(RdxDesc); @@ -4238,8 +4250,8 @@ Value *LoopVal = Phi->getIncomingValueForBlock(Latch); for (unsigned Part = 0; Part < UF; ++Part) { - Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); - Value *Val = getOrCreateVectorValue(LoopVal, Part); + Value *VecRdxPhi = State.get(Plan.getVPValue(Phi), Part); + Value *Val = State.get(Plan.getVPValue(LoopVal), Part); cast(VecRdxPhi) ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); } @@ -4258,8 +4270,7 @@ // be predicated, and does not need to be handled here. if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { for (unsigned Part = 0; Part < UF; ++Part) { - Value *VecLoopExitInst = - VectorLoopValueMap.getVectorValue(LoopExitInst, Part); + Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); Value *Sel = nullptr; for (User *U : VecLoopExitInst->users()) { if (isa(U)) { @@ -4269,7 +4280,7 @@ assert(isa(U) && "Reduction exit must feed Phi's or select"); } assert(Sel && "Reduction exit feeds no select"); - VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); + State.reset(LoopExitInstDef, LoopExitInst, Sel, Part); // If the target can create a predicated operator for the reduction at no // extra cost in the loop (for example a predicated vadd), it can be @@ -4281,7 +4292,7 @@ TTI->preferPredicatedReductionSelect( RdxDesc.getOpcode(), Phi->getType(), TargetTransformInfo::ReductionFlags())) { - auto *VecRdxPhi = cast(getOrCreateVectorValue(Phi, Part)); + auto *VecRdxPhi = cast(State.get(Plan.getVPValue(Phi), Part)); VecRdxPhi->setIncomingValueForBlock( LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); } @@ -4299,7 +4310,7 @@ LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); VectorParts RdxParts(UF); for (unsigned Part = 0; Part < UF; ++Part) { - RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); + RdxParts[Part] = State.get(LoopExitInstDef, Part); Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) : Builder.CreateZExt(Trunc, VecTy); @@ -4315,12 +4326,12 @@ Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); for (unsigned Part = 0; Part < UF; ++Part) { RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); - VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); + State.reset(LoopExitInstDef, LoopExitInst, RdxParts[Part], Part); } } // Reduce all of the unrolled parts into a single vector. - Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); + Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); unsigned Op = RecurrenceDescriptor::getOpcode(RK); // The middle block terminator has already been assigned a DebugLoc here (the @@ -4332,7 +4343,7 @@ // accidentally cause an extra step back into the loop while debugging. setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); for (unsigned Part = 1; Part < UF; ++Part) { - Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); + Value *RdxPart = State.get(LoopExitInstDef, Part); if (Op != Instruction::ICmp && Op != Instruction::FCmp) // Floating point operations had to be 'fast' to enable the reduction. ReducedPartRdx = addFastMathFlag( @@ -4417,7 +4428,7 @@ } } -void InnerLoopVectorizer::fixLCSSAPHIs() { +void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State, VPlan &Plan) { for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) // Some phis were already hand updated by the reduction and recurrence @@ -4438,7 +4449,10 @@ // extracted from the vectorized loop. Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); Value *lastIncomingValue = - getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); + isa(IncomingValue) && + OrigLoop->contains(cast(IncomingValue)) + ? State.get(Plan.getVPValue(IncomingValue), {UF - 1, LastLane}) + : IncomingValue; LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); } } @@ -4507,10 +4521,10 @@ } while (Changed); } -void InnerLoopVectorizer::fixNonInductionPHIs() { +void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State, + VPlan &Plan) { for (PHINode *OrigPhi : OrigPHIsToFix) { - PHINode *NewPhi = - cast(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); + PHINode *NewPhi = cast(State.get(Plan.getVPValue(OrigPhi), 0)); unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); SmallVector ScalarBBPredecessors( @@ -7777,7 +7791,7 @@ // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. - ILV.fixVectorizedLoop(); + ILV.fixVectorizedLoop(State, *VPlans.front()); ILV.printDebugTracesAtEnd(); } @@ -9258,6 +9272,12 @@ ILV->setVectorValue(IRDef, Part, V); } +void VPTransformState::reset(VPValue *Def, Value *IRDef, Value *V, + unsigned Part) { + set(Def, V, Part); + ILV->resetVectorValue(IRDef, Part, V); +} + Value *VPTransformState::get(VPValue *Def, unsigned Part) { // If Values have been set for this Def return the one relevant for \p Part. if (hasVectorValue(Def, Part)) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -308,6 +308,7 @@ Data.PerPartOutput[Def][Part] = V; } void set(VPValue *Def, Value *IRDef, Value *V, unsigned Part); + void reset(VPValue *Def, Value *IRDef, Value *V, unsigned Part); void set(VPValue *Def, Value *IRDef, Value *V, const VPIteration &Instance); void set(VPValue *Def, Value *V, const VPIteration &Instance) {