diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7962,7 +7962,6 @@
   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
   VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
-  State.TripCount = ILV.getOrCreateTripCount(nullptr);
   State.CanonicalIV = ILV.Induction;
   ILV.collectPoisonGeneratingRecipes(State);
 
@@ -7977,6 +7976,7 @@
   //===------------------------------------------------===//
 
   // 2. Copy and widen instructions from the old loop into the new loop.
+  BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), State);
   BestVPlan.execute(&State);
 
   // 3. Fix the vectorized code: take care of header phi's, live-outs,
@@ -8461,11 +8461,8 @@
     bool TailFolded = !CM.isScalarEpilogueAllowed();
 
     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
-      // While ActiveLaneMask is a binary op that consumes the loop tripcount
-      // as a second argument, we only pass the IV here and extract the
-      // tripcount from the transform state where codegen of the VP instructions
-      // happen.
-      BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
+      VPValue *TC = Plan->getOrCreateTripCount();
+      BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC});
     } else {
       VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -341,9 +341,6 @@
   /// Hold the canonical scalar IV of the vector loop (start=0, step=VF*UF).
   Value *CanonicalIV = nullptr;
 
-  /// Hold the trip count of the scalar loop.
-  Value *TripCount = nullptr;
-
   /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
   InnerLoopVectorizer *ILV;
 
@@ -2134,8 +2131,12 @@
   // (operators '==' and '<').
   SetVector<VPValue *> VPExternalDefs;
 
-  /// Represents the backedge taken count of the original loop, for folding
+  /// Represents the trip count of the original loop, for folding
   /// the tail.
+  VPValue *TripCount = nullptr;
+
+  /// Represents the backedge taken count of the original loop, for folding
+  /// the tail. It equals TripCount - 1.
   VPValue *BackedgeTakenCount = nullptr;
 
   /// Holds a mapping between Values and their corresponding VPValue inside
@@ -2169,12 +2170,17 @@
     }
     for (VPValue *VPV : VPValuesToFree)
       delete VPV;
+    if (TripCount)
+      delete TripCount;
     if (BackedgeTakenCount)
       delete BackedgeTakenCount;
     for (VPValue *Def : VPExternalDefs)
       delete Def;
   }
 
+  /// Prepare the plan for execution, setting up the required live-in values.
+  void prepareToExecute(Value *TripCount, VPTransformState &State);
+
   /// Generate the IR code for this VPlan.
   void execute(struct VPTransformState *State);
 
@@ -2187,6 +2193,13 @@
     return Entry;
   }
 
+  /// The trip count of the original loop.
+  VPValue *getOrCreateTripCount() {
+    if (!TripCount)
+      TripCount = new VPValue();
+    return TripCount;
+  }
+
   /// The backedge taken count of the original loop.
   VPValue *getOrCreateBackedgeTakenCount() {
     if (!BackedgeTakenCount)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -677,7 +677,7 @@
     // Get first lane of vector induction variable.
     Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
     // Get the original loop tripcount.
-    Value *ScalarTC = State.TripCount;
+    Value *ScalarTC = State.get(getOperand(1), Part);
 
     auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
     auto *PredTy = FixedVectorType::get(Int1Ty, State.VF.getKnownMinValue());
@@ -786,23 +786,31 @@
   FMF = FMFNew;
 }
 
-/// Generate the code inside the body of the vectorized loop. Assumes a single
-/// LoopVectorBody basic-block was created for this. Introduce additional
-/// basic-blocks as needed, and fill them all.
-void VPlan::execute(VPTransformState *State) {
-  // -1. Check if the backedge taken count is needed, and if so build it.
+void VPlan::prepareToExecute(Value *TripCountV, VPTransformState &State) {
+  // Check if the trip count is needed, and if so build it.
+  if (TripCount && TripCount->getNumUsers()) {
+    for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
+      State.set(TripCount, TripCountV, Part);
+  }
+
+  // Check if the backedge taken count is needed, and if so build it.
   if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
-    Value *TC = State->TripCount;
-    IRBuilder<> Builder(State->CFG.PrevBB->getTerminator());
-    auto *TCMO = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1),
+    IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
+    auto *TCMO = Builder.CreateSub(TripCountV,
+                                   ConstantInt::get(TripCountV->getType(), 1),
                                    "trip.count.minus.1");
-    auto VF = State->VF;
+    auto VF = State.VF;
     Value *VTCMO =
         VF.isScalar() ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast");
-    for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part)
-      State->set(BackedgeTakenCount, VTCMO, Part);
+    for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
+      State.set(BackedgeTakenCount, VTCMO, Part);
   }
+}
 
+/// Generate the code inside the body of the vectorized loop. Assumes a single
+/// LoopVectorBody basic-block was created for this. Introduce additional
+/// basic-blocks as needed, and fill them all.
+void VPlan::execute(VPTransformState *State) {
   // 0. Set the reverse mapping from VPValues to Values for code generation.
   for (auto &Entry : Value2VPValue)
     State->VPValue2Value[Entry.second] = Entry.first;