diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8572,6 +8572,28 @@ return EdgeMaskCache[Edge] = EdgeMask; } +/// Return the induction VPValue for \p VPBB, creating and caching it on first +/// use: the primary induction when Legal reports one, otherwise a new +/// VPWidenCanonicalIVRecipe inserted at the builder's current insert point. +VPValue *VPRecipeBuilder::getOrCreateIV(VPBasicBlock *VPBB, VPlanPtr &Plan) { + IVCacheTy::iterator IVEntryIt = IVCache.find(VPBB); + if (IVEntryIt != IVCache.end()) + return IVEntryIt->second; + + VPValue *IV = nullptr; + if (Legal->getPrimaryInduction()) + IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); + else { + // The recipe is placed through the builder, so the cache key must be the + // block the builder is currently positioned in. + assert(VPBB == Builder.getInsertBlock() && "IV block must be insert block"); + auto *IVRecipe = new VPWidenCanonicalIVRecipe(); + Builder.getInsertBlock()->insert(IVRecipe, Builder.getInsertPoint()); + IV = IVRecipe->getVPSingleValue(); + } + return IVCache[VPBB] = IV; +} + VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); @@ -8596,14 +8618,7 @@ // Introduce the early-exit compare IV <= BTC to form header block mask. // This is used instead of IV < TC because TC may wrap, unlike BTC. // Start by constructing the desired canonical IV. 
- VPValue *IV = nullptr; - if (Legal->getPrimaryInduction()) - IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); - else { - auto IVRecipe = new VPWidenCanonicalIVRecipe(); - Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); - IV = IVRecipe->getVPSingleValue(); - } + VPValue *IV = getOrCreateIV(Builder.getInsertBlock(), Plan); VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); bool TailFolded = !CM.isScalarEpilogueAllowed(); diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -51,6 +51,11 @@ EdgeMaskCacheTy EdgeMaskCache; BlockMaskCacheTy BlockMaskCache; + /// Map each VPBasicBlock to the canonical vector induction VPValue inserted + /// for that block, or to the primary induction if one exists. + using IVCacheTy = DenseMap<VPBasicBlock *, VPValue *>; + IVCacheTy IVCache; + // VPlan-VPlan transformations support: Hold a mapping from ingredients to // their recipe. To save on memory, only do so for selected ingredients, // marked by having a nullptr entry in this map. @@ -103,6 +108,9 @@ /// Return a VPRecipeOrValueTy with VPRecipeBase * being set. This can be used to force the use as VPRecipeBase* for recipe sub-types that also inherit from VPValue. VPRecipeOrVPValueTy toVPRecipeResult(VPRecipeBase *R) const { return R; } + /// Get the cached induction VPValue for \p VPBB, creating it on first use. + VPValue *getOrCreateIV(VPBasicBlock *VPBB, VPlanPtr &Plan); + public: VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI, LoopVectorizationLegality *Legal, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2139,6 +2139,16 @@ /// the tail. VPValue *BackedgeTakenCount = nullptr; + /// Represents the trip count of the original loop, for computing EVL. 
+ VPValue *TripCount = nullptr; + + /// Represents the runtime VF. Some recipes, like Vector Predicated recipes, + /// may use the runtime VF as an operand. At plan construction time this value + /// is known to be loop invariant, but the corresponding IR value only becomes + /// available at plan execution, once the final VF and corresponding plan are + /// chosen. + VPValue *RuntimeVF = nullptr; + /// Holds a mapping between Values and their corresponding VPValue inside /// VPlan. Value2VPValueTy Value2VPValue; @@ -2168,6 +2178,10 @@ delete VPV; if (BackedgeTakenCount) delete BackedgeTakenCount; + if (TripCount) + delete TripCount; + if (RuntimeVF) + delete RuntimeVF; for (VPValue *Def : VPExternalDefs) delete Def; } @@ -2191,6 +2205,21 @@ return BackedgeTakenCount; } + /// The trip count of the original loop; created on first request. + VPValue *getOrCreateTripCount() { + if (!TripCount) + TripCount = new VPValue(); + return TripCount; + } + + /// A VPValue representing the loop-invariant runtime VF, to be expanded at + /// plan execution. + VPValue *getOrCreateRuntimeVF() { + if (!RuntimeVF) + RuntimeVF = new VPValue(); + return RuntimeVF; + } + void addVF(ElementCount VF) { VFs.insert(VF); } bool hasVF(ElementCount VF) { return VFs.count(VF); } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -754,10 +754,26 @@ /// LoopVectorBody basic-block was created for this. Introduce additional /// basic-blocks as needed, and fill them all. void VPlan::execute(VPTransformState *State) { + IRBuilder<> Builder(State->CFG.PrevBB->getTerminator()); + + // -3. Check if the trip count is needed, and if so set it for every part. + if (TripCount && TripCount->getNumUsers()) { + Value *TC = State->TripCount; + for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) + State->set(TripCount, TC, Part); + } + + // -2. Set the runtime VF if it is needed. 
+ if (RuntimeVF && RuntimeVF->getNumUsers()) { + Value *RuntimeVFVal = + getRuntimeVF(Builder, Builder.getInt32Ty(), State->VF); + for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) + State->set(RuntimeVF, RuntimeVFVal, Part); + } + // -1. Check if the backedge taken count is needed, and if so build it. if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) { Value *TC = State->TripCount; - IRBuilder<> Builder(State->CFG.PrevBB->getTerminator()); auto *TCMO = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1), "trip.count.minus.1"); auto VF = State->VF; @@ -926,6 +942,16 @@ Plan.BackedgeTakenCount->print(OS, SlotTracker); OS << " := BackedgeTakenCount"; } + if (Plan.TripCount) { + OS << "\\n"; + Plan.TripCount->print(OS, SlotTracker); + OS << " := TripCount"; + } + if (Plan.RuntimeVF) { + OS << "\\n"; + Plan.RuntimeVF->print(OS, SlotTracker); + OS << " := RuntimeVF"; + } OS << "\"]\n"; OS << "node [shape=rect, fontname=Courier, fontsize=30]\n"; OS << "edge [fontname=Courier, fontsize=30]\n"; @@ -1359,6 +1385,12 @@ if (Plan.BackedgeTakenCount) assignSlot(Plan.BackedgeTakenCount); + if (Plan.TripCount) + assignSlot(Plan.TripCount); + + if (Plan.RuntimeVF) + assignSlot(Plan.RuntimeVF); + ReversePostOrderTraversal< VPBlockRecursiveTraversalWrapper> RPOT(VPBlockRecursiveTraversalWrapper(