Index: include/llvm/Transforms/Utils/LoopVersioning.h =================================================================== --- include/llvm/Transforms/Utils/LoopVersioning.h +++ include/llvm/Transforms/Utils/LoopVersioning.h @@ -41,7 +41,7 @@ /// object having no checks and we expect the user to add them. LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI, DominatorTree *DT, ScalarEvolution *SE, - bool UseLAIChecks = true); + bool UseLAIChecks = true, bool AssumeLoopSimplifyForm = false); /// \brief Performs the CFG manipulation part of versioning the loop including /// the DominatorTree and LoopInfo updates. @@ -146,6 +146,9 @@ LoopInfo *LI; DominatorTree *DT; ScalarEvolution *SE; + // Flag to allow non simplify form loops in LoopVersioning. + // Minimal expectation is Loop should have a pre header & single exit block. + bool AssumeLoopSimplifyForm; }; } Index: lib/Transforms/Utils/LoopVersioning.cpp =================================================================== --- lib/Transforms/Utils/LoopVersioning.cpp +++ lib/Transforms/Utils/LoopVersioning.cpp @@ -32,11 +32,14 @@ LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI, DominatorTree *DT, ScalarEvolution *SE, - bool UseLAIChecks) + bool UseLAIChecks, bool AssumeLoopSimplifyForm) : VersionedLoop(L), NonVersionedLoop(nullptr), LAI(LAI), LI(LI), DT(DT), - SE(SE) { + SE(SE), AssumeLoopSimplifyForm(AssumeLoopSimplifyForm) { assert(L->getExitBlock() && "No single exit block"); - assert(L->isLoopSimplifyForm() && "Loop is not in loop-simplify form"); + if (AssumeLoopSimplifyForm) + assert(L->getLoopPreheader() && "No preheader in the loop"); + else + assert(L->isLoopSimplifyForm() && "Loop is not in loop-simplify form"); if (UseLAIChecks) { setAliasChecks(LAI.getRuntimePointerChecking()->getChecks()); setSCEVChecks(LAI.getPSE().getUnionPredicate()); Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -215,6 +215,14 @@ cl::desc("The maximum number of SCEV checks allowed with a " "vectorize(enable) pragma")); +static cl::opt EnableEpilogVectorization( + "enable-epilog-vectorization", cl::init(false), cl::Hidden, + cl::desc("Enable vectorization of epilog scalar loop.")); + +static cl::opt MinWidthEpilogVectorization( + "min-width-epilog-vectorization", cl::init(16), cl::Hidden, + cl::desc("Minimum vector loop width to enable epilog vectorization.")); + /// Create an analysis remark that explains why vectorization failed /// /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p @@ -406,20 +414,33 @@ const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, unsigned VecWidth, unsigned UnrollFactor, LoopVectorizationLegality *LVL, - LoopVectorizationCostModel *CM) + LoopVectorizationCostModel *CM, + unsigned EpilogVectorLoopWidth = 1, + std::function *GetLAA = nullptr, + AliasAnalysis *AA = nullptr) : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), - AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), + AA(AA), AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Induction(nullptr), OldInduction(nullptr), VectorLoopValueMap(UnrollFactor, VecWidth), TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM), - AddedSafetyChecks(false) {} + AddedSafetyChecks(false), EpilogVectorLoopWidth(EpilogVectorLoopWidth), + ScalarResumeValue(nullptr), AliasCheckResult(nullptr), GetLAA(GetLAA) {} // Perform the actual loop widening (vectorization). - void vectorize() { + void vectorize(Value *RemainderItrs = nullptr, + Value *EpilogResumeValue = nullptr, + PHINode *ACR = nullptr, + PHINode *EpilogInduction = nullptr) { + AliasCheckResult = ACR; // Create a new empty loop. Unlink the old loop and connect the new one. - createEmptyLoop(); + createEmptyLoop(RemainderItrs, EpilogResumeValue, + EpilogInduction); // Widen each instruction in the old loop to a new one in the new loop. vectorizeLoop(); + // If multi version is required, generate next VF nersion code. + if (canCreateVectorEpilog()) + createVectorEpilog(); + } // Return true if any runtime check is added. @@ -448,7 +469,9 @@ EdgeMaskCache; /// Create an empty loop, based on the loop ranges of the old loop. - void createEmptyLoop(); + void createEmptyLoop(Value *RemainderItrs = nullptr, + Value *EpilogResumeValue = nullptr, + PHINode *EpilogInduction = nullptr); /// Set up the values of the IVs correctly when exiting the vector loop. void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, @@ -584,19 +607,24 @@ Value *getOrCreateTripCount(Loop *NewLoop); /// Returns (and creates if needed) the trip count of the widened loop. - Value *getOrCreateVectorTripCount(Loop *NewLoop); + Value *getOrCreateVectorTripCount(Loop *NewLoop, bool ReCreateVecTC = false); /// Emit a bypass check to see if the trip count would overflow, or we /// wouldn't have enough iterations to execute one vector loop. - void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); + void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass, + unsigned NewVF = 0, + Value *KnownTripCount = nullptr); /// Emit a bypass check to see if the vector trip count is nonzero. - void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass); + void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass, + bool ReCreateVecTC = false); + /// Emit a bypass check to see if all of the SCEV assumptions we've /// had to make are correct. void emitSCEVChecks(Loop *L, BasicBlock *Bypass); /// Emit bypass checks to check any memory assumptions we may have made. - void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); - + BasicBlock* emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); + /// Emit bypass checks by checking runtime alias result. + BasicBlock* emitAliasResultCheck(Loop *L, BasicBlock *Bypass, Value *ACR); /// Add additional metadata to \p To that was not present on \p Orig. /// /// Currently this is used to add the noalias annotations based on the @@ -611,6 +639,34 @@ /// loop. void addMetadata(Instruction *To, Instruction *From); + /// Returns true if epilog vectorization is legal. + /// Legality is already performed on the loop while first vector + /// loop generation, now checking minimal stuff required for versioning. + bool canCreateVectorEpilog() { + // Epilog vectorization option should be enabled. + if (!EnableEpilogVectorization) + return false; + // Epilog vector should have a valid vector width. + if (EpilogVectorLoopWidth == 1) + return false; + // Loop should have exit block. + if (!OrigLoop->getExitBlock()) + return false; + // Loop should have pre header. + if (!OrigLoop->getLoopPreheader()) + return false; + return true; + } + + // Transform for vector multi versioning. + void createVectorEpilog(); + + // Create alias check result phinode. + PHINode* createAliasCheckResultPHI(); + + // Checks feasiblity of vector multi versioning. + bool canCreateEpilogVectorLoop(LoopVectorizationLegality &LVL); + /// \brief Similar to the previous function but it adds the metadata to a /// vector of instructions. void addMetadata(ArrayRef To, Instruction *From); @@ -737,12 +793,12 @@ LoopInfo *LI; /// Dominator Tree. DominatorTree *DT; - /// Alias Analysis. - AliasAnalysis *AA; /// Target Library Info. const TargetLibraryInfo *TLI; /// Target Transform Info. const TargetTransformInfo *TTI; + /// Alias Analysis. + AliasAnalysis *AA; /// Assumption Cache. AssumptionCache *AC; /// Interface to emit optimization remarks. @@ -781,6 +837,8 @@ BasicBlock *LoopVectorBody; /// The scalar loop body. BasicBlock *LoopScalarBody; + /// Alias check block. + BasicBlock* AliasCheckBlock; /// A list of all bypass blocks. The first block is the entry of the loop. SmallVector LoopBypassBlocks; @@ -824,6 +882,14 @@ // Holds the end values for each induction variable. We save the end values // so we can later fix-up the external users of the induction variables. DenseMap IVEndValues; + // Epilog vector loop width. + unsigned EpilogVectorLoopWidth; + /// Scalar block resume value PHINode. + PHINode *ScalarResumeValue; + /// Alias check result. + PHINode *AliasCheckResult; + /// Loop access analysis. + std::function *GetLAA; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -1581,7 +1647,7 @@ public: LoopVectorizationLegality( Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT, - TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F, + const TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F, const TargetTransformInfo *TTI, std::function *GetLAA, LoopInfo *LI, OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R, @@ -1797,7 +1863,7 @@ /// unrolling. PredicatedScalarEvolution &PSE; /// Target Library Info. - TargetLibraryInfo *TLI; + const TargetLibraryInfo *TLI; /// Target Transform Info const TargetTransformInfo *TTI; /// Dominator Tree. @@ -1871,6 +1937,9 @@ struct VectorizationFactor { unsigned Width; // Vector width with best cost unsigned Cost; // Cost of the loop with that width + VectorizationFactor() : Width(1), Cost(0) {} + VectorizationFactor(unsigned Width, unsigned Cost) + : Width(Width), Cost(Cost) {} }; /// \return The most profitable vectorization factor and the cost of that VF. /// This method checks every power of two up to VF. If UserVF is not ZERO @@ -1878,6 +1947,11 @@ /// possible. VectorizationFactor selectVectorizationFactor(bool OptForSize); + /// \return The next profitable vector factor after the input vector factor + /// If there is not any next profitable vector factor return VectorizationFactor + /// with scalar as next possible factor. + VectorizationFactor identifyNextProfitableVF(const unsigned IVF); + /// \return The size (in bits) of the smallest and widest types in the code /// that needs to be vectorized. We ignore values that remain scalar such as /// 64 bit loop indices. @@ -2198,6 +2272,8 @@ SmallPtrSet ValuesToIgnore; /// Values to ignore in the cost model when VF > 1. SmallPtrSet VecValuesToIgnore; + /// Profitable vector factors. + SmallVector ProfitableVF; }; /// \brief This holds vectorization requirements that must be verified late in @@ -3159,6 +3235,130 @@ VectorLoopValueMap.initScalar(Instr, Entry); } +/// This function will generate PHINode for alias result check in original +/// scalar loop's preheader block. +PHINode* InnerLoopVectorizer::createAliasCheckResultPHI() { + // Return nullptr if no alias check created previously. + if (!AliasCheckBlock) + return nullptr; + auto &M = *LoopScalarPreHeader->getTerminator()->getFunction()->getParent(); + auto &Ctx = M.getContext(); + // Create PHINode for alias check result. + PHINode* ACR = + PHINode::Create(Type::getInt1Ty(Ctx), 3, "acr.val", ScalarResumeValue); + // From middle block set true incoming value. + ACR->addIncoming(Builder.getInt1(true), LoopMiddleBlock); + // From bypassing blocks set false incoming value. + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + ACR->addIncoming(Builder.getInt1(false), LoopBypassBlocks[I]); + return ACR; +} + +/// Returns true if its feasible to generate vector multi version, else false. +bool InnerLoopVectorizer::canCreateEpilogVectorLoop(LoopVectorizationLegality &LVL) { + // Loop bypass blocks should not be empty. + if (!LoopBypassBlocks.size()) { + DEBUG(dbgs() << "Loop bypass blocks are empty !\n"); + return false; + } + // Unique induction phi node should exist. + if (!ScalarResumeValue) { + DEBUG(dbgs() << "Unique induction phi node missing !\n"); + return false; + } + // Original Trip count & resume value type should be same. + if (getOrCreateTripCount(OrigLoop)->getType() != ScalarResumeValue->getType()) { + DEBUG(dbgs() << "Type mismatch for resume & tripcount.\n"); + return false; + } + // Old Induction variable should exist. + if (!OldInduction) { + DEBUG(dbgs() << "Scalar loop does not has explicit induction variable.\n"); + return false; + } + // Should be legal to vectorize. + if (!LVL.canVectorize()) { + DEBUG(dbgs() << "MVer-LV: Not vectorizing: Cannot prove legality.\n"); + return false; + } + return true; +} + +/// This function will vectorize epilog by generating epilog vector version. +/// In this function we generate a new loop. The new loop will contain +/// the vectorized instructions with new vector factor. +/// +/// [ ] <-- original loop iteration number check. +/// / | +/// / v +/// | [ ] <-- original vector loop bypass (may consist of multiple blocks +/// | / | having checks like alias checks & scev checks.) +/// | / v +/// || [ ] <-- original vector pre header. +/// |/ | +/// | v +/// | [ ] \ +/// | [ ]_| <-- original vector loop. +/// | | +/// | v +/// | -[ ] <--- original middle-block. +/// | / | +/// | / v +/// |----[ ] <-- new version loop iteration number check. +/// | / | +/// |/ v +/// / [ ] <-- Previously executed alias result check. +/// || / | +/// ||/ v +/// |/ [ ] <-- new vector version pre header. +/// || | +/// || v +/// || [ ] \ +/// || [ ]_| <-- new vector loop. +/// || | +/// || v +/// || -[ ] <--- new middle-block. +/// || / | +/// | / v +/// -|- >[ ] <--- scalar preheader. +/// | | +/// | v +/// | [ ] \ +/// | [ ]_| <-- scalar loop to handle remainder. +/// \ | +/// \ v +/// >[ ] <-- exit block. +/// ... +/// +void InnerLoopVectorizer::createVectorEpilog() { + // Create instance of LoopVectorizationLegality. + Function *F = LoopScalarPreHeader->getParent(); + LoopVectorizationRequirements Requirements(*ORE); + LoopVectorizeHints Hints(OrigLoop, true, *ORE); + LoopVectorizationLegality MVLVL(OrigLoop, PSE, DT, TLI, AA, F, TTI, GetLAA, + LI, ORE, &Requirements, &Hints); + // Check if its legal to do vector multi versioning. + if (!canCreateEpilogVectorLoop(MVLVL)) { + DEBUG(dbgs() << "Epilog vectorization unable to prove legality !\n"); + return; + } + DEBUG(dbgs() << "Epilog vectorization is beneficial with width : " + << EpilogVectorLoopWidth << " in Function: " + << LoopScalarPreHeader->getParent()->getName() << "\n"); + // Create PHINode for alias check execution result. + AliasCheckResult = createAliasCheckResultPHI(); + // Identify remaining number of iterations. + IRBuilder<> ResumeBuilder(&*LoopScalarPreHeader->getTerminator()); + Value *RemainderItrs = ResumeBuilder.CreateSub(getOrCreateTripCount(OrigLoop), + ScalarResumeValue, "rem.itrs"); + // Create instance of inner loop vectorizer & vectorize. + InnerLoopVectorizer LB(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + EpilogVectorLoopWidth, 1, &MVLVL, Cost); + LB.vectorize(RemainderItrs, ScalarResumeValue, + AliasCheckResult, OldInduction); +} + + PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, Value *End, Value *Step, Instruction *DL) { @@ -3236,8 +3436,9 @@ return TripCount; } -Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { - if (VectorTripCount) +Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L, + bool ReCreateVecTC) { + if (!ReCreateVecTC && VectorTripCount) return VectorTripCount; Value *TC = getOrCreateTripCount(L); @@ -3268,16 +3469,20 @@ return VectorTripCount; } -void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, - BasicBlock *Bypass) { - Value *Count = getOrCreateTripCount(L); +void InnerLoopVectorizer::emitMinimumIterationCountCheck( + Loop *L, BasicBlock *Bypass, unsigned NewVF, Value *KnownTripCount) { + // If its a known trip count then use it. + Value *Count = KnownTripCount ? KnownTripCount : getOrCreateTripCount(L); + // Minimum iteration count. + unsigned MinItrsCount = NewVF ? NewVF : (VF * UF); BasicBlock *BB = L->getLoopPreheader(); IRBuilder<> Builder(BB->getTerminator()); // Generate code to check that the loop's trip count that we computed by // adding one to the backedge-taken count will not overflow. Value *CheckMinIters = Builder.CreateICmpULT( - Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check"); + Count, ConstantInt::get(Count->getType(), MinItrsCount), + "min.iters.check"); BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "min.iters.checked"); @@ -3293,8 +3498,9 @@ } void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L, - BasicBlock *Bypass) { - Value *TC = getOrCreateVectorTripCount(L); + BasicBlock *Bypass, + bool ReCreateVecTC) { + Value *TC = getOrCreateVectorTripCount(L, ReCreateVecTC); BasicBlock *BB = L->getLoopPreheader(); IRBuilder<> Builder(BB->getTerminator()); @@ -3347,7 +3553,8 @@ AddedSafetyChecks = true; } -void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { +BasicBlock* InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, + BasicBlock *Bypass) { BasicBlock *BB = L->getLoopPreheader(); // Generate the code that checks in runtime if arrays overlap. We put the @@ -3358,7 +3565,7 @@ std::tie(FirstCheckInst, MemRuntimeCheck) = Legal->getLAI()->addRuntimeChecks(BB->getTerminator()); if (!MemRuntimeCheck) - return; + return nullptr; // Create a new block containing the memory check. BB->setName("vector.memcheck"); @@ -3377,11 +3584,44 @@ // We currently don't use LoopVersioning for the actual loop cloning but we // still use it to add the noalias metadata. LVer = llvm::make_unique(*Legal->getLAI(), OrigLoop, LI, DT, - PSE.getSE()); + PSE.getSE(), true, true); + LVer->prepareNoAliasMetadata(); + // Return alias check basic block. + return BB; +} + +/// Generate a check for alias check result, if alias check is executed +/// with no-alias result then jump to epilog loop min iteration check +/// else jump to scalar loop. +BasicBlock *InnerLoopVectorizer::emitAliasResultCheck(Loop *L, + BasicBlock *Bypass, + Value *ACR) { + + BasicBlock *BB = L->getLoopPreheader(); + // Create a new block containing the memory check. + BB->setName("alias.result.chk"); + auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); + // Update dominator tree immediately if the generated block is a + // LoopBypassBlock because SCEV expansions to generate loop bypass + // checks may query it before the current function is finished. + DT->addNewBlock(NewBB, BB); + if (L->getParentLoop()) + L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); + ReplaceInstWithInst(BB->getTerminator(), + BranchInst::Create(NewBB, Bypass, ACR)); + LoopBypassBlocks.push_back(BB); + + // We currently don't use LoopVersioning for the actual loop cloning but we + // still use it to add the noalias metadata. + LVer = llvm::make_unique(*Legal->getLAI(), OrigLoop, LI, DT, + PSE.getSE(), true, true); LVer->prepareNoAliasMetadata(); + return BB; } -void InnerLoopVectorizer::createEmptyLoop() { +void InnerLoopVectorizer::createEmptyLoop(Value *RemainderItrs, + Value *EpilogResumeValue, + PHINode *EpilogInduction) { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the @@ -3431,7 +3671,8 @@ // - counts from zero, stepping by one // - is the size of the widest induction variable type // then we create a new one. - OldInduction = Legal->getPrimaryInduction(); + OldInduction = + EpilogInduction ? EpilogInduction : Legal->getPrimaryInduction(); Type *IdxTy = Legal->getWidestInductionType(); // Split the single block loop into the two loop structure described above. @@ -3462,14 +3703,24 @@ Value *StartIdx = ConstantInt::get(IdxTy, 0); - // We need to test whether the backedge-taken count is uint##_max. Adding one - // to it will cause overflow and an incorrect loop trip count in the vector - // body. In case of overflow we want to directly jump to the scalar remainder - // loop. - emitMinimumIterationCountCheck(Lp, ScalarPH); - // Now, compare the new count to zero. If it is zero skip the vector loop and - // jump to the scalar loop. - emitVectorLoopEnteredCheck(Lp, ScalarPH); + if (RemainderItrs) { + // Create minimum iteration check against remaining iterations. + emitMinimumIterationCountCheck(Lp, ScalarPH, VF, RemainderItrs); + // Now, compare the new count to zero. + // If it is zero skip the vector loop and jump to the scalar loop. + // Enfore recreation of vector trip count. + emitVectorLoopEnteredCheck(Lp, ScalarPH, true); + } else { + // We need to test whether the backedge-taken count is uint##_max. + // Adding one to it will cause overflow and an incorrect loop trip count + // in the vector body. In case of overflow we want to directly jump to the + // scalar remainder loop. + emitMinimumIterationCountCheck(Lp, ScalarPH); + // Now, compare the new count to zero. If it is zero skip the vector loop + // and jump to the scalar loop. + emitVectorLoopEnteredCheck(Lp, ScalarPH); + } + // Generate the code to check any assumptions that we've made for SCEV // expressions. emitSCEVChecks(Lp, ScalarPH); @@ -3477,17 +3728,27 @@ // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. - emitMemRuntimeChecks(Lp, ScalarPH); - + // If alias check is already executed decide based on its result, + // else generate runtime alias check. + BasicBlock *RuntimeCheckBlock = nullptr; + if (AliasCheckResult) { + RuntimeCheckBlock = emitAliasResultCheck(Lp, ScalarPH, AliasCheckResult); + } else { + RuntimeCheckBlock = emitMemRuntimeChecks(Lp, ScalarPH); + } // Generate the induction variable. // The loop step is equal to the vectorization factor (num of SIMD elements) // times the unroll factor (num of SIMD instructions). Value *CountRoundDown = getOrCreateVectorTripCount(Lp); Constant *Step = ConstantInt::get(IdxTy, VF * UF); - Induction = - createInductionVariable(Lp, StartIdx, CountRoundDown, Step, - getDebugLocFromInstOrOperands(OldInduction)); - + if (EpilogResumeValue) { + Induction = createInductionVariable(Lp, EpilogResumeValue, + CountRoundDown, Step, + getDebugLocFromInstOrOperands(OldInduction)); + } else { + Induction = createInductionVariable(Lp, StartIdx, CountRoundDown, Step, + getDebugLocFromInstOrOperands(OldInduction)); + } // We are going to resume the execution of the scalar loop. // Go over all of the induction variables that we found and fix the // PHIs that are left in the scalar version of the loop. @@ -3499,6 +3760,7 @@ // This variable saves the new starting index for the scalar loop. It is used // to test if there are any tail iterations left once the vector loop has // completed. + ScalarResumeValue = nullptr; LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); for (auto &InductionEntry : *List) { PHINode *OrigPhi = InductionEntry.first; @@ -3531,11 +3793,22 @@ // The old induction's phi node in the scalar body needs the truncated // value. - for (BasicBlock *BB : LoopBypassBlocks) - BCResumeVal->addIncoming(II.getStartValue(), BB); + if (EpilogResumeValue) { + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + BCResumeVal->addIncoming(EpilogResumeValue, LoopBypassBlocks[I]); + } else { + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + BCResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[I]); + } OrigPhi->setIncomingValue(BlockIdx, BCResumeVal); + ScalarResumeValue = BCResumeVal; } + // NOTE: Currently multiple induction variables not supported as multiple + // induction variable incurs multiple checks which may not be profitable. + if (List->size() > 1) + ScalarResumeValue = nullptr; + // Add a check in the middle block to see if we have completed // all of the iterations in the first vector loop. // If (N - N%VF) == N, then we *don't* need to run the remainder. @@ -3555,6 +3828,7 @@ LoopExitBlock = ExitBlock; LoopVectorBody = VecBody; LoopScalarBody = OldBasicBlock; + AliasCheckBlock = RuntimeCheckBlock; // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). @@ -5025,8 +5299,10 @@ PSE.getSE()->forgetLoop(OrigLoop); // Update the dominator tree information. - assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && - "Entry does not dominate exit."); + // With epilog vectorization not all loop bypass blocks dominates exit blocks. + if (!EnableEpilogVectorization) + assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && + "Entry does not dominate exit."); // We don't predicate stores by this point, so the vector body should be a // single loop. @@ -5035,7 +5311,14 @@ DT->addNewBlock(LoopMiddleBlock, LoopVectorBody); DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); - DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); + // Change immedeate dominator only when: + // 1) When epilog vectorization is disabled. + // 2) When epilog vectorization is enabled and EpilogVectorLoopWidth + // is not 1, this is when generating original vector version. + // EpilogVectorLoopWidth will be 1 while generating epilog vector + // version. + if (!EnableEpilogVectorization || EpilogVectorLoopWidth != 1) + DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); DEBUG(DT->verifyDomTree()); } @@ -6128,6 +6411,31 @@ } } +/// Identify epilog vector loop width from profitable VF's. +LoopVectorizationCostModel::VectorizationFactor +LoopVectorizationCostModel::identifyNextProfitableVF(const unsigned Width) { + // Find next VF. + unsigned VF = 1; + unsigned Cost = 0; + bool hasMultipleInductionVariables = (Legal->getInductionVars()->size() != 1); + // Check the loop vector width should be more than min width threshold. + if (Width < MinWidthEpilogVectorization) + return VectorizationFactor(VF, Cost); + // Return if epilog vectorization option not provided. + // NOTE: Currently multiple induction variables not supported as multiple + // induction variable incurs multiple checks which may not be profitable. + if (!EnableEpilogVectorization || hasMultipleInductionVariables) + return VectorizationFactor(VF, Cost); + // Identify next profitable VF. + for (auto &NextVF : ProfitableVF) + // NOTE: VF's in ProfitableVF is in accesding order. + if ((NextVF.Width < Width) && (VF == 1 || NextVF.Cost < Cost)) { + VF = NextVF.Width; + Cost = NextVF.Cost; + } + return VectorizationFactor(VF, Cost); +} + LoopVectorizationCostModel::VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { // Width 1 means no vectorize @@ -6276,6 +6584,9 @@ << " because it will not generate any vector instructions.\n"); continue; } + // If profitable add it to ProfitableVF list. + if (VectorCost < ScalarCost) + ProfitableVF.push_back(VectorizationFactor(i, VectorCost)); if (VectorCost < Cost) { Cost = VectorCost; Width = i; @@ -7671,6 +7982,9 @@ } using namespace ore; + // Identify multi version loop width + const LoopVectorizationCostModel::VectorizationFactor EpilogVF = + CM.identifyNextProfitableVF(VF.Width); if (!VectorizeLoop) { assert(IC > 1 && "interleave count should not be 1 or 0"); // If we decided that it is not legal to vectorize the loop, then @@ -7686,7 +8000,7 @@ } else { // If we decided that it is *legal* to vectorize the loop, then do it. InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, - &LVL, &CM); + &LVL, &CM, EpilogVF.Width, GetLAA, AA); LB.vectorize(); ++LoopsVectorized;