Index: include/llvm/Transforms/Utils/LoopVersioning.h =================================================================== --- include/llvm/Transforms/Utils/LoopVersioning.h +++ include/llvm/Transforms/Utils/LoopVersioning.h @@ -41,7 +41,7 @@ /// object having no checks and we expect the user to add them. LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI, DominatorTree *DT, ScalarEvolution *SE, - bool UseLAIChecks = true); + bool UseLAIChecks = true, bool AssumeLoopSimplifyForm = false); /// \brief Performs the CFG manipulation part of versioning the loop including /// the DominatorTree and LoopInfo updates. @@ -146,6 +146,9 @@ LoopInfo *LI; DominatorTree *DT; ScalarEvolution *SE; + // Flag to allow non simplify form loops in LoopVersioning. + // Minimal expectation is Loop should have a pre header & single exit block. + bool AssumeLoopSimplifyForm; }; } Index: lib/Transforms/Utils/LoopVersioning.cpp =================================================================== --- lib/Transforms/Utils/LoopVersioning.cpp +++ lib/Transforms/Utils/LoopVersioning.cpp @@ -32,11 +32,14 @@ LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI, DominatorTree *DT, ScalarEvolution *SE, - bool UseLAIChecks) + bool UseLAIChecks, bool AssumeLoopSimplifyForm) : VersionedLoop(L), NonVersionedLoop(nullptr), LAI(LAI), LI(LI), DT(DT), - SE(SE) { + SE(SE), AssumeLoopSimplifyForm(AssumeLoopSimplifyForm) { assert(L->getExitBlock() && "No single exit block"); - assert(L->isLoopSimplifyForm() && "Loop is not in loop-simplify form"); + if (AssumeLoopSimplifyForm) + assert(L->getLoopPreheader() && "No preheader in the loop"); + else + assert(L->isLoopSimplifyForm() && "Loop is not in loop-simplify form"); if (UseLAIChecks) { setAliasChecks(LAI.getRuntimePointerChecking()->getChecks()); setSCEVChecks(LAI.getPSE().getUnionPredicate()); Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -215,6 +215,14 @@ cl::desc("The maximum number of SCEV checks allowed with a " "vectorize(enable) pragma")); +static cl::opt EnableEpilogVectorization( + "enable-epilog-vectorization", cl::init(false), cl::Hidden, + cl::desc("Enable vectorization of epilog scalar loop.")); + +static cl::opt MinWidthEpilogVectorization( + "min-width-epilog-vectorization", cl::init(16), cl::Hidden, + cl::desc("Minimum vector loop width to enable epilog vectorization.")); + /// Create an analysis remark that explains why vectorization failed /// /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p @@ -386,6 +394,67 @@ : ConstantFP::get(Ty, C); } +class EpilogLoopInfo { +public: + EpilogLoopInfo(Value *RemainderItrs, Value *EpilogResumeValue, + PHINode *EpilogInduction, PHINode *ACR) + : RemainderItrs(RemainderItrs), EpilogResumeValue(EpilogResumeValue), + EpilogInduction(EpilogInduction), ACR(ACR) {} + + EpilogLoopInfo() + : RemainderItrs(nullptr), EpilogResumeValue(nullptr), + EpilogInduction(nullptr), ACR(nullptr) {} + + Value *getRemainderItrs() { return RemainderItrs; } + Value *getEpilogResumeValue() { return EpilogResumeValue; } + PHINode *getEpilogInduction() { return EpilogInduction; } + PHINode *getAliasCheckResult() { return ACR; } + +private: + Value *RemainderItrs; + Value *EpilogResumeValue; + PHINode *EpilogInduction; + PHINode *ACR; +}; + +class EpilogVectorizationInfo { +public: + EpilogVectorizationInfo() + : ELI(nullptr), EpilogVectorLoopWidth(1), EpilogVecProfitable(false) {} + EpilogVectorizationInfo(EpilogLoopInfo *ELI, unsigned Width, bool EVP) + : ELI(ELI), EpilogVectorLoopWidth(Width), EpilogVecProfitable(EVP) {} + + void setEpilogVectorizationInfo(unsigned V, bool _EpilogVecProfitable) { + EpilogVectorLoopWidth = V; + EpilogVecProfitable = _EpilogVecProfitable; + } + + unsigned getEpilogVectorLoopWidth() { return EpilogVectorLoopWidth; } + + Value *getRemainderItrs() { return ELI ? ELI->getRemainderItrs() : nullptr; } + + Value *getEpilogResumeValue() { + return ELI ? ELI->getEpilogResumeValue() : nullptr; + } + + PHINode *getEpilogInduction() { + return ELI ? ELI->getEpilogInduction() : nullptr; + } + + PHINode *getAliasCheckResult() { + return ELI ? ELI->getAliasCheckResult() : nullptr; + } + + bool hasELI() { return ELI ? true : false; } + + bool isEpilogVecProfitable() { return EpilogVecProfitable; } + +private: + EpilogLoopInfo *ELI; + unsigned EpilogVectorLoopWidth; + bool EpilogVecProfitable; +}; + /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple @@ -408,13 +477,14 @@ const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, unsigned VecWidth, unsigned UnrollFactor, LoopVectorizationLegality *LVL, - LoopVectorizationCostModel *CM) + LoopVectorizationCostModel *CM, + const EpilogVectorizationInfo &EVI) : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Induction(nullptr), OldInduction(nullptr), VectorLoopValueMap(UnrollFactor, VecWidth), TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM), - AddedSafetyChecks(false) {} + AddedSafetyChecks(false), ScalarResumeValue(nullptr), EVI(EVI) {} // Perform the actual loop widening (vectorization). void vectorize() { @@ -422,6 +492,9 @@ createEmptyLoop(); // Widen each instruction in the old loop to a new one in the new loop. vectorizeLoop(); + // Check for epilog vectorization feasiblity: + if (canCreateVectorEpilog()) + createVectorEpilog(); } // Return true if any runtime check is added. @@ -588,7 +661,7 @@ Value *getOrCreateTripCount(Loop *NewLoop); /// Returns (and creates if needed) the trip count of the widened loop. - Value *getOrCreateVectorTripCount(Loop *NewLoop); + Value *getOrCreateVectorTripCount(Loop *NewLoop, bool ReCreateVecTC = false); /// Emit a bypass check to see if the trip count would overflow, or we /// wouldn't have enough iterations to execute one vector loop. @@ -599,8 +672,9 @@ /// had to make are correct. void emitSCEVChecks(Loop *L, BasicBlock *Bypass); /// Emit bypass checks to check any memory assumptions we may have made. - void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); - + BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); + /// Emit bypass checks by checking runtime alias result. + BasicBlock *emitAliasResultCheck(Loop *L, BasicBlock *Bypass); /// Add additional metadata to \p To that was not present on \p Orig. /// /// Currently this is used to add the noalias annotations based on the @@ -615,6 +689,45 @@ /// loop. void addMetadata(Instruction *To, Instruction *From); + /// Returns true if epilog vectorization is legal & profitable. + /// Legality is already performed on the loop while first vector + /// loop generation, now checking minimal stuff required for epilog + /// vectorization. + bool canCreateVectorEpilog() { + // Epilog vectorization option should be enabled. + if (!EnableEpilogVectorization) + return false; + // Identified epilog vector width should be a valid vector width. + if (EVI.getEpilogVectorLoopWidth() == 1) + return false; + // Loop should have exit block. + if (!OrigLoop->getExitBlock()) + return false; + // Loop should have pre header. + if (!OrigLoop->getLoopPreheader()) + return false; + // Unique induction phi node should exist. + if (!ScalarResumeValue) + return false; + // Loop bypass blocks should not be empty. + if (!LoopBypassBlocks.size()) + return false; + // Original Trip count & resume value type should be same. + if (getOrCreateTripCount(OrigLoop)->getType() != + ScalarResumeValue->getType()) + return false; + // Old Induction variable should exist. + if (!OldInduction) + return false; + return true; + } + + // Transform for epilog vectorization. + void createVectorEpilog(); + + // Create alias check result phinode. + PHINode *createAliasCheckResultPHI(); + /// \brief Similar to the previous function but it adds the metadata to a /// vector of instructions. void addMetadata(ArrayRef To, Instruction *From); @@ -785,6 +898,8 @@ BasicBlock *LoopVectorBody; /// The scalar loop body. BasicBlock *LoopScalarBody; + /// Alias check block. + BasicBlock *AliasCheckBlock; /// A list of all bypass blocks. The first block is the entry of the loop. SmallVector LoopBypassBlocks; @@ -828,6 +943,10 @@ // Holds the end values for each induction variable. We save the end values // so we can later fix-up the external users of the induction variables. DenseMap IVEndValues; + /// Scalar block resume value PHINode. + PHINode *ScalarResumeValue; + // Epilog vectorization info. + EpilogVectorizationInfo EVI; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -840,7 +959,7 @@ LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1, - UnrollFactor, LVL, CM) {} + UnrollFactor, LVL, CM, EpilogVectorizationInfo()) {} private: void scalarizeInstruction(Instruction *Instr, @@ -1577,7 +1696,7 @@ public: LoopVectorizationLegality( Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT, - TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F, + const TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F, const TargetTransformInfo *TTI, std::function *GetLAA, LoopInfo *LI, OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R, @@ -1785,7 +1904,7 @@ /// unrolling. PredicatedScalarEvolution &PSE; /// Target Library Info. - TargetLibraryInfo *TLI; + const TargetLibraryInfo *TLI; /// Target Transform Info const TargetTransformInfo *TTI; /// Dominator Tree. @@ -1859,6 +1978,9 @@ struct VectorizationFactor { unsigned Width; // Vector width with best cost unsigned Cost; // Cost of the loop with that width + VectorizationFactor() : Width(1), Cost(0) {} + VectorizationFactor(unsigned Width, unsigned Cost) + : Width(Width), Cost(Cost) {} }; /// \return The most profitable vectorization factor and the cost of that VF. /// This method checks every power of two up to VF. If UserVF is not ZERO @@ -1866,6 +1988,13 @@ /// possible. VectorizationFactor selectVectorizationFactor(bool OptForSize); + /// \return The next profitable vector factor after the input vector factor + /// If there is not any next profitable vector factor return + /// VectorizationFactor + /// with scalar as next possible factor. + VectorizationFactor identifyNextProfitableVF(const unsigned IVF, + bool OptForSize); + /// \return The size (in bits) of the smallest and widest types in the code /// that needs to be vectorized. We ignore values that remain scalar such as /// 64 bit loop indices. @@ -2186,6 +2315,8 @@ SmallPtrSet ValuesToIgnore; /// Values to ignore in the cost model when VF > 1. SmallPtrSet VecValuesToIgnore; + /// Profitable vector factors. + SmallVector ProfitableVF; }; /// \brief This holds vectorization requirements that must be verified late in @@ -3188,6 +3319,90 @@ VectorLoopValueMap.initScalar(Instr, Entry); } +/// This function will generate PHINode for alias result check in original +/// scalar loop's preheader block. +PHINode *InnerLoopVectorizer::createAliasCheckResultPHI() { + // Return nullptr if no alias check created previously. + if (!AliasCheckBlock) + return nullptr; + auto &M = *LoopScalarPreHeader->getTerminator()->getFunction()->getParent(); + auto &Ctx = M.getContext(); + // Create PHINode for alias check result. + PHINode *ACR = + PHINode::Create(Type::getInt1Ty(Ctx), 3, "acr.val", ScalarResumeValue); + // From middle block set true incoming value. + ACR->addIncoming(Builder.getInt1(true), LoopMiddleBlock); + // From bypassing blocks set false incoming value. + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + ACR->addIncoming(Builder.getInt1(false), LoopBypassBlocks[I]); + return ACR; +} + +/// This function will vectorize epilog by generating epilog vector version. +/// In this function we generate a new loop. The new loop will contain +/// the vectorized instructions with new vector factor. +/// +/// [ ] <-- original loop iteration number check. +/// / | +/// / v +/// | [ ] <-- original vector loop bypass (may consist of multiple blocks +/// | / | having checks like alias checks & scev checks.) +/// | / v +/// || [ ] <-- original vector pre header. +/// |/ | +/// | v +/// | [ ] \ +/// | [ ]_| <-- original vector loop. +/// | | +/// | v +/// | -[ ] <--- original middle-block. +/// | / | +/// | / v +/// |----[ ] <-- new version loop iteration number check. +/// | / | +/// |/ v +/// / [ ] <-- Previously executed alias result check. +/// || / | +/// ||/ v +/// |/ [ ] <-- new vector version pre header. +/// || | +/// || v +/// || [ ] \ +/// || [ ]_| <-- new vector loop. +/// || | +/// || v +/// || -[ ] <--- new middle-block. +/// || / | +/// | / v +/// -|- >[ ] <--- scalar preheader. +/// | | +/// | v +/// | [ ] \ +/// | [ ]_| <-- scalar loop to handle remainder. +/// \ | +/// \ v +/// >[ ] <-- exit block. +/// ... +/// +void InnerLoopVectorizer::createVectorEpilog() { + DEBUG(dbgs() << "Epilog vectorization is beneficial with width : " + << EVI.getEpilogVectorLoopWidth() << " in Function: " + << LoopScalarPreHeader->getParent()->getName() << "\n"); + // Create PHINode for alias check execution result. + PHINode *AliasCheckResult = createAliasCheckResultPHI(); + // Identify remaining number of iterations. + IRBuilder<> ResumeBuilder(&*LoopScalarPreHeader->getTerminator()); + Value *RemainderItrs = ResumeBuilder.CreateSub(getOrCreateTripCount(OrigLoop), + ScalarResumeValue, "rem.itrs"); + // Create instance of inner loop vectorizer & vectorize. + EpilogLoopInfo NewELI(RemainderItrs, ScalarResumeValue, OldInduction, + AliasCheckResult); + InnerLoopVectorizer LB(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + EVI.getEpilogVectorLoopWidth(), /*EpilogUF*/ 1, Legal, + Cost, EpilogVectorizationInfo(&NewELI, 1, true)); + LB.vectorize(); +} + PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, Value *End, Value *Step, Instruction *DL) { @@ -3265,8 +3480,9 @@ return TripCount; } -Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { - if (VectorTripCount) +Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L, + bool ReCreateVecTC) { + if (!ReCreateVecTC && VectorTripCount) return VectorTripCount; Value *TC = getOrCreateTripCount(L); @@ -3299,7 +3515,9 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass) { - Value *Count = getOrCreateTripCount(L); + // If its a known trip count then use it. + Value *Count = + EVI.hasELI() ? EVI.getRemainderItrs() : getOrCreateTripCount(L); BasicBlock *BB = L->getLoopPreheader(); IRBuilder<> Builder(BB->getTerminator()); @@ -3323,7 +3541,7 @@ void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass) { - Value *TC = getOrCreateVectorTripCount(L); + Value *TC = getOrCreateVectorTripCount(L, (EVI.hasELI() ? true : false)); BasicBlock *BB = L->getLoopPreheader(); IRBuilder<> Builder(BB->getTerminator()); @@ -3376,7 +3594,8 @@ AddedSafetyChecks = true; } -void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { +BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, + BasicBlock *Bypass) { BasicBlock *BB = L->getLoopPreheader(); // Generate the code that checks in runtime if arrays overlap. We put the @@ -3387,7 +3606,7 @@ std::tie(FirstCheckInst, MemRuntimeCheck) = Legal->getLAI()->addRuntimeChecks(BB->getTerminator()); if (!MemRuntimeCheck) - return; + return nullptr; // Create a new block containing the memory check. BB->setName("vector.memcheck"); @@ -3406,8 +3625,38 @@ // We currently don't use LoopVersioning for the actual loop cloning but we // still use it to add the noalias metadata. LVer = llvm::make_unique(*Legal->getLAI(), OrigLoop, LI, DT, - PSE.getSE()); + PSE.getSE(), true, true); + LVer->prepareNoAliasMetadata(); + // Return alias check basic block. + return BB; +} + +/// Generate a check for alias check result, if alias check is executed +/// with no-alias result then jump to epilog loop min iteration check +/// else jump to scalar loop. +BasicBlock *InnerLoopVectorizer::emitAliasResultCheck(Loop *L, + BasicBlock *Bypass) { + + BasicBlock *BB = L->getLoopPreheader(); + // Create a new block containing the memory check. + BB->setName("alias.result.chk"); + auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); + // Update dominator tree immediately if the generated block is a + // LoopBypassBlock because SCEV expansions to generate loop bypass + // checks may query it before the current function is finished. + DT->addNewBlock(NewBB, BB); + if (L->getParentLoop()) + L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); + ReplaceInstWithInst( + BB->getTerminator(), + BranchInst::Create(NewBB, Bypass, EVI.getAliasCheckResult())); + LoopBypassBlocks.push_back(BB); + // We currently don't use LoopVersioning for the actual loop cloning but we + // still use it to add the noalias metadata. + LVer = llvm::make_unique(*Legal->getLAI(), OrigLoop, LI, DT, + PSE.getSE(), true, true); LVer->prepareNoAliasMetadata(); + return BB; } void InnerLoopVectorizer::createEmptyLoop() { @@ -3460,7 +3709,8 @@ // - counts from zero, stepping by one // - is the size of the widest induction variable type // then we create a new one. - OldInduction = Legal->getPrimaryInduction(); + OldInduction = + EVI.hasELI() ? EVI.getEpilogInduction() : Legal->getPrimaryInduction(); Type *IdxTy = Legal->getWidestInductionType(); // Split the single block loop into the two loop structure described above. @@ -3506,17 +3756,29 @@ // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. - emitMemRuntimeChecks(Lp, ScalarPH); - + // If alias check is already executed decide based on its result, + // else generate runtime alias check. + BasicBlock *RuntimeCheckBlock = nullptr; + if (EVI.getAliasCheckResult()) { + RuntimeCheckBlock = emitAliasResultCheck(Lp, ScalarPH); + } else { + RuntimeCheckBlock = emitMemRuntimeChecks(Lp, ScalarPH); + } // Generate the induction variable. // The loop step is equal to the vectorization factor (num of SIMD elements) // times the unroll factor (num of SIMD instructions). Value *CountRoundDown = getOrCreateVectorTripCount(Lp); Constant *Step = ConstantInt::get(IdxTy, VF * UF); - Induction = - createInductionVariable(Lp, StartIdx, CountRoundDown, Step, - getDebugLocFromInstOrOperands(OldInduction)); - + Value *EpilogResumeValue = EVI.getEpilogResumeValue(); + if (EpilogResumeValue) { + Induction = + createInductionVariable(Lp, EpilogResumeValue, CountRoundDown, Step, + getDebugLocFromInstOrOperands(OldInduction)); + } else { + Induction = + createInductionVariable(Lp, StartIdx, CountRoundDown, Step, + getDebugLocFromInstOrOperands(OldInduction)); + } // We are going to resume the execution of the scalar loop. // Go over all of the induction variables that we found and fix the // PHIs that are left in the scalar version of the loop. @@ -3528,6 +3790,7 @@ // This variable saves the new starting index for the scalar loop. It is used // to test if there are any tail iterations left once the vector loop has // completed. + ScalarResumeValue = nullptr; LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); for (auto &InductionEntry : *List) { PHINode *OrigPhi = InductionEntry.first; @@ -3560,11 +3823,22 @@ // The old induction's phi node in the scalar body needs the truncated // value. - for (BasicBlock *BB : LoopBypassBlocks) - BCResumeVal->addIncoming(II.getStartValue(), BB); + if (EpilogResumeValue) { + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + BCResumeVal->addIncoming(EpilogResumeValue, LoopBypassBlocks[I]); + } else { + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + BCResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[I]); + } OrigPhi->setIncomingValue(BlockIdx, BCResumeVal); + ScalarResumeValue = BCResumeVal; } + // NOTE: Currently multiple induction variables not supported as multiple + // induction variable incurs multiple checks which may not be profitable. + if (List->size() > 1) + ScalarResumeValue = nullptr; + // Add a check in the middle block to see if we have completed // all of the iterations in the first vector loop. // If (N - N%VF) == N, then we *don't* need to run the remainder. @@ -3584,6 +3858,7 @@ LoopExitBlock = ExitBlock; LoopVectorBody = VecBody; LoopScalarBody = OldBasicBlock; + AliasCheckBlock = RuntimeCheckBlock; // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). @@ -5021,8 +5296,10 @@ PSE.getSE()->forgetLoop(OrigLoop); // Update the dominator tree information. - assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && - "Entry does not dominate exit."); + // With epilog vectorization not all loop bypass blocks dominates exit blocks. + if (!EnableEpilogVectorization) + assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && + "Entry does not dominate exit."); // We don't predicate stores by this point, so the vector body should be a // single loop. @@ -5031,7 +5308,16 @@ DT->addNewBlock(LoopMiddleBlock, LoopVectorBody); DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); - DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); + // Change immediate dominator only when: + // 1) When epilog vectorization option is disabled. + // 2) When epilog vectorization option is enabled & not profitable. + // 3) When epilog vectorization option is enabled & profitable + // for the given loop, only update immediate dominator for + // first vector loop. + bool EpilogVecProfitable = EVI.isEpilogVecProfitable(); + if (!EnableEpilogVectorization || !EpilogVecProfitable || + (EpilogVecProfitable && EVI.getEpilogVectorLoopWidth() != 1)) + DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); DEBUG(DT->verifyDomTree()); } @@ -6129,6 +6415,35 @@ } } +/// Identify epilog vector loop width from profitable VF's. +LoopVectorizationCostModel::VectorizationFactor +LoopVectorizationCostModel::identifyNextProfitableVF(const unsigned Width, + bool OptForSize) { + // Find next VF. + unsigned VF = 1; + unsigned Cost = 0; + bool hasMultipleInductionVariables = (Legal->getInductionVars()->size() != 1); + // Return if epilog vectorization is disabled. + // Return if OptForSize is enabled + if (!EnableEpilogVectorization || OptForSize) + return VectorizationFactor(VF, Cost); + // NOTE: Currently multiple induction variables not supported as multiple + // induction variable incurs multiple checks which may not be profitable. + if (hasMultipleInductionVariables) + return VectorizationFactor(VF, Cost); + // Check the loop vector width should be more than min width threshold. + if (Width < MinWidthEpilogVectorization) + return VectorizationFactor(VF, Cost); + // Identify next profitable VF. + for (auto &NextVF : ProfitableVF) + // NOTE: VF's in ProfitableVF is in accesding order. + if ((NextVF.Width < Width) && (VF == 1 || NextVF.Cost < Cost)) { + VF = NextVF.Width; + Cost = NextVF.Cost; + } + return VectorizationFactor(VF, Cost); +} + LoopVectorizationCostModel::VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { // Width 1 means no vectorize @@ -6277,6 +6592,9 @@ << " because it will not generate any vector instructions.\n"); continue; } + // If profitable add it to ProfitableVF list. + if (VectorCost < ScalarCost) + ProfitableVF.push_back(VectorizationFactor(i, VectorCost)); if (VectorCost < Cost) { Cost = VectorCost; Width = i; @@ -7672,6 +7990,12 @@ } using namespace ore; + // Identify epilog vector loop width + const LoopVectorizationCostModel::VectorizationFactor EpilogVF = + CM.identifyNextProfitableVF(VF.Width, OptForSize); + // Check profitablity of epilog vectorization. + bool EpilogVecProfitable = (EpilogVF.Width == 1) ? 0 : 1; + // Switch indicates widening of epilog loop. if (!VectorizeLoop) { assert(IC > 1 && "interleave count should not be 1 or 0"); // If we decided that it is not legal to vectorize the loop, then @@ -7686,8 +8010,9 @@ << NV("InterleaveCount", IC) << ")"); } else { // If we decided that it is *legal* to vectorize the loop, then do it. - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, - &LVL, &CM); + InnerLoopVectorizer LB( + L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, &LVL, &CM, + EpilogVectorizationInfo(nullptr, EpilogVF.Width, EpilogVecProfitable)); LB.vectorize(); ++LoopsVectorized; Index: test/Transforms/LoopVectorize/epilogvec1.ll =================================================================== --- test/Transforms/LoopVectorize/epilogvec1.ll +++ test/Transforms/LoopVectorize/epilogvec1.ll @@ -0,0 +1,46 @@ +; RUN: opt < %s -S -enable-epilog-vectorization -debug-only=loop-vectorize -loop-vectorize 2>&1 | FileCheck %s +; REQUIRES: asserts +; Test to check epilog loop vectorization. +; CHECK: LV: Found a vectorizable loop (16) +; CHECK: Epilog vectorization is beneficial with width : 8 +; CHECK-LABEL: @foo( +; CHECK-LABEL: vector.body: +; CHECK: <16 x i8> +; CHECK: scalar.ph: +; CHECK-NEXT: %acr.val = phi i1 [ true, %middle.block ], [ false, %for.body.preheader ], [ false, %min.iters.checked ], [ false, %vector.memcheck ] +; CHECK-LABEL: vector.body10: +; CHECK: <8 x i8> +; CHECK-LABEL: for.body: + +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: norecurse nounwind uwtable +define void @foo(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture readonly %C, i32 %len) local_unnamed_addr #0 { +entry: + %cmp14 = icmp sgt i32 %len, 0 + br i1 %cmp14, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %len to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i8, i8* %B, i64 %indvars.iv + %0 = load i8, i8* %arrayidx, align 1 + %arrayidx2 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv + %1 = load i8, i8* %arrayidx2, align 1 + %add = add i8 %1, %0 + %arrayidx6 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv + store i8 %add, i8* %arrayidx6, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + Index: test/Transforms/LoopVectorize/epilogvec2.ll =================================================================== --- test/Transforms/LoopVectorize/epilogvec2.ll +++ test/Transforms/LoopVectorize/epilogvec2.ll @@ -0,0 +1,82 @@ +; RUN: opt < %s -S -enable-epilog-vectorization -debug-only=loop-vectorize -loop-vectorize 2>&1 | FileCheck %s +; REQUIRES: asserts +; Test to check epilog loop vectorization. +; CHECK: LV: Found a vectorizable loop (16) +; CHECK: Epilog vectorization is beneficial with width : 4 +; CHECK-LABEL: @foo( + +; CHECK-LABEL: vector.body: +; CHECK: <16 x i8> +; CHECK-LABEL: scalar.ph: +; CHECK-NEXT: %acr.val = phi i1 [ true, %middle.block ], [ false, %for.body4.preheader ], [ false, %min.iters.checked ], [ false, %vector.memcheck ] +; CHECK-NEXT: %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %for.body4.preheader ], [ 0, %min.iters.checked ], [ 0, %vector.memcheck ] +; CHECK-NEXT: %rem.itrs = sub i64 %wide.trip.count, %bc.resume.val +; CHECK-NEXT: %min.iters.check13 = icmp ult i64 %rem.itrs, 4 +; CHECK-NEXT: br i1 %min.iters.check13, label %scalar.ph12, label %min.iters.checked14 +; CHECK-LABEL: alias.result.chk: +; CHECK-NEXT: br i1 %acr.val, label %vector.ph19, label %scalar.ph12 +; CHECK-LABEL: vector.body10: +; CHECK: <4 x i8> + +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: norecurse nounwind uwtable +define void @foo(i8* nocapture %dst, i32 %d_st, i8* nocapture readonly %src1, i32 %s1_st, i8* nocapture readonly %src2, i32 %s2_st, i32 %w, i32 %h) local_unnamed_addr #0 { +entry: + %cmp32 = icmp sgt i32 %h, 0 + br i1 %cmp32, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup + +for.cond1.preheader.lr.ph: ; preds = %entry + %cmp230 = icmp sgt i32 %w, 0 + %idx.ext = sext i32 %d_st to i64 + %idx.ext12 = sext i32 %s1_st to i64 + %idx.ext14 = sext i32 %s2_st to i64 + %wide.trip.count = zext i32 %w to i64 + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond.cleanup3, %for.cond1.preheader.lr.ph + %y.036 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %inc17, %for.cond.cleanup3 ] + %dst.addr.035 = phi i8* [ %dst, %for.cond1.preheader.lr.ph ], [ %add.ptr, %for.cond.cleanup3 ] + %src1.addr.034 = phi i8* [ %src1, %for.cond1.preheader.lr.ph ], [ %add.ptr13, %for.cond.cleanup3 ] + %src2.addr.033 = phi i8* [ %src2, %for.cond1.preheader.lr.ph ], [ %add.ptr15, %for.cond.cleanup3 ] + br i1 %cmp230, label %for.body4.preheader, label %for.cond.cleanup3 + +for.body4.preheader: ; preds = %for.cond1.preheader + br label %for.body4 + +for.cond.cleanup.loopexit: ; preds = %for.cond.cleanup3 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.cond.cleanup3.loopexit: ; preds = %for.body4 + br label %for.cond.cleanup3 + +for.cond.cleanup3: ; preds = %for.cond.cleanup3.loopexit, %for.cond1.preheader + %add.ptr = getelementptr inbounds i8, i8* %dst.addr.035, i64 %idx.ext + %add.ptr13 = getelementptr inbounds i8, i8* %src1.addr.034, i64 %idx.ext12 + %add.ptr15 = getelementptr inbounds i8, i8* %src2.addr.033, i64 %idx.ext14 + %inc17 = add nuw nsw i32 %y.036, 1 + %exitcond37 = icmp eq i32 %inc17, %h + br i1 %exitcond37, label %for.cond.cleanup.loopexit, label %for.cond1.preheader + +for.body4: ; preds = %for.body4.preheader, %for.body4 + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4 ], [ 0, %for.body4.preheader ] + %arrayidx = getelementptr inbounds i8, i8* %src1.addr.034, i64 %indvars.iv + %0 = load i8, i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %arrayidx6 = getelementptr inbounds i8, i8* %src2.addr.033, i64 %indvars.iv + %1 = load i8, i8* %arrayidx6, align 1 + %conv7 = zext i8 %1 to i32 + %add = add nuw nsw i32 %conv, 1 + %add8 = add nuw nsw i32 %add, %conv7 + %shr29 = lshr i32 %add8, 1 + %conv9 = trunc i32 %shr29 to i8 + %arrayidx11 = getelementptr inbounds i8, i8* %dst.addr.035, i64 %indvars.iv + store i8 %conv9, i8* %arrayidx11, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup3.loopexit, label %for.body4 +} +