Index: llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -228,11 +228,11 @@ /// Plan how to best vectorize, return the best VF and its cost, or None if /// vectorization and interleaving should be avoided up front. - Optional plan(bool OptForSize, unsigned UserVF); + Optional plan(unsigned UserVF); /// Use the VPlan-native path to plan how to best vectorize, return the best /// VF and its cost. - VectorizationFactor planInVPlanNativePath(bool OptForSize, unsigned UserVF); + VectorizationFactor planInVPlanNativePath(unsigned UserVF); /// Finalize the best decision and dispose of all other VPlans. void setBestPlan(unsigned VF, unsigned UF); Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -836,6 +836,14 @@ namespace llvm { +// Loop vectorization cost-model hints how the scalar epilogue loop should be +// lowered. +enum ScalarEpilogueLowering { + CM_ScalarEpilogueAllowed, + CM_ScalarEpilogueNotAllowedOptSize, + CM_ScalarEpilogueNotAllowedLowTripLoop +}; + /// LoopVectorizationCostModel - estimates the expected speedups due to /// vectorization. /// In many cases vectorization is not profitable. This can happen because of @@ -845,7 +853,8 @@ /// different operations. class LoopVectorizationCostModel { public: - LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE, + LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, + PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, @@ -853,12 +862,13 @@ OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI) - : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), - AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {} + : IsScalarEpilogueAllowed(SEL), TheLoop(L), PSE(PSE), + LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), + TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {} /// \return An upper bound for the vectorization factor, or None if /// vectorization and interleaving should be avoided up front. - Optional computeMaxVF(bool OptForSize); + Optional computeMaxVF(); /// \return The most profitable vectorization factor and the cost of that VF. /// This method checks every power of two up to MaxVF. If UserVF is not ZERO @@ -881,8 +891,7 @@ /// If interleave count has been specified by metadata it will be returned. /// Otherwise, the interleave count is computed and returned. VF and LoopCost /// are the selected vectorization factor and the cost of the selected VF. - unsigned selectInterleaveCount(bool OptForSize, unsigned VF, - unsigned LoopCost); + unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost); /// Memory access instruction may be vectorized in more than one way. /// Form of instruction after vectorization depends on cost. @@ -1157,11 +1166,13 @@ /// to handle accesses with gaps, and there is nothing preventing us from /// creating a scalar epilogue. bool requiresScalarEpilogue() const { - return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue(); + return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue(); } /// Returns true if a scalar epilogue is not allowed due to optsize. - bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; } + bool isScalarEpilogueAllowed() const { + return IsScalarEpilogueAllowed == CM_ScalarEpilogueAllowed; + } /// Returns true if all loop blocks should be masked to fold tail loop. bool foldTailByMasking() const { return FoldTailByMasking; } @@ -1187,7 +1198,7 @@ /// \return An upper bound for the vectorization factor, larger than zero. /// One is returned if vectorization should best be avoided due to cost. - unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount); + unsigned computeFeasibleMaxVF(unsigned ConstTripCount); /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually @@ -1270,13 +1281,13 @@ SmallPtrSet PredicatedBBsAfterVectorization; /// Records whether it is allowed to have the original scalar loop execute at - /// least once. This may be needed as a fallback loop in case runtime + /// least once. This may be needed as a fallback loop in case runtime /// aliasing/dependence checks fail, or to handle the tail/remainder /// iterations when the trip count is unknown or doesn't divide by the VF, /// or as a peel-loop to handle gaps in interleave-groups. /// Under optsize and when the trip count is very small we don't allow any /// iterations to execute in the scalar loop. - bool IsScalarEpilogueAllowed = true; + ScalarEpilogueLowering IsScalarEpilogueAllowed = CM_ScalarEpilogueAllowed; /// All blocks of loop are to be masked to fold tail of scalar iterations. bool FoldTailByMasking = false; @@ -4428,10 +4439,10 @@ // Check if masking is required. // A Group may need masking for one of two reasons: it resides in a block that // needs predication, or it was decided to use masking to deal with gaps. - bool PredicatedAccessRequiresMasking = + bool PredicatedAccessRequiresMasking = Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); - bool AccessWithGapsRequiresMasking = - Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed; + bool AccessWithGapsRequiresMasking = + Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) return true; @@ -4651,7 +4662,7 @@ Uniforms[VF].insert(Worklist.begin(), Worklist.end()); } -Optional LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { +Optional LoopVectorizationCostModel::computeMaxVF() { if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { // TODO: It may by useful to do since it's still likely to be dynamically // uniform if the target can skip. @@ -4666,8 +4677,11 @@ } unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); - if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize. - return computeFeasibleMaxVF(OptForSize, TC); + if (isScalarEpilogueAllowed()) + return computeFeasibleMaxVF(TC); + + LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue.\n" << + "LV: Performing code size checks.\n"); if (Legal->getRuntimePointerChecking()->Need) { ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") @@ -4716,15 +4730,13 @@ // Record that scalar epilogue is not allowed. LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); - IsScalarEpilogueAllowed = !OptForSize; - // We don't create an epilogue when optimizing for size. // Invalidate interleave groups that require an epilogue if we can't mask // the interleave-group. - if (!useMaskedInterleavedAccesses(TTI)) + if (!useMaskedInterleavedAccesses(TTI)) InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); - unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC); + unsigned MaxVF = computeFeasibleMaxVF(TC); if (TC > 0 && TC % MaxVF == 0) { LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); @@ -4755,8 +4767,7 @@ } unsigned -LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, - unsigned ConstTripCount) { +LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -4794,8 +4805,8 @@ } unsigned MaxVF = MaxVectorSize; - if (TTI.shouldMaximizeVectorBandwidth(OptForSize) || - (MaximizeBandwidth && !OptForSize)) { + if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || + (MaximizeBandwidth && isScalarEpilogueAllowed())) { // Collect all viable vectorization factors larger than the default MaxVF // (i.e. MaxVectorSize). SmallVector VFs; @@ -4934,8 +4945,7 @@ return {MinWidth, MaxWidth}; } -unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, - unsigned VF, +unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, unsigned LoopCost) { // -- The interleave heuristics -- // We interleave the loop in order to expose ILP and reduce the loop overhead. @@ -4951,8 +4961,7 @@ // 3. We don't interleave if we think that we will spill registers to memory // due to the increased register pressure. - // When we optimize for size, we don't interleave. - if (OptForSize) + if (!isScalarEpilogueAllowed()) return 1; // We used the distance for the interleave count. @@ -5611,8 +5620,8 @@ } // Calculate the cost of the whole interleaved group. - bool UseMaskForGaps = - Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed; + bool UseMaskForGaps = + Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); unsigned Cost = TTI.getInterleavedMemoryOpCost( I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps); @@ -6150,8 +6159,7 @@ } VectorizationFactor -LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize, - unsigned UserVF) { +LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { unsigned VF = UserVF; // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. @@ -6190,10 +6198,9 @@ return VectorizationFactor::Disabled(); } -Optional LoopVectorizationPlanner::plan(bool OptForSize, - unsigned UserVF) { +Optional LoopVectorizationPlanner::plan(unsigned UserVF) { assert(OrigLoop->empty() && "Inner loop expected."); - Optional MaybeMaxVF = CM.computeMaxVF(OptForSize); + Optional MaybeMaxVF = CM.computeMaxVF(); if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. return None; @@ -7196,8 +7203,15 @@ assert(EnableVPlanNativePath && "VPlan-native path is disabled."); Function *F = L->getHeader()->getParent(); InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); - LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, - &Hints, IAI); + + ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; + if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && + (F->hasOptSize() || + llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) + SEL = CM_ScalarEpilogueNotAllowedOptSize; + + LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, + DB, AC, ORE, F, &Hints, IAI); // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. Turn CM into an // optional argument if we don't need it in the future. @@ -7206,15 +7220,8 @@ // Get user vectorization factor. const unsigned UserVF = Hints.getWidth(); - // Check the function attributes and profiles to find out if this function - // should be optimized for size. - bool OptForSize = - Hints.getForce() != LoopVectorizeHints::FK_Enabled && - (F->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)); - // Plan how to best vectorize, return the best VF and its cost. - const VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF); + const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); // If we are stress testing VPlan builds, do not attempt to generate vector // code. Masked vector code generation support will follow soon. @@ -7293,10 +7300,11 @@ // Check the function attributes and profiles to find out if this function // should be optimized for size. - bool OptForSize = - Hints.getForce() != LoopVectorizeHints::FK_Enabled && + ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; + if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && (F->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)); + llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) + SEL = CM_ScalarEpilogueNotAllowedOptSize; // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. They may require CFG and instruction level transformations before @@ -7348,7 +7356,7 @@ // Loops with a very small trip count are considered for vectorization // under OptForSize, thereby making sure the cost of their loop body is // dominant, free of runtime guards and scalar iteration overheads. - OptForSize = true; + SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; } } @@ -7394,8 +7402,8 @@ } // Use the cost model. - LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, - &Hints, IAI); + LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, + DB, AC, ORE, F, &Hints, IAI); CM.collectValuesToIgnore(); // Use the planner for vectorization. @@ -7405,7 +7413,7 @@ unsigned UserVF = Hints.getWidth(); // Plan how to best vectorize, return the best VF and its cost. - Optional MaybeVF = LVP.plan(OptForSize, UserVF); + Optional MaybeVF = LVP.plan(UserVF); VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; @@ -7414,7 +7422,7 @@ if (MaybeVF) { VF = *MaybeVF; // Select the interleave count. - IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); + IC = CM.selectInterleaveCount(VF.Width, VF.Cost); } // Identify the diagnostic messages that should be produced.