Index: llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -228,11 +228,11 @@ /// Plan how to best vectorize, return the best VF and its cost, or None if /// vectorization and interleaving should be avoided up front. - Optional<VectorizationFactor> plan(bool OptForSize, unsigned UserVF); + Optional<VectorizationFactor> plan(unsigned UserVF); /// Use the VPlan-native path to plan how to best vectorize, return the best /// VF and its cost. - VectorizationFactor planInVPlanNativePath(bool OptForSize, unsigned UserVF); + VectorizationFactor planInVPlanNativePath(unsigned UserVF); /// Finalize the best decision and dispose of all other VPlans. void setBestPlan(unsigned VF, unsigned UF); Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -845,7 +845,8 @@ /// different operations. 
class LoopVectorizationCostModel { public: - LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE, + LoopVectorizationCostModel(bool IsEpilogueAllowed, Loop *L, + PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, @@ -853,12 +854,13 @@ OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI) - : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), - AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {} + : IsScalarEpilogueAllowed(IsEpilogueAllowed), TheLoop(L), PSE(PSE), + LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), + TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {} /// \return An upper bound for the vectorization factor, or None if /// vectorization and interleaving should be avoided up front. - Optional<unsigned> computeMaxVF(bool OptForSize); + Optional<unsigned> computeMaxVF(); /// \return The most profitable vectorization factor and the cost of that VF. /// This method checks every power of two up to MaxVF. If UserVF is not ZERO @@ -881,8 +883,7 @@ /// If interleave count has been specified by metadata it will be returned. /// Otherwise, the interleave count is computed and returned. VF and LoopCost /// are the selected vectorization factor and the cost of the selected VF. - unsigned selectInterleaveCount(bool OptForSize, unsigned VF, - unsigned LoopCost); + unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost); /// Memory access instruction may be vectorized in more than one way. /// Form of instruction after vectorization depends on cost. @@ -1187,7 +1188,7 @@ /// \return An upper bound for the vectorization factor, larger than zero. /// One is returned if vectorization should best be avoided due to cost. 
- unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount); + unsigned computeFeasibleMaxVF(unsigned ConstTripCount); /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually @@ -4651,7 +4652,7 @@ Uniforms[VF].insert(Worklist.begin(), Worklist.end()); } -Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { +Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() { if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { // TODO: It may by useful to do since it's still likely to be dynamically // uniform if the target can skip. @@ -4666,8 +4667,11 @@ } unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); - if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize. - return computeFeasibleMaxVF(OptForSize, TC); + if (IsScalarEpilogueAllowed) + return computeFeasibleMaxVF(TC); + + LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue.\n" << + "LV: Performing code size checks.\n"); if (Legal->getRuntimePointerChecking()->Need) { ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") @@ -4676,7 +4680,7 @@ "compiling with -Os/-Oz"); LLVM_DEBUG( dbgs() - << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n"); + << "LV: Aborting. Runtime ptr check is required.\n"); return None; } @@ -4687,7 +4691,7 @@ "compiling with -Os/-Oz"); LLVM_DEBUG( dbgs() - << "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n"); + << "LV: Aborting. Runtime SCEV check is required.\n"); return None; } @@ -4713,18 +4717,13 @@ return None; } - // Record that scalar epilogue is not allowed. - LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); - - IsScalarEpilogueAllowed = !OptForSize; - // We don't create an epilogue when optimizing for size. // Invalidate interleave groups that require an epilogue if we can't mask // the interleave-group. 
- if (!useMaskedInterleavedAccesses(TTI)) + if (!useMaskedInterleavedAccesses(TTI)) InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); - unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC); + unsigned MaxVF = computeFeasibleMaxVF(TC); if (TC > 0 && TC % MaxVF == 0) { LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); @@ -4755,8 +4754,7 @@ } unsigned -LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, - unsigned ConstTripCount) { +LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -4794,8 +4792,8 @@ } unsigned MaxVF = MaxVectorSize; - if (TTI.shouldMaximizeVectorBandwidth(OptForSize) || - (MaximizeBandwidth && !OptForSize)) { + if (TTI.shouldMaximizeVectorBandwidth(!IsScalarEpilogueAllowed) || + (MaximizeBandwidth && IsScalarEpilogueAllowed)) { // Collect all viable vectorization factors larger than the default MaxVF // (i.e. MaxVectorSize). SmallVector VFs; @@ -4934,8 +4932,7 @@ return {MinWidth, MaxWidth}; } -unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, - unsigned VF, +unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, unsigned LoopCost) { // -- The interleave heuristics -- // We interleave the loop in order to expose ILP and reduce the loop overhead. @@ -4951,8 +4948,7 @@ // 3. We don't interleave if we think that we will spill registers to memory // due to the increased register pressure. - // When we optimize for size, we don't interleave. - if (OptForSize) + if (!IsScalarEpilogueAllowed) return 1; // We used the distance for the interleave count. 
@@ -6150,8 +6146,7 @@ } VectorizationFactor -LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize, - unsigned UserVF) { +LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { unsigned VF = UserVF; // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. @@ -6190,10 +6185,9 @@ return VectorizationFactor::Disabled(); } -Optional<VectorizationFactor> LoopVectorizationPlanner::plan(bool OptForSize, - unsigned UserVF) { +Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) { assert(OrigLoop->empty() && "Inner loop expected."); - Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize); + Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(); if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. return None; @@ -7196,8 +7190,13 @@ assert(EnableVPlanNativePath && "VPlan-native path is disabled."); Function *F = L->getHeader()->getParent(); InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); - LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, - &Hints, IAI); + + bool IsEpilogueAllowed = + Hints.getForce() == LoopVectorizeHints::FK_Enabled || + (!F->hasOptSize() && !llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)); + + LoopVectorizationCostModel CM(IsEpilogueAllowed, L, PSE, LI, LVL, *TTI, TLI, + DB, AC, ORE, F, &Hints, IAI); // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. Turn CM into an // optional argument if we don't need it in the future. @@ -7206,15 +7205,8 @@ // Get user vectorization factor. const unsigned UserVF = Hints.getWidth(); - // Check the function attributes and profiles to find out if this function - // should be optimized for size. - bool OptForSize = - Hints.getForce() != LoopVectorizeHints::FK_Enabled && - (F->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)); - // Plan how to best vectorize, return the best VF and its cost. 
- const VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF); + const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); // If we are stress testing VPlan builds, do not attempt to generate vector // code. Masked vector code generation support will follow soon. @@ -7293,10 +7285,9 @@ - // Check the function attributes and profiles to find out if this function - // should be optimized for size. - bool OptForSize = - Hints.getForce() != LoopVectorizeHints::FK_Enabled && - (F->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)); + // Check the hints, function attributes and profiles to find out whether a + // scalar epilogue is allowed (not when optimizing for size, unless forced). + bool IsEpilogueAllowed = + Hints.getForce() == LoopVectorizeHints::FK_Enabled || + (!F->hasOptSize() && !llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)); // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. They may require CFG and instruction level transformations before @@ -7348,7 +7339,7 @@ // Loops with a very small trip count are considered for vectorization // under OptForSize, thereby making sure the cost of their loop body is // dominant, free of runtime guards and scalar iteration overheads. - OptForSize = true; + IsEpilogueAllowed = false; } } @@ -7394,8 +7385,8 @@ } // Use the cost model. - LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, - &Hints, IAI); + LoopVectorizationCostModel CM(IsEpilogueAllowed, L, PSE, LI, &LVL, *TTI, TLI, + DB, AC, ORE, F, &Hints, IAI); CM.collectValuesToIgnore(); // Use the planner for vectorization. @@ -7405,7 +7396,7 @@ unsigned UserVF = Hints.getWidth(); // Plan how to best vectorize, return the best VF and its cost. - Optional<VectorizationFactor> MaybeVF = LVP.plan(OptForSize, UserVF); + Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF); VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; @@ -7414,7 +7405,7 @@ if (MaybeVF) { VF = *MaybeVF; // Select the interleave count. 
- IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); + IC = CM.selectInterleaveCount(VF.Width, VF.Cost); } // Identify the diagnostic messages that should be produced.