diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -27,6 +27,8 @@ #include "VPlan.h" #include "llvm/Support/InstructionCost.h" +class GeneratedRTChecks; + namespace llvm { class LoopInfo; @@ -188,12 +190,16 @@ /// Cost of the loop with that width. InstructionCost Cost; - VectorizationFactor(ElementCount Width, InstructionCost Cost) - : Width(Width), Cost(Cost) {} + /// Cost of the scalar loop. + InstructionCost ScalarCost; + + VectorizationFactor(ElementCount Width, InstructionCost Cost, + InstructionCost ScalarCost) + : Width(Width), Cost(Cost), ScalarCost(ScalarCost) {} /// Width 1 means no vectorization, cost 0 means uncomputed cost. static VectorizationFactor Disabled() { - return {ElementCount::getFixed(1), 0}; + return {ElementCount::getFixed(1), 0, 0}; } bool operator==(const VectorizationFactor &rhs) const { @@ -288,7 +294,8 @@ /// Plan how to best vectorize, return the best VF and its cost, or None if /// vectorization and interleaving should be avoided up front. - Optional plan(ElementCount UserVF, unsigned UserIC); + Optional plan(ElementCount UserVF, unsigned UserIC, + GeneratedRTChecks &Checks); /// Use the VPlan-native path to plan how to best vectorize, return the best /// VF and its cost. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -422,7 +422,6 @@ return None; } -// Forward declare GeneratedRTChecks. class GeneratedRTChecks; namespace llvm { @@ -1645,6 +1644,17 @@ Scalars.clear(); } + /// The vectorization cost is a combination of the cost itself and a boolean + /// indicating whether any of the contributing operations will actually + /// operate on vector values after type legalization in the backend. If this + /// latter value is false, then all operations will be scalarized (i.e. no + /// vectorization has actually taken place). + using VectorizationCostTy = std::pair; + + /// Returns the execution time cost of an instruction for a given vector + /// width. Vector width of one means scalar. + VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); + private: unsigned NumPredStores = 0; @@ -1680,13 +1690,6 @@ /// of elements. ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); - /// The vectorization cost is a combination of the cost itself and a boolean - /// indicating whether any of the contributing operations will actually - /// operate on vector values after type legalization in the backend. If this - /// latter value is false, then all operations will be scalarized (i.e. no - /// vectorization has actually taken place). - using VectorizationCostTy = std::pair; - /// Returns the expected execution cost. The unit of the cost does /// not matter because we use the 'cost' units to compare different /// vector widths. The cost that is returned is *not* normalized by @@ -1698,10 +1701,6 @@ expectedCost(ElementCount VF, SmallVectorImpl *Invalid = nullptr); - /// Returns the execution time cost of an instruction for a given vector - /// width. Vector width of one means scalar. - VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); - /// The cost-computation logic from getInstructionCost which provides /// the vector type as an output parameter. InstructionCost getInstructionCost(Instruction *I, ElementCount VF, @@ -5331,7 +5330,8 @@ assert(VFCandidates.count(ElementCount::getFixed(1)) && "Expected Scalar VF to be a candidate"); - const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); + const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, + ExpectedCost); VectorizationFactor ChosenFactor = ScalarCost; bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; @@ -5349,7 +5349,7 @@ continue; VectorizationCostTy C = expectedCost(i, &InvalidCosts); - VectorizationFactor Candidate(i, C.first); + VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); #ifndef NDEBUG unsigned AssumedMinimumVscale = 1; @@ -5542,7 +5542,7 @@ LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); if (LVP.hasPlanWithVF(ForcedEC)) - return {ForcedEC, 0}; + return {ForcedEC, 0, 0}; else { LLVM_DEBUG( dbgs() @@ -7462,7 +7462,7 @@ if (VPlanBuildStressTest) return VectorizationFactor::Disabled(); - return {VF, 0 /*Cost*/}; + return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; } LLVM_DEBUG( @@ -7480,7 +7480,8 @@ } Optional -LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { +LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC, + GeneratedRTChecks &Checks) { assert(OrigLoop->isInnermost() && "Inner loop expected."); FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. @@ -7513,7 +7514,7 @@ CM.collectInLoopReductions(); buildVPlansWithVPRecipes(UserVF, UserVF); LLVM_DEBUG(printPlans(dbgs())); - return {{UserVF, 0}}; + return {{UserVF, 0, 0}}; } else reportVectorizationInfo("UserVF ignored because of invalid costs.", "InvalidCost", ORE, OrigLoop); @@ -10477,8 +10478,12 @@ ElementCount UserVF = Hints.getWidth(); unsigned UserIC = Hints.getInterleave(); + // Optimistically generate runtime checks. Drop them if they turn out to not + // be profitable. + GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, + F->getParent()->getDataLayout()); // Plan how to best vectorize, return the best VF and its cost. - Optional MaybeVF = LVP.plan(UserVF, UserIC); + Optional MaybeVF = LVP.plan(UserVF, UserIC, Checks); VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; @@ -10499,6 +10504,11 @@ VF = *MaybeVF; // Select the interleave count. IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); + + unsigned SelectedIC = std::max(IC, UserIC); + // Generate runtime checks if they are needed. + if (VF.Width.isVector() || SelectedIC > 1) + Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); } // Identify the diagnostic messages that should be produced. @@ -10586,14 +10596,6 @@ bool DisableRuntimeUnroll = false; MDNode *OrigLoopID = L->getLoopID(); { - // Optimistically generate runtime checks. Drop them if they turn out to not - // be profitable. Limit the scope of Checks, so the cleanup happens - // immediately after vector codegeneration is done. - GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, - F->getParent()->getDataLayout()); - if (!VF.Width.isScalar() || IC > 1) - Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, IC); - using namespace ore; if (!VectorizeLoop) { assert(IC > 1 && "interleave count should not be 1 or 0");