diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -26,6 +26,8 @@ #include "VPlan.h" +class GeneratedRTChecks; + namespace llvm { class LoopInfo; @@ -183,12 +185,16 @@ /// Cost of the loop with that width. InstructionCost Cost; - VectorizationFactor(ElementCount Width, InstructionCost Cost) - : Width(Width), Cost(Cost) {} + /// Cost of the scalar loop. + InstructionCost ScalarCost; + + VectorizationFactor(ElementCount Width, InstructionCost Cost, + InstructionCost ScalarCost) + : Width(Width), Cost(Cost), ScalarCost(ScalarCost) {} /// Width 1 means no vectorization, cost 0 means uncomputed cost. static VectorizationFactor Disabled() { - return {ElementCount::getFixed(1), 0}; + return {ElementCount::getFixed(1), 0, 0}; } bool operator==(const VectorizationFactor &rhs) const { @@ -289,7 +295,8 @@ /// Plan how to best vectorize, return the best VF and its cost, or None if /// vectorization and interleaving should be avoided up front. - Optional plan(ElementCount UserVF, unsigned UserIC); + Optional plan(ElementCount UserVF, unsigned UserIC, + GeneratedRTChecks &Checks); /// Use the VPlan-native path to plan how to best vectorize, return the best /// VF and its cost. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -423,7 +423,6 @@ return None; } -// Forward declare GeneratedRTChecks. class GeneratedRTChecks; namespace llvm { @@ -1635,6 +1634,17 @@ Scalars.clear(); } + /// The vectorization cost is a combination of the cost itself and a boolean + /// indicating whether any of the contributing operations will actually + /// operate on vector values after type legalization in the backend. If this + /// latter value is false, then all operations will be scalarized (i.e. no + /// vectorization has actually taken place). + using VectorizationCostTy = std::pair; + + /// Returns the execution time cost of an instruction for a given vector + /// width. Vector width of one means scalar. + VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); + private: unsigned NumPredStores = 0; @@ -1663,13 +1673,6 @@ /// of elements. ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); - /// The vectorization cost is a combination of the cost itself and a boolean - /// indicating whether any of the contributing operations will actually - /// operate on vector values after type legalization in the backend. If this - /// latter value is false, then all operations will be scalarized (i.e. no - /// vectorization has actually taken place). - using VectorizationCostTy = std::pair; - /// Returns the expected execution cost. The unit of the cost does /// not matter because we use the 'cost' units to compare different /// vector widths. The cost that is returned is *not* normalized by @@ -1681,10 +1684,6 @@ expectedCost(ElementCount VF, SmallVectorImpl *Invalid = nullptr); - /// Returns the execution time cost of an instruction for a given vector - /// width. Vector width of one means scalar. - VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); - /// The cost-computation logic from getInstructionCost which provides /// the vector type as an output parameter. InstructionCost getInstructionCost(Instruction *I, ElementCount VF, @@ -6033,7 +6032,8 @@ assert(VFCandidates.count(ElementCount::getFixed(1)) && "Expected Scalar VF to be a candidate"); - const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); + const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, + ExpectedCost); VectorizationFactor ChosenFactor = ScalarCost; bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; @@ -6051,7 +6051,7 @@ continue; VectorizationCostTy C = expectedCost(i, &InvalidCosts); - VectorizationFactor Candidate(i, C.first); + VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); LLVM_DEBUG( dbgs() << "LV: Vector loop of width " << i << " costs: " << (Candidate.Cost / Candidate.Width.getKnownMinValue()) @@ -6242,7 +6242,7 @@ LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); if (LVP.hasPlanWithVFs({MainLoopVF, ForcedEC})) - return {ForcedEC, 0}; + return {ForcedEC, 0, 0}; else { LLVM_DEBUG( dbgs() @@ -8086,7 +8086,7 @@ if (VPlanBuildStressTest) return VectorizationFactor::Disabled(); - return {VF, 0 /*Cost*/}; + return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; } LLVM_DEBUG( @@ -8096,7 +8096,8 @@ } Optional -LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { +LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC, + GeneratedRTChecks &Checks) { assert(OrigLoop->isInnermost() && "Inner loop expected."); FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. @@ -8129,7 +8130,8 @@ CM.collectInLoopReductions(); buildVPlansWithVPRecipes(UserVF, UserVF); LLVM_DEBUG(printPlans(dbgs())); - return {{UserVF, 0}}; + Checks.Create(OrigLoop, *Legal->getLAI(), PSE.getUnionPredicate()); + return {{UserVF, 0, 0}}; } else reportVectorizationInfo("UserVF ignored because of invalid costs.", "InvalidCost", ORE, OrigLoop); @@ -8165,6 +8167,9 @@ // Select the optimal vectorization factor. auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); + if (!SelectedVF.Width.isScalar()) + Checks.Create(OrigLoop, *Legal->getLAI(), PSE.getUnionPredicate()); + // Check if it is profitable to vectorize with runtime checks. unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { @@ -10296,8 +10301,10 @@ ElementCount UserVF = Hints.getWidth(); unsigned UserIC = Hints.getInterleave(); + GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, + F->getParent()->getDataLayout()); // Plan how to best vectorize, return the best VF and its cost. - Optional MaybeVF = LVP.plan(UserVF, UserIC); + Optional MaybeVF = LVP.plan(UserVF, UserIC, Checks); VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; @@ -10393,13 +10400,6 @@ bool DisableRuntimeUnroll = false; MDNode *OrigLoopID = L->getLoopID(); { - // Optimistically generate runtime checks. Drop them if they turn out to not - // be profitable. Limit the scope of Checks, so the cleanup happens - // immediately after vector codegeneration is done. - GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, - F->getParent()->getDataLayout()); - if (!VF.Width.isScalar() || IC > 1) - Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); LVP.setBestPlan(VF.Width, IC); using namespace ore;