diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -34,6 +34,7 @@ class PredicatedScalarEvolution; class LoopVectorizationRequirements; class LoopVectorizeHints; +class GeneratedRTChecks; class OptimizationRemarkEmitter; class TargetTransformInfo; class TargetLibraryInfo; @@ -183,12 +184,16 @@ /// Cost of the loop with that width. InstructionCost Cost; - VectorizationFactor(ElementCount Width, InstructionCost Cost) - : Width(Width), Cost(Cost) {} + /// Cost of the scalar loop. + InstructionCost ScalarCost; + + VectorizationFactor(ElementCount Width, InstructionCost Cost, + InstructionCost ScalarCost) + : Width(Width), Cost(Cost), ScalarCost(ScalarCost) {} /// Width 1 means no vectorization, cost 0 means uncomputed cost. static VectorizationFactor Disabled() { - return {ElementCount::getFixed(1), 0}; + return {ElementCount::getFixed(1), 0, 0}; } bool operator==(const VectorizationFactor &rhs) const { @@ -289,7 +294,8 @@ /// Plan how to best vectorize, return the best VF and its cost, or None if /// vectorization and interleaving should be avoided up front. - Optional plan(ElementCount UserVF, unsigned UserIC); + Optional plan(ElementCount UserVF, unsigned UserIC, + GeneratedRTChecks &Checks); /// Use the VPlan-native path to plan how to best vectorize, return the best /// VF and its cost. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -342,6 +342,11 @@ cl::desc( "Prefer predicating a reduction operation over an after loop select.")); +static cl::opt RuntimeCheckOverheadFraction( + "lv-runtime-check-overhead-fraction", cl::init(0.005), cl::Hidden, + cl::desc("The maximum fraction of the allowed overhead runtime checks can " + "add compared to the runtime of the loop.")); + cl::opt EnableVPlanNativePath( "enable-vplan-native-path", cl::init(false), cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " @@ -424,9 +429,6 @@ return None; } -// Forward declare GeneratedRTChecks. -class GeneratedRTChecks; - namespace llvm { /// InnerLoopVectorizer vectorizes loops which contain only one basic @@ -1630,6 +1632,19 @@ Scalars.clear(); } + /// The vectorization cost is a combination of the cost itself and a boolean + /// indicating whether any of the contributing operations will actually + /// operate on + /// vector values after type legalization in the backend. If this latter value + /// is + /// false, then all operations will be scalarized (i.e. no vectorization has + /// actually taken place). + using VectorizationCostTy = std::pair; + + /// Returns the execution time cost of an instruction for a given vector + /// width. Vector width of one means scalar. + VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); + private: unsigned NumPredStores = 0; @@ -1658,23 +1673,12 @@ /// of elements. ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); - /// The vectorization cost is a combination of the cost itself and a boolean - /// indicating whether any of the contributing operations will actually - /// operate on vector values after type legalization in the backend. If this - /// latter value is false, then all operations will be scalarized (i.e. no - /// vectorization has actually taken place). - using VectorizationCostTy = std::pair; - /// Returns the expected execution cost. The unit of the cost does /// not matter because we use the 'cost' units to compare different /// vector widths. The cost that is returned is *not* normalized by /// the factor width. VectorizationCostTy expectedCost(ElementCount VF); - /// Returns the execution time cost of an instruction for a given vector - /// width. Vector width of one means scalar. - VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); - /// The cost-computation logic from getInstructionCost which provides /// the vector type as an output parameter. InstructionCost getInstructionCost(Instruction *I, ElementCount VF, @@ -1890,7 +1894,6 @@ /// Profitable vector factors. SmallVector ProfitableVFs; }; -} // end namespace llvm /// Helper struct to manage generating runtime checks for vectorization. /// @@ -1995,6 +1998,25 @@ } } + unsigned getCost(LoopVectorizationCostModel &CM) { + unsigned RTCheckCost = 0; + if (SCEVCheckBlock) + for (Instruction &I : *SCEVCheckBlock) { + if (SCEVCheckBlock->getTerminator() == &I) + continue; + RTCheckCost += *CM.getInstructionCost(&I, ElementCount::getFixed(1)) + .first.getValue(); + } + if (MemCheckBlock) + for (Instruction &I : *MemCheckBlock) { + if (MemCheckBlock->getTerminator() == &I) + continue; + RTCheckCost += *CM.getInstructionCost(&I, ElementCount::getFixed(1)) + .first.getValue(); + } + return RTCheckCost; + } + /// Remove the created SCEV & memory runtime check blocks & instructions, if /// unused. ~GeneratedRTChecks() { @@ -2093,6 +2115,7 @@ return MemCheckBlock; } }; +} // end namespace llvm // Return true if \p OuterLp is an outer loop annotated with hints for explicit // vectorization. The loop needs to be annotated with #pragma omp simd @@ -3276,7 +3299,6 @@ } BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { - BasicBlock *const SCEVCheckBlock = RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); if (!SCEVCheckBlock) @@ -6069,7 +6091,8 @@ assert(VFCandidates.count(ElementCount::getFixed(1)) && "Expected Scalar VF to be a candidate"); - const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); + const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, + ExpectedCost); VectorizationFactor ChosenFactor = ScalarCost; bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; @@ -6091,7 +6114,7 @@ VectorizationCostTy C = expectedCost(i); assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); - VectorizationFactor Candidate(i, C.first); + VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); LLVM_DEBUG( dbgs() << "LV: Vector loop of width " << i << " costs: " << (*Candidate.Cost.getValue() / @@ -6218,7 +6241,7 @@ LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); if (LVP.hasPlanWithVFs( {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) - return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; + return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0, 0}; else { LLVM_DEBUG( dbgs() @@ -7968,7 +7991,7 @@ if (VPlanBuildStressTest) return VectorizationFactor::Disabled(); - return {VF, 0 /*Cost*/}; + return {VF, 0 /*Cost*/, 0}; } LLVM_DEBUG( @@ -7978,7 +8001,8 @@ } Optional -LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { +LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC, + GeneratedRTChecks &Checks) { assert(OrigLoop->isInnermost() && "Inner loop expected."); FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. @@ -8012,7 +8036,9 @@ CM.collectInLoopReductions(); buildVPlansWithVPRecipes(UserVF, UserVF); LLVM_DEBUG(printPlans(dbgs())); - return {{UserVF, 0}}; + + Checks.Create(OrigLoop, *Legal->getLAI(), PSE.getUnionPredicate()); + return {{UserVF, 0, 0}}; } // Populate the set of Vectorization Factor Candidates. @@ -8045,25 +8071,43 @@ // Select the optimal vectorization factor. auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); + if (!SelectedVF.Width.isScalar()) + Checks.Create(OrigLoop, *Legal->getLAI(), PSE.getUnionPredicate()); + // Check if it is profitable to vectorize with runtime checks. unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { - bool PragmaThresholdReached = - NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; - bool ThresholdReached = - NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; - if ((ThresholdReached && !Hints.allowReordering()) || - PragmaThresholdReached) { - ORE->emit([&]() { - return OptimizationRemarkAnalysisAliasing( - DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), - OrigLoop->getHeader()) - << "loop not vectorized: cannot prove it is safe to reorder " - "memory operations"; - }); - LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); - Hints.emitRemarkWithHints(); - return VectorizationFactor::Disabled(); + bool CanIgnoreRTThreshold = false; + if (auto ExpectedTC = getSmallBestKnownTC(*PSE.getSE(), OrigLoop)) { + unsigned RTCost = Checks.getCost(CM); + // If the expected cost of the runtime checks is a small fraction of the + // expected cost of the scalar loop, we can be more aggressive with + // using runtime checks. + CanIgnoreRTThreshold = + RTCost < (*ExpectedTC * *SelectedVF.ScalarCost.getValue() * + RuntimeCheckOverheadFraction); + LLVM_DEBUG(dbgs() << "LV: Cost of runtime check: " << RTCost << " " + << *ExpectedTC * SelectedVF.ScalarCost << "\n"); + } + + if (!CanIgnoreRTThreshold) { + bool PragmaThresholdReached = + NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; + bool ThresholdReached = NumRuntimePointerChecks > + VectorizerParams::RuntimeMemoryCheckThreshold; + if ((ThresholdReached && !Hints.allowReordering()) || + PragmaThresholdReached) { + ORE->emit([&]() { + return OptimizationRemarkAnalysisAliasing( + DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), + OrigLoop->getHeader()) + << "loop not vectorized: cannot prove it is safe to reorder " + "memory operations"; + }); + LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); + Hints.emitRemarkWithHints(); + return None; + } } } return SelectedVF; @@ -10035,8 +10079,10 @@ ElementCount UserVF = Hints.getWidth(); unsigned UserIC = Hints.getInterleave(); + GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, + F->getParent()->getDataLayout()); // Plan how to best vectorize, return the best VF and its cost. - Optional MaybeVF = LVP.plan(UserVF, UserIC); + Optional MaybeVF = LVP.plan(UserVF, UserIC, Checks); VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; @@ -10132,13 +10178,6 @@ bool DisableRuntimeUnroll = false; MDNode *OrigLoopID = L->getLoopID(); { - // Optimistically generate runtime checks. Drop them if they turn out to not - // be profitable. Limit the scope of Checks, so the cleanup happens - // immediately after vector codegeneration is done. - GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, - F->getParent()->getDataLayout()); - if (!VF.Width.isScalar() || IC > 1) - Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); LVP.setBestPlan(VF.Width, IC); using namespace ore; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll b/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll @@ -1,14 +1,19 @@ -; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -S %s | FileCheck %s +; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -S %s | FileCheck --check-prefix=CHECK --check-prefix=DEFAULT %s +; RUN: opt -loop-vectorize -lv-runtime-check-overhead-fraction=0.5 -mtriple=arm64-apple-iphoneos -S %s | FileCheck --check-prefix=CHECK --check-prefix=CUSTOM %s ; Tests for loops with large numbers of runtime checks. Check that loops are ; vectorized, if the loop trip counts are large and the impact of the runtime ; checks is very small compared to the expected loop runtimes. -; The trip count in the loop in this function is too to warrant large runtime checks. +; The trip count in the loop in this function is too small to warrant large +; runtime checks with the default threshold. It should be vectorized with +; a larger custom threshold. ; CHECK-LABEL: define {{.*}} @test_tc_too_small -; CHECK-NOT: vector.memcheck -; CHECK-NOT: vector.body +; DEFAULT-NOT: vector.memcheck +; DEFAULT-NOT: vector.body +; CUSTOM: vector.memcheck +; CUSTOM: vector.body define void @test_tc_too_small(i16* %ptr.1, i16* %ptr.2, i16* %ptr.3, i16* %ptr.4, i64 %off.1, i64 %off.2) { entry: br label %loop @@ -57,11 +62,11 @@ ret void } -; FIXME -; The trip count in the loop in this function high enough to warrant large runtime checks. +; The trip count in the loop in this function high enough to warrant large +; runtime checks. ; CHECK-LABEL: define {{.*}} @test_tc_big_enough -; CHECK-NOT: vector.memcheck -; CHECK-NOT: vector.body +; CHECK: vector.memcheck +; CHECK: vector.body define void @test_tc_big_enough(i16* %ptr.1, i16* %ptr.2, i16* %ptr.3, i16* %ptr.4, i64 %off.1, i64 %off.2) { entry: br label %loop