diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -34,6 +34,7 @@
 class PredicatedScalarEvolution;
 class LoopVectorizationRequirements;
 class LoopVectorizeHints;
+class GeneratedRTChecks;
 class OptimizationRemarkEmitter;
 class TargetTransformInfo;
 class TargetLibraryInfo;
@@ -183,12 +184,16 @@
   /// Cost of the loop with that width.
   InstructionCost Cost;

-  VectorizationFactor(ElementCount Width, InstructionCost Cost)
-      : Width(Width), Cost(Cost) {}
+  /// Cost of the scalar loop.
+  InstructionCost ScalarCost;
+
+  VectorizationFactor(ElementCount Width, InstructionCost Cost,
+                      InstructionCost ScalarCost)
+      : Width(Width), Cost(Cost), ScalarCost(ScalarCost) {}

   /// Width 1 means no vectorization, cost 0 means uncomputed cost.
   static VectorizationFactor Disabled() {
-    return {ElementCount::getFixed(1), 0};
+    return {ElementCount::getFixed(1), 0, 0};
   }

   bool operator==(const VectorizationFactor &rhs) const {
@@ -289,7 +294,8 @@

   /// Plan how to best vectorize, return the best VF and its cost, or None if
   /// vectorization and interleaving should be avoided up front.
-  Optional<VectorizationFactor> plan(ElementCount UserVF, unsigned UserIC);
+  Optional<VectorizationFactor> plan(ElementCount UserVF, unsigned UserIC,
+                                     GeneratedRTChecks &Checks);

   /// Use the VPlan-native path to plan how to best vectorize, return the best
   /// VF and its cost.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -342,6 +342,11 @@
     cl::desc(
         "Prefer predicating a reduction operation over an after loop select."));

+static cl::opt<double> RuntimeCheckOverheadFraction(
+    "lv-runtime-check-overhead-fraction", cl::init(0.005), cl::Hidden,
+    cl::desc("The maximum overhead runtime checks may add, as a fraction of "
+             "the expected runtime of the scalar loop."));
+
 cl::opt<bool> EnableVPlanNativePath(
     "enable-vplan-native-path", cl::init(false), cl::Hidden,
     cl::desc("Enable VPlan-native vectorization path with "
@@ -424,9 +429,6 @@
   return None;
 }

-// Forward declare GeneratedRTChecks.
-class GeneratedRTChecks;
-
 namespace llvm {

 /// InnerLoopVectorizer vectorizes loops which contain only one basic
@@ -1637,6 +1639,17 @@
     Scalars.clear();
   }

+  /// The vectorization cost is a combination of the cost itself and a boolean
+  /// indicating whether any of the contributing operations will actually
+  /// operate on vector values after type legalization in the backend. If this
+  /// latter value is false, then all operations will be scalarized (i.e. no
+  /// vectorization has actually taken place).
+  using VectorizationCostTy = std::pair<InstructionCost, bool>;
+
+  /// Returns the execution time cost of an instruction for a given vector
+  /// width. Vector width of one means scalar.
+  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
+
 private:
   unsigned NumPredStores = 0;

@@ -1665,13 +1678,6 @@
   /// of elements.
   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

-  /// The vectorization cost is a combination of the cost itself and a boolean
-  /// indicating whether any of the contributing operations will actually
-  /// operate on vector values after type legalization in the backend. If this
-  /// latter value is false, then all operations will be scalarized (i.e. no
-  /// vectorization has actually taken place).
-  using VectorizationCostTy = std::pair<InstructionCost, bool>;
-
   /// Returns the expected execution cost. The unit of the cost does
   /// not matter because we use the 'cost' units to compare different
   /// vector widths. The cost that is returned is *not* normalized by
@@ -1683,10 +1689,6 @@
   expectedCost(ElementCount VF,
                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);

-  /// Returns the execution time cost of an instruction for a given vector
-  /// width. Vector width of one means scalar.
-  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
-
   /// The cost-computation logic from getInstructionCost which provides
   /// the vector type as an output parameter.
   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
@@ -1905,7 +1907,6 @@
   /// Profitable vector factors.
   SmallVector<VectorizationFactor, 8> ProfitableVFs;
 };
-} // end namespace llvm

 /// Helper struct to manage generating runtime checks for vectorization.
 ///
@@ -2010,6 +2011,25 @@
     }
   }

+  unsigned getCost(LoopVectorizationCostModel &CM) {
+    unsigned RTCheckCost = 0;
+    if (SCEVCheckBlock)
+      for (Instruction &I : *SCEVCheckBlock) {
+        if (SCEVCheckBlock->getTerminator() == &I)
+          continue;
+        RTCheckCost += *CM.getInstructionCost(&I, ElementCount::getFixed(1))
+                            .first.getValue();
+      }
+    if (MemCheckBlock)
+      for (Instruction &I : *MemCheckBlock) {
+        if (MemCheckBlock->getTerminator() == &I)
+          continue;
+        RTCheckCost += *CM.getInstructionCost(&I, ElementCount::getFixed(1))
+                            .first.getValue();
+      }
+    return RTCheckCost;
+  }
+
   /// Remove the created SCEV & memory runtime check blocks & instructions, if
   /// unused.
   ~GeneratedRTChecks() {
@@ -2108,6 +2128,7 @@
     return MemCheckBlock;
   }
 };
+} // end namespace llvm

 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
 // vectorization. The loop needs to be annotated with #pragma omp simd
@@ -3307,7 +3328,6 @@
 }

 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
-
   BasicBlock *const SCEVCheckBlock =
       RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
   if (!SCEVCheckBlock)
     return nullptr;
@@ -6033,7 +6053,8 @@
   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
          "Expected Scalar VF to be a candidate");

-  const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
+  const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
+                                       ExpectedCost);
   VectorizationFactor ChosenFactor = ScalarCost;

   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
@@ -6051,7 +6072,7 @@
       continue;

     VectorizationCostTy C = expectedCost(i, &InvalidCosts);
-    VectorizationFactor Candidate(i, C.first);
+    VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
     LLVM_DEBUG(
         dbgs() << "LV: Vector loop of width " << i << " costs: "
                << (Candidate.Cost / Candidate.Width.getKnownMinValue())
@@ -6242,7 +6263,7 @@
     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
     if (LVP.hasPlanWithVFs(
            {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
-      return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
+      return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0, 0};
     else {
       LLVM_DEBUG(
           dbgs()
@@ -8071,7 +8092,7 @@
     if (VPlanBuildStressTest)
       return VectorizationFactor::Disabled();

-    return {VF, 0 /*Cost*/};
+    return {VF, 0 /*Cost*/, 0};
   }

   LLVM_DEBUG(
@@ -8081,7 +8102,8 @@
 }

 Optional<VectorizationFactor>
-LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
+LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
+                               GeneratedRTChecks &Checks) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
loop expected."); FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. @@ -8114,7 +8136,8 @@ CM.collectInLoopReductions(); buildVPlansWithVPRecipes(UserVF, UserVF); LLVM_DEBUG(printPlans(dbgs())); - return {{UserVF, 0}}; + Checks.Create(OrigLoop, *Legal->getLAI(), PSE.getUnionPredicate()); + return {{UserVF, 0, 0}}; } else reportVectorizationInfo("UserVF ignored because of invalid costs.", "InvalidCost", ORE, OrigLoop); @@ -8150,25 +8173,43 @@ // Select the optimal vectorization factor. auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); + if (!SelectedVF.Width.isScalar()) + Checks.Create(OrigLoop, *Legal->getLAI(), PSE.getUnionPredicate()); + // Check if it is profitable to vectorize with runtime checks. unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { - bool PragmaThresholdReached = - NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; - bool ThresholdReached = - NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; - if ((ThresholdReached && !Hints.allowReordering()) || - PragmaThresholdReached) { - ORE->emit([&]() { - return OptimizationRemarkAnalysisAliasing( - DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), - OrigLoop->getHeader()) - << "loop not vectorized: cannot prove it is safe to reorder " - "memory operations"; - }); - LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); - Hints.emitRemarkWithHints(); - return VectorizationFactor::Disabled(); + bool CanIgnoreRTThreshold = false; + if (auto ExpectedTC = getSmallBestKnownTC(*PSE.getSE(), OrigLoop)) { + unsigned RTCost = Checks.getCost(CM); + // If the expected cost of the runtime checks is a small fraction of the + // expected cost of the scalar loop, we can be more aggressive with + // using runtime checks. + CanIgnoreRTThreshold = + RTCost < (*ExpectedTC * *SelectedVF.ScalarCost.getValue() * + RuntimeCheckOverheadFraction); + LLVM_DEBUG(dbgs() << "LV: Cost of runtime check: " << RTCost << " " + << *ExpectedTC * SelectedVF.ScalarCost << "\n"); + } + + if (!CanIgnoreRTThreshold) { + bool PragmaThresholdReached = + NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; + bool ThresholdReached = NumRuntimePointerChecks > + VectorizerParams::RuntimeMemoryCheckThreshold; + if ((ThresholdReached && !Hints.allowReordering()) || + PragmaThresholdReached) { + ORE->emit([&]() { + return OptimizationRemarkAnalysisAliasing( + DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), + OrigLoop->getHeader()) + << "loop not vectorized: cannot prove it is safe to reorder " + "memory operations"; + }); + LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); + Hints.emitRemarkWithHints(); + return None; + } } } return SelectedVF; @@ -10260,8 +10301,10 @@ ElementCount UserVF = Hints.getWidth(); unsigned UserIC = Hints.getInterleave(); + GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, + F->getParent()->getDataLayout()); // Plan how to best vectorize, return the best VF and its cost. - Optional MaybeVF = LVP.plan(UserVF, UserIC); + Optional MaybeVF = LVP.plan(UserVF, UserIC, Checks); VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; @@ -10357,13 +10400,6 @@ bool DisableRuntimeUnroll = false; MDNode *OrigLoopID = L->getLoopID(); { - // Optimistically generate runtime checks. Drop them if they turn out to not - // be profitable. 
Limit the scope of Checks, so the cleanup happens - // immediately after vector codegeneration is done. - GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, - F->getParent()->getDataLayout()); - if (!VF.Width.isScalar() || IC > 1) - Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); LVP.setBestPlan(VF.Width, IC); using namespace ore; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll b/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll @@ -1,14 +1,19 @@ -; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -S %s | FileCheck %s +; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -S %s | FileCheck --check-prefix=CHECK --check-prefix=DEFAULT %s +; RUN: opt -loop-vectorize -lv-runtime-check-overhead-fraction=0.5 -mtriple=arm64-apple-iphoneos -S %s | FileCheck --check-prefix=CHECK --check-prefix=CUSTOM %s ; Tests for loops with large numbers of runtime checks. Check that loops are ; vectorized, if the loop trip counts are large and the impact of the runtime ; checks is very small compared to the expected loop runtimes. -; The trip count in the loop in this function is too to warrant large runtime checks. +; The trip count in the loop in this function is too small to warrant large +; runtime checks with the default threshold. It should be vectorized with +; a larger custom threshold. ; CHECK-LABEL: define {{.*}} @test_tc_too_small -; CHECK-NOT: vector.memcheck -; CHECK-NOT: vector.body +; DEFAULT-NOT: vector.memcheck +; DEFAULT-NOT: vector.body +; CUSTOM: vector.memcheck +; CUSTOM: vector.body define void @test_tc_too_small(i16* %ptr.1, i16* %ptr.2, i16* %ptr.3, i16* %ptr.4, i64 %off.1, i64 %off.2) { entry: br label %loop @@ -57,11 +62,11 @@ ret void } -; FIXME -; The trip count in the loop in this function high enough to warrant large runtime checks. +; The trip count in the loop in this function high enough to warrant large +; runtime checks. ; CHECK-LABEL: define {{.*}} @test_tc_big_enough -; CHECK-NOT: vector.memcheck -; CHECK-NOT: vector.body +; CHECK: vector.memcheck +; CHECK: vector.body define void @test_tc_big_enough(i16* %ptr.1, i16* %ptr.2, i16* %ptr.3, i16* %ptr.4, i64 %off.1, i64 %off.2) { entry: br label %loop
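Note on the new heuristic: the check added in LoopVectorizationPlanner::plan() above reduces to comparing the one-off scalar cost of the generated runtime checks against a small fraction (RuntimeCheckOverheadFraction, default 0.005) of the expected scalar loop runtime, i.e. the expected trip count times the scalar cost per iteration. The standalone C++ sketch below only illustrates that arithmetic; the function name, parameters, and example numbers are hypothetical and are not part of this patch or of the LLVM API.

#include <cstdio>

// Illustrative sketch (not LLVM code): runtime checks may bypass the usual
// count-based thresholds when their one-off cost stays below a small
// fraction of the expected scalar loop runtime.
static bool canIgnoreRTThreshold(unsigned RTCheckCost, unsigned ExpectedTC,
                                 unsigned ScalarIterationCost,
                                 double OverheadFraction = 0.005) {
  // Budget = expected trip count * scalar cost per iteration * allowed fraction.
  double Budget = ExpectedTC * ScalarIterationCost * OverheadFraction;
  return RTCheckCost < Budget;
}

int main() {
  // With the default fraction 0.005, a trip count of 10000 and a scalar cost
  // of 20 per iteration give a budget of 1000, so check blocks costing 35
  // pass and the count-based thresholds are ignored.
  std::printf("large trip count: %d\n", canIgnoreRTThreshold(35, 10000, 20));
  // A trip count of 16 gives a budget of only 1.6, so the same checks stay
  // subject to the existing thresholds.
  std::printf("small trip count: %d\n", canIgnoreRTThreshold(35, 16, 20));
  return 0;
}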