diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -36,6 +36,7 @@
 class PredicatedScalarEvolution;
 class LoopVectorizationRequirements;
 class LoopVectorizeHints;
+class GeneratedRTChecks;
 class OptimizationRemarkEmitter;
 class VPRecipeBuilder;
 
@@ -255,7 +256,8 @@
 
   /// Plan how to best vectorize, return the best VF and its cost, or None if
   /// vectorization and interleaving should be avoided up front.
-  Optional<VectorizationFactor> plan(ElementCount UserVF, unsigned UserIC);
+  Optional<VectorizationFactor> plan(ElementCount UserVF, unsigned UserIC,
+                                     GeneratedRTChecks &Checks);
 
   /// Use the VPlan-native path to plan how to best vectorize, return the best
   /// VF and its cost.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -335,6 +335,11 @@
     cl::desc(
         "Prefer predicating a reduction operation over an after loop select."));
 
+static cl::opt<double> RuntimeCheckOverheadFraction(
+    "lv-runtime-check-overhead-fraction", cl::init(0.005), cl::Hidden,
+    cl::desc("The maximum fraction of the expected runtime of the loop that "
+             "the runtime checks may add as overhead."));
+
 cl::opt<bool> EnableVPlanNativePath(
     "enable-vplan-native-path", cl::init(false), cl::Hidden,
     cl::desc("Enable VPlan-native vectorization path with "
@@ -430,10 +435,10 @@
   return None;
 }
 
+namespace llvm {
 // Forward declare GeneratedRTChecks.
 class GeneratedRTChecks;
 
-namespace llvm {
 
 /// InnerLoopVectorizer vectorizes loops which contain only one basic
 /// block to a specified vectorization factor (VF).
@@ -1606,9 +1611,6 @@
     Scalars.clear();
   }
 
-private:
-  unsigned NumPredStores = 0;
-
   /// \return An upper bound for the vectorization factor, a power-of-2 larger
   /// than zero. One is returned if vectorization should best be avoided due
   /// to cost.
@@ -1624,16 +1626,21 @@
   /// actually taken place).
   using VectorizationCostTy = std::pair<InstructionCost, bool>;
 
+  /// Returns the execution time cost of an instruction for a given vector
+  /// width. Vector width of one means scalar.
+  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
+
+  float ScalarCost;
+
+private:
+  unsigned NumPredStores = 0;
+
   /// Returns the expected execution cost. The unit of the cost does
   /// not matter because we use the 'cost' units to compare different
   /// vector widths. The cost that is returned is *not* normalized by
   /// the factor width.
   VectorizationCostTy expectedCost(ElementCount VF);
 
-  /// Returns the execution time cost of an instruction for a given vector
-  /// width. Vector width of one means scalar.
-  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
-
   /// The cost-computation logic from getInstructionCost which provides
   /// the vector type as an output parameter.
   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
@@ -1848,7 +1855,6 @@
   /// Profitable vector factors.
   SmallVector<VectorizationFactor, 8> ProfitableVFs;
 };
-} // end namespace llvm
 
 /// Helper struct to manage generating runtime checks for vectorization.
 ///
@@ -1953,6 +1959,25 @@
     }
   }
 
+  unsigned getCost(LoopVectorizationCostModel &CM) {
+    unsigned RTCheckCost = 0;
+    if (SCEVCheckBlock)
+      for (Instruction &I : *SCEVCheckBlock) {
+        if (SCEVCheckBlock->getTerminator() == &I)
+          continue;
+        RTCheckCost += *CM.getInstructionCost(&I, ElementCount::getFixed(1))
+                            .first.getValue();
+      }
+    if (MemCheckBlock)
+      for (Instruction &I : *MemCheckBlock) {
+        if (MemCheckBlock->getTerminator() == &I)
+          continue;
+        RTCheckCost += *CM.getInstructionCost(&I, ElementCount::getFixed(1))
+                            .first.getValue();
+      }
+    return RTCheckCost;
+  }
+
   /// Remove the created SCEV & memory runtime check blocks & instructions, if
   /// unused.
   ~GeneratedRTChecks() {
@@ -2051,6 +2076,7 @@
     return MemCheckBlock;
   }
 };
+} // end namespace llvm
 
 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
 // vectorization. The loop needs to be annotated with #pragma omp simd
@@ -3207,7 +3233,6 @@
 }
 
 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
-
   BasicBlock *const SCEVCheckBlock =
       RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
   if (!SCEVCheckBlock)
@@ -5847,7 +5872,7 @@
   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
   auto Width = ElementCount::getFixed(1);
-  const float ScalarCost = *ExpectedCost.getValue();
+  ScalarCost = *ExpectedCost.getValue();
   float Cost = ScalarCost;
 
   bool ForceVectorization =
       Hints->getForce() == LoopVectorizeHints::FK_Enabled;
@@ -7700,7 +7725,8 @@
 }
 
 Optional<VectorizationFactor>
-LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
+LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
+                               GeneratedRTChecks &Checks) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
   Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
   if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
@@ -7740,6 +7766,8 @@
     CM.collectInLoopReductions();
     buildVPlansWithVPRecipes(VF, VF);
     LLVM_DEBUG(printPlans(dbgs()));
+
+    Checks.Create(OrigLoop, *Legal->getLAI(), PSE.getUnionPredicate());
     return {{VF, 0}};
   }
 
@@ -7767,24 +7795,41 @@
   // Select the optimal vectorization factor.
   auto SelectedVF = CM.selectVectorizationFactor(MaxVF);
 
+  if (!SelectedVF.Width.isScalar())
+    Checks.Create(OrigLoop, *Legal->getLAI(), PSE.getUnionPredicate());
+
   // Check if it is profitable to vectorize with runtime checks.
   unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
   if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
-    bool PragmaThresholdReached =
-        NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
-    bool ThresholdReached =
-        NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
-    if ((ThresholdReached && !Hints.allowReordering()) ||
-        PragmaThresholdReached) {
-      ORE->emit([&]() {
-        return OptimizationRemarkMissed(DEBUG_TYPE, "CantReorderMemOps",
-                                        OrigLoop->getStartLoc(),
-                                        OrigLoop->getHeader())
-               << "loop not vectorized: cannot prove it is safe to reorder "
-                  "memory operations";
-      });
-      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
-      return VectorizationFactor::Disabled();
+    bool CanIgnoreRTThreshold = false;
+    if (auto ExpectedTC = getSmallBestKnownTC(*PSE.getSE(), OrigLoop)) {
+      unsigned RTCost = Checks.getCost(CM);
+      // If the expected cost of the runtime checks is a small fraction of the
+      // expected cost of the scalar loop, we can be more aggressive with
+      // using runtime checks.
+      CanIgnoreRTThreshold =
+          RTCost < (*ExpectedTC * CM.ScalarCost * RuntimeCheckOverheadFraction);
+      LLVM_DEBUG(dbgs() << "LV: Cost of runtime check: " << RTCost << " "
+                        << *ExpectedTC * CM.ScalarCost << "\n");
+    }
+
+    if (!CanIgnoreRTThreshold) {
+      bool PragmaThresholdReached =
+          NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
+      bool ThresholdReached = NumRuntimePointerChecks >
+                              VectorizerParams::RuntimeMemoryCheckThreshold;
+      if ((ThresholdReached && !Hints.allowReordering()) ||
+          PragmaThresholdReached) {
+        ORE->emit([&]() {
+          return OptimizationRemarkMissed(DEBUG_TYPE, "CantReorderMemOps",
+                                          OrigLoop->getStartLoc(),
+                                          OrigLoop->getHeader())
+                 << "loop not vectorized: cannot prove it is safe to reorder "
+                    "memory operations";
+        });
+        LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
+        return None;
+      }
     }
   }
   return SelectedVF;
@@ -9648,8 +9693,10 @@
   ElementCount UserVF = Hints.getWidth();
   unsigned UserIC = Hints.getInterleave();
 
+  GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
+                           F->getParent()->getDataLayout());
   // Plan how to best vectorize, return the best VF and its cost.
-  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
+  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC, Checks);
 
   VectorizationFactor VF = VectorizationFactor::Disabled();
   unsigned IC = 1;
@@ -9745,13 +9792,6 @@
   bool DisableRuntimeUnroll = false;
   MDNode *OrigLoopID = L->getLoopID();
   {
-    // Optimistically generate runtime checks. Drop them if they turn out to not
-    // be profitable. Limit the scope of Checks, so the cleanup happens
-    // immediately after vector codegeneration is done.
-    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
-                             F->getParent()->getDataLayout());
-    if (!VF.Width.isScalar() || IC > 1)
-      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
     LVP.setBestPlan(VF.Width, IC);
 
     using namespace ore;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll b/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll
@@ -1,14 +1,19 @@
-; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -S %s | FileCheck --check-prefix=CHECK --check-prefix=DEFAULT %s
+; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -S %s | FileCheck --check-prefix=CHECK --check-prefix=DEFAULT %s
+; RUN: opt -loop-vectorize -lv-runtime-check-overhead-fraction=0.5 -mtriple=arm64-apple-iphoneos -S %s | FileCheck --check-prefix=CHECK --check-prefix=CUSTOM %s
 
 ; Tests for loops with large numbers of runtime checks. Check that loops are
 ; vectorized, if the loop trip counts are large and the impact of the runtime
 ; checks is very small compared to the expected loop runtimes.
 
-; The trip count in the loop in this function is too to warrant large runtime checks.
+; The trip count in the loop in this function is too small to warrant large
+; runtime checks with the default threshold. It should be vectorized with
+; a larger custom threshold.
 ; CHECK-LABEL: define {{.*}} @test_tc_too_small
-; CHECK-NOT: vector.memcheck
-; CHECK-NOT: vector.body
+; DEFAULT-NOT: vector.memcheck
+; DEFAULT-NOT: vector.body
+; CUSTOM: vector.memcheck
+; CUSTOM: vector.body
 define void @test_tc_too_small(i16* %ptr.1, i16* %ptr.2, i16* %ptr.3, i16* %ptr.4, i64 %off.1, i64 %off.2) {
 entry:
   br label %loop
@@ -57,11 +62,11 @@
   ret void
 }
 
-; FIXME
-; The trip count in the loop in this function high enough to warrant large runtime checks.
+; The trip count in the loop in this function is high enough to warrant large
+; runtime checks.
 ; CHECK-LABEL: define {{.*}} @test_tc_big_enough
-; CHECK-NOT: vector.memcheck
-; CHECK-NOT: vector.body
+; CHECK: vector.memcheck
+; CHECK: vector.body
 define void @test_tc_big_enough(i16* %ptr.1, i16* %ptr.2, i16* %ptr.3, i16* %ptr.4, i64 %off.1, i64 %off.2) {
 entry:
   br label %loop