diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -187,7 +187,8 @@ void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; } - bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints); + bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints, + bool CanIgnoreRTThreshold); private: unsigned NumRuntimePointerChecks = 0; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -246,8 +246,9 @@ } } -bool LoopVectorizationRequirements::doesNotMeet( - Function *F, Loop *L, const LoopVectorizeHints &Hints) { +bool LoopVectorizationRequirements::doesNotMeet(Function *F, Loop *L, + const LoopVectorizeHints &Hints, + bool IgnoreRTThreshold) { const char *PassName = Hints.vectorizeAnalysisPassName(); bool Failed = false; if (UnsafeAlgebraInst && !Hints.allowReordering()) { @@ -266,8 +267,12 @@ NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; bool ThresholdReached = NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; - if ((ThresholdReached && !Hints.allowReordering()) || - PragmaThresholdReached) { + bool DoubleThresholdReached = + NumRuntimePointerChecks > + 2 * VectorizerParams::RuntimeMemoryCheckThreshold; + if ((!IgnoreRTThreshold && ((ThresholdReached && !Hints.allowReordering()) || + PragmaThresholdReached)) || + (DoubleThresholdReached && !Hints.allowReordering())) { ORE.emit([&]() { return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps", L->getStartLoc(), diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp 
b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -330,6 +330,11 @@ cl::desc( "Prefer predicating a reduction operation over an after loop select.")); +static cl::opt RuntimeCheckOverheadFraction( + "lv-runtime-check-overhead-fraction", cl::init(0.005), cl::Hidden, + cl::desc("The maximum fraction of the allowed overhead runtime checks can " + "add compared to the runtime of the loop.")); + cl::opt EnableVPlanNativePath( "enable-vplan-native-path", cl::init(false), cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " @@ -1602,9 +1607,6 @@ Scalars.clear(); } -private: - unsigned NumPredStores = 0; - /// \return An upper bound for the vectorization factor, a power-of-2 larger /// than zero. One is returned if vectorization should best be avoided due /// to cost. @@ -1620,16 +1622,21 @@ /// actually taken place). using VectorizationCostTy = std::pair; + /// Returns the execution time cost of an instruction for a given vector + /// width. Vector width of one means scalar. + VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); + + float ScalarCost; + +private: + unsigned NumPredStores = 0; + /// Returns the expected execution cost. The unit of the cost does /// not matter because we use the 'cost' units to compare different /// vector widths. The cost that is returned is *not* normalized by /// the factor width. VectorizationCostTy expectedCost(ElementCount VF); - /// Returns the execution time cost of an instruction for a given vector - /// width. Vector width of one means scalar. - VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); - /// The cost-computation logic from getInstructionCost which provides /// the vector type as an output parameter. 
InstructionCost getInstructionCost(Instruction *I, ElementCount VF, @@ -1949,6 +1956,25 @@ } } + unsigned getCost(LoopVectorizationCostModel &CM) { + unsigned RTCheckCost = 0; + if (SCEVCheckBlock) + for (Instruction &I : *SCEVCheckBlock) { + if (SCEVCheckBlock->getTerminator() == &I) + continue; + RTCheckCost += *CM.getInstructionCost(&I, ElementCount::getFixed(1)) + .first.getValue(); + } + if (MemCheckBlock) + for (Instruction &I : *MemCheckBlock) { + if (MemCheckBlock->getTerminator() == &I) + continue; + RTCheckCost += *CM.getInstructionCost(&I, ElementCount::getFixed(1)) + .first.getValue(); + } + return RTCheckCost; + } + /// Remove the created SCEV & memory runtime check blocks & instructions, if /// unused. ~GeneratedRTChecks() { @@ -5852,7 +5878,7 @@ assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); auto Width = ElementCount::getFixed(1); - const float ScalarCost = *ExpectedCost.getValue(); + ScalarCost = *ExpectedCost.getValue(); float Cost = ScalarCost; bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; @@ -9618,13 +9644,6 @@ // Identify the diagnostic messages that should be produced. std::pair VecDiagMsg, IntDiagMsg; bool VectorizeLoop = true, InterleaveLoop = true; - if (Requirements.doesNotMeet(F, L, Hints)) { - LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " - "requirements.\n"); - Hints.emitRemarkWithHints(); - return false; - } - if (VF.Width.isScalar()) { LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); VecDiagMsg = std::make_pair( @@ -9712,8 +9731,31 @@ // immediately after vector codegeneration is done. 
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, F->getParent()->getDataLayout()); - if (!VF.Width.isScalar() || IC > 1) + bool CanIgnoreRTThreshold = true; + if (!VF.Width.isScalar() || IC > 1) { + CanIgnoreRTThreshold = false; + Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); + if (ExpectedTC) { + unsigned RTCost = Checks.getCost(CM); + // If the expected cost of the runtime checks is a small fraction of the + // expected cost of the scalar loop, we can be more aggressive with + // using runtime checks. + CanIgnoreRTThreshold = RTCost < (*ExpectedTC * CM.ScalarCost * + RuntimeCheckOverheadFraction); + LLVM_DEBUG(dbgs() << "LV: Cost of runtime check: " << RTCost << " " + << *ExpectedTC * CM.ScalarCost << "\n"); + } + } + + if (Requirements.doesNotMeet(F, L, Hints, CanIgnoreRTThreshold)) { + LLVM_DEBUG( + dbgs() << "LV: Not vectorizing: loop did not meet vectorization " + "requirements.\n"); + Hints.emitRemarkWithHints(); + return false; + } + LVP.setBestPlan(VF.Width, IC); using namespace ore; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll b/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll @@ -1,14 +1,19 @@ -; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -S %s | FileCheck %s +; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -S %s | FileCheck --check-prefix=CHECK --check-prefix=DEFAULT %s +; RUN: opt -loop-vectorize -lv-runtime-check-overhead-fraction=0.5 -mtriple=arm64-apple-iphoneos -S %s | FileCheck --check-prefix=CHECK --check-prefix=CUSTOM %s ; Tests for loops with large numbers of runtime checks. Check that loops are ; vectorized, if the loop trip counts are large and the impact of the runtime ; checks is very small compared to the expected loop runtimes. 
-; The trip count in the loop in this function is too to warrant large runtime checks. +; The trip count in the loop in this function is too small to warrant large +; runtime checks with the default threshold. It should be vectorized with a +; larger custom threshold. ; CHECK-LABEL: define {{.*}} @test_tc_too_small -; CHECK-NOT: vector.memcheck -; CHECK-NOT: vector.body +; DEFAULT-NOT: vector.memcheck +; DEFAULT-NOT: vector.body +; CUSTOM: vector.memcheck +; CUSTOM: vector.body define void @test_tc_too_small(i16* %ptr.1, i16* %ptr.2, i16* %ptr.3, i16* %ptr.4, i64 %off.1, i64 %off.2) { entry: br label %loop @@ -57,11 +62,10 @@ ret void } -; FIXME ; The trip count in the loop in this function high enough to warrant large runtime checks. ; CHECK-LABEL: define {{.*}} @test_tc_big_enough -; CHECK-NOT: vector.memcheck -; CHECK-NOT: vector.body +; CHECK: vector.memcheck +; CHECK: vector.body define void @test_tc_big_enough(i16* %ptr.1, i16* %ptr.2, i16* %ptr.3, i16* %ptr.4, i64 %off.1, i64 %off.2) { entry: br label %loop