diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -44,10 +44,6 @@ static unsigned VectorizationInterleave; /// True if force-vector-interleave was specified by the user. static bool isInterleaveForced(); - - /// \When performing memory disambiguation checks at runtime do not - /// make more than this number of comparisons. - static unsigned RuntimeMemoryCheckThreshold; }; /// Checks memory dependences among accesses to the same underlying diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -83,13 +83,6 @@ VectorizerParams::VectorizationInterleave)); unsigned VectorizerParams::VectorizationInterleave; -static cl::opt RuntimeMemoryCheckThreshold( - "runtime-memory-check-threshold", cl::Hidden, - cl::desc("When performing memory disambiguation checks at runtime do not " - "generate more than this number of comparisons (default = 8)."), - cl::location(VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(8)); -unsigned VectorizerParams::RuntimeMemoryCheckThreshold; - /// The maximum iterations used to merge memory checks static cl::opt MemoryCheckMergeThreshold( "memory-check-merge-threshold", cl::Hidden, diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp --- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp +++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp @@ -98,6 +98,12 @@ static const char *LICMVersioningMetaData = "llvm.loop.licm_versioning.disable"; +static cl::opt RuntimeMemoryCheckThreshold( + "runtime-memory-check-threshold", cl::Hidden, + cl::desc("When performing memory disambiguation checks at runtime do not " + "generate more than this number of comparisons (default = 8)."), + cl::init(8)); + /// Threshold minimum allowed percentage for possible /// invariant instructions in a loop. static cl::opt @@ -422,8 +428,7 @@ return false; } // Number of runtime-checks should be less then RuntimeMemoryCheckThreshold - if (LAI->getNumRuntimePointerChecks() > - VectorizerParams::RuntimeMemoryCheckThreshold) { + if (LAI->getNumRuntimePointerChecks() > RuntimeMemoryCheckThreshold) { LLVM_DEBUG( dbgs() << " LAA: Runtime checks are more than threshold !!\n"); ORE->emit([&]() { @@ -433,7 +438,7 @@ << "Number of runtime checks " << NV("RuntimeChecks", LAI->getNumRuntimePointerChecks()) << " exceeds threshold " - << NV("Threshold", VectorizerParams::RuntimeMemoryCheckThreshold); + << NV("Threshold", RuntimeMemoryCheckThreshold); }); return false; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -34,6 +34,7 @@ class PredicatedScalarEvolution; class LoopVectorizationRequirements; class LoopVectorizeHints; +class GeneratedRTChecks; class OptimizationRemarkEmitter; class TargetTransformInfo; class TargetLibraryInfo; @@ -183,12 +184,16 @@ /// Cost of the loop with that width. InstructionCost Cost; - VectorizationFactor(ElementCount Width, InstructionCost Cost) - : Width(Width), Cost(Cost) {} + /// Cost of the scalar loop. + InstructionCost ScalarCost; + + VectorizationFactor(ElementCount Width, InstructionCost Cost, + InstructionCost ScalarCost) + : Width(Width), Cost(Cost), ScalarCost(ScalarCost) {} /// Width 1 means no vectorization, cost 0 means uncomputed cost. static VectorizationFactor Disabled() { - return {ElementCount::getFixed(1), 0}; + return {ElementCount::getFixed(1), 0, 0}; } bool operator==(const VectorizationFactor &rhs) const { @@ -289,7 +294,8 @@ /// Plan how to best vectorize, return the best VF and its cost, or None if /// vectorization and interleaving should be avoided up front. - Optional plan(ElementCount UserVF, unsigned UserIC); + Optional plan(ElementCount UserVF, unsigned UserIC, + GeneratedRTChecks &Checks); /// Use the VPlan-native path to plan how to best vectorize, return the best /// VF and its cost. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -198,10 +198,13 @@ "value are vectorized only if no scalar iteration overheads " "are incurred.")); -static cl::opt PragmaVectorizeMemoryCheckThreshold( - "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, - cl::desc("The maximum allowed number of runtime memory checks with a " - "vectorize(enable) pragma.")); +static cl::opt VectorizeMemoryCheckFactor( + "vectorize-memory-check-factor", cl::Hidden, + cl::desc( + "When performing memory disambiguation checks at runtime, the cost of " + "the runtime memory checks themselves should not be larger than the " + "cost of of N (default 7.0) scalar loop iterations."), + cl::init(7.0)); // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, // that predication is preferred, and this lists all options. I.e., the @@ -423,9 +426,6 @@ return None; } -// Forward declare GeneratedRTChecks. -class GeneratedRTChecks; - namespace llvm { /// InnerLoopVectorizer vectorizes loops which contain only one basic @@ -1635,6 +1635,17 @@ Scalars.clear(); } + /// The vectorization cost is a combination of the cost itself and a boolean + /// indicating whether any of the contributing operations will actually + /// operate on vector values after type legalization in the backend. If this + /// latter value is false, then all operations will be scalarized (i.e. no + /// vectorization has actually taken place). + using VectorizationCostTy = std::pair; + + /// Returns the execution time cost of an instruction for a given vector + /// width. Vector width of one means scalar. + VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); + private: unsigned NumPredStores = 0; @@ -1663,13 +1674,6 @@ /// of elements. ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); - /// The vectorization cost is a combination of the cost itself and a boolean - /// indicating whether any of the contributing operations will actually - /// operate on vector values after type legalization in the backend. If this - /// latter value is false, then all operations will be scalarized (i.e. no - /// vectorization has actually taken place). - using VectorizationCostTy = std::pair; - /// Returns the expected execution cost. The unit of the cost does /// not matter because we use the 'cost' units to compare different /// vector widths. The cost that is returned is *not* normalized by @@ -1681,10 +1685,6 @@ expectedCost(ElementCount VF, SmallVectorImpl *Invalid = nullptr); - /// Returns the execution time cost of an instruction for a given vector - /// width. Vector width of one means scalar. - VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); - /// The cost-computation logic from getInstructionCost which provides /// the vector type as an output parameter. InstructionCost getInstructionCost(Instruction *I, ElementCount VF, @@ -1903,7 +1903,6 @@ /// Profitable vector factors. SmallVector ProfitableVFs; }; -} // end namespace llvm /// Helper struct to manage generating runtime checks for vectorization. /// @@ -2008,6 +2007,25 @@ } } + unsigned getCost(LoopVectorizationCostModel &CM) { + unsigned RTCheckCost = 0; + if (SCEVCheckBlock) + for (Instruction &I : *SCEVCheckBlock) { + if (SCEVCheckBlock->getTerminator() == &I) + continue; + RTCheckCost += *CM.getInstructionCost(&I, ElementCount::getFixed(1)) + .first.getValue(); + } + if (MemCheckBlock) + for (Instruction &I : *MemCheckBlock) { + if (MemCheckBlock->getTerminator() == &I) + continue; + RTCheckCost += *CM.getInstructionCost(&I, ElementCount::getFixed(1)) + .first.getValue(); + } + return RTCheckCost; + } + /// Remove the created SCEV & memory runtime check blocks & instructions, if /// unused. ~GeneratedRTChecks() { @@ -2106,6 +2124,7 @@ return MemCheckBlock; } }; +} // end namespace llvm // Return true if \p OuterLp is an outer loop annotated with hints for explicit // vectorization. The loop needs to be annotated with #pragma omp simd @@ -3305,7 +3324,6 @@ } BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { - BasicBlock *const SCEVCheckBlock = RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); if (!SCEVCheckBlock) @@ -6042,7 +6060,8 @@ assert(VFCandidates.count(ElementCount::getFixed(1)) && "Expected Scalar VF to be a candidate"); - const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); + const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, + ExpectedCost); VectorizationFactor ChosenFactor = ScalarCost; bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; @@ -6060,7 +6079,7 @@ continue; VectorizationCostTy C = expectedCost(i, &InvalidCosts); - VectorizationFactor Candidate(i, C.first); + VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); LLVM_DEBUG( dbgs() << "LV: Vector loop of width " << i << " costs: " << (Candidate.Cost / Candidate.Width.getKnownMinValue()) @@ -6251,7 +6270,7 @@ LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); if (LVP.hasPlanWithVFs( {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) - return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; + return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0, 0}; else { LLVM_DEBUG( dbgs() @@ -8080,7 +8099,7 @@ if (VPlanBuildStressTest) return VectorizationFactor::Disabled(); - return {VF, 0 /*Cost*/}; + return {VF, 0 /*Cost*/, 0}; } LLVM_DEBUG( @@ -8090,7 +8109,8 @@ } Optional -LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { +LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC, + GeneratedRTChecks &Checks) { assert(OrigLoop->isInnermost() && "Inner loop expected."); FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. @@ -8123,7 +8143,8 @@ CM.collectInLoopReductions(); buildVPlansWithVPRecipes(UserVF, UserVF); LLVM_DEBUG(printPlans(dbgs())); - return {{UserVF, 0}}; + Checks.Create(OrigLoop, *Legal->getLAI(), PSE.getUnionPredicate()); + return {{UserVF, 0, 0}}; } else reportVectorizationInfo("UserVF ignored because of invalid costs.", "InvalidCost", ORE, OrigLoop); @@ -8159,15 +8180,14 @@ // Select the optimal vectorization factor. auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); + if (!SelectedVF.Width.isScalar()) + Checks.Create(OrigLoop, *Legal->getLAI(), PSE.getUnionPredicate()); + // Check if it is profitable to vectorize with runtime checks. - unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); - if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { - bool PragmaThresholdReached = - NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; - bool ThresholdReached = - NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; - if ((ThresholdReached && !Hints.allowReordering()) || - PragmaThresholdReached) { + if (SelectedVF.Width.getKnownMinValue() > 1 && + Requirements.getNumRuntimePointerChecks()) { + if (Checks.getCost(CM) > + VectorizeMemoryCheckFactor * (*SelectedVF.ScalarCost.getValue())) { ORE->emit([&]() { return OptimizationRemarkAnalysisAliasing( DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), @@ -8177,7 +8197,7 @@ }); LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); Hints.emitRemarkWithHints(); - return VectorizationFactor::Disabled(); + return None; } } return SelectedVF; @@ -10279,8 +10299,10 @@ ElementCount UserVF = Hints.getWidth(); unsigned UserIC = Hints.getInterleave(); + GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, + F->getParent()->getDataLayout()); // Plan how to best vectorize, return the best VF and its cost. - Optional MaybeVF = LVP.plan(UserVF, UserIC); + Optional MaybeVF = LVP.plan(UserVF, UserIC, Checks); VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; @@ -10376,13 +10398,6 @@ bool DisableRuntimeUnroll = false; MDNode *OrigLoopID = L->getLoopID(); { - // Optimistically generate runtime checks. Drop them if they turn out to not - // be profitable. Limit the scope of Checks, so the cleanup happens - // immediately after vector codegeneration is done. - GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, - F->getParent()->getDataLayout()); - if (!VF.Width.isScalar() || IC > 1) - Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); LVP.setBestPlan(VF.Width, IC); using namespace ore; diff --git a/llvm/test/LTO/X86/diagnostic-handler-remarks.ll b/llvm/test/LTO/X86/diagnostic-handler-remarks.ll --- a/llvm/test/LTO/X86/diagnostic-handler-remarks.ll +++ b/llvm/test/LTO/X86/diagnostic-handler-remarks.ll @@ -5,6 +5,7 @@ ; Confirm that there are -pass-remarks. ; RUN: llvm-lto -use-new-pm=false \ +; RUN: -vectorize-memory-check-factor=0 \ ; RUN: -pass-remarks=inline \ ; RUN: -exported-symbol _func2 -pass-remarks-analysis=loop-vectorize \ ; RUN: -exported-symbol _main -o %t.o %t.bc 2>&1 | \ @@ -12,6 +13,7 @@ ; RUN: llvm-nm %t.o | FileCheck %s -check-prefix NM ; RUN: llvm-lto -use-new-pm=false \ +; RUN: -vectorize-memory-check-factor=0 \ ; RUN: -pass-remarks=inline -use-diagnostic-handler \ ; RUN: -exported-symbol _func2 -pass-remarks-analysis=loop-vectorize \ ; RUN: -exported-symbol _main -o %t.o %t.bc 2>&1 | \ diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll b/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll @@ -1,14 +1,15 @@ -; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -S %s | FileCheck %s +; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -S %s | FileCheck --check-prefix=CHECK --check-prefix=DEFAULT %s +; RUN: opt -loop-vectorize -vectorize-memory-check-factor=1 -mtriple=arm64-apple-iphoneos -S %s | FileCheck --check-prefix=CHECK --check-prefix=CUSTOM %s -; Tests for loops with large numbers of runtime checks. Check that loops are -; vectorized, if the loop trip counts are large and the impact of the runtime -; checks is very small compared to the expected loop runtimes. +; All of the loops here have sufficiently-large loop bodies, +; so the additional cost of the runtime memory checks is not too large, +; so we vectorize them. - -; The trip count in the loop in this function is too to warrant large runtime checks. ; CHECK-LABEL: define {{.*}} @test_tc_too_small -; CHECK-NOT: vector.memcheck -; CHECK-NOT: vector.body +; DEFAULT: vector.memcheck +; DEFAULT: vector.body +; CUSTOM-NOT: vector.memcheck +; CUSTOM-NOT: vector.body define void @test_tc_too_small(i16* %ptr.1, i16* %ptr.2, i16* %ptr.3, i16* %ptr.4, i64 %off.1, i64 %off.2) { entry: br label %loop @@ -57,11 +58,11 @@ ret void } -; FIXME -; The trip count in the loop in this function high enough to warrant large runtime checks. ; CHECK-LABEL: define {{.*}} @test_tc_big_enough -; CHECK-NOT: vector.memcheck -; CHECK-NOT: vector.body +; DEFAULT: vector.memcheck +; DEFAULT: vector.body +; CUSTOM-NOT: vector.memcheck +; CUSTOM-NOT: vector.body define void @test_tc_big_enough(i16* %ptr.1, i16* %ptr.2, i16* %ptr.3, i16* %ptr.4, i64 %off.1, i64 %off.2) { entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll b/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll --- a/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -loop-vectorize -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=OVERRIDE -; RUN: opt < %s -loop-vectorize -pragma-vectorize-memory-check-threshold=6 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s +; RUN: opt < %s -loop-vectorize -vectorize-memory-check-factor=2 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=CHECK +; RUN: opt < %s -loop-vectorize -vectorize-memory-check-factor=1 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=OVERRIDE target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -8,10 +8,10 @@ ; First loop produced diagnostic pass remark. ;CHECK: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 2) ; Second loop produces diagnostic analysis remark. -;CHECK: remark: {{.*}}:0:0: loop not vectorized: cannot prove it is safe to reorder memory operations +;CHECK: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 1) ; First loop produced diagnostic pass remark. -;OVERRIDE: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 2) +;OVERRIDE: remark: {{.*}}:0:0: loop not vectorized: cannot prove it is safe to reorder memory operations ; Second loop produces diagnostic pass remark. ;OVERRIDE: remark: {{.*}}:0:0: loop not vectorized: cannot prove it is safe to reorder memory operations @@ -20,7 +20,7 @@ ;CHECK: <4 x i32> ;CHECK: ret ;OVERRIDE-LABEL: func1x6( -;OVERRIDE: <4 x i32> +;OVERRIDE-NOT: <4 x i32> ;OVERRIDE: ret define i32 @func1x6(i32* nocapture %out, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C, i32* nocapture %D, i32* nocapture %E, i32* nocapture %F) { entry: @@ -54,7 +54,7 @@ ; We are not vectorizing with 12 runtime checks. ;CHECK-LABEL: func2x6( -;CHECK-NOT: <4 x i32> +;CHECK: <4 x i32> ;CHECK: ret ; We vectorize with 12 checks if a vectorization hint is provided. ;OVERRIDE-LABEL: func2x6(