diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2007,6 +2007,25 @@
     }
   }
 
+  InstructionCost getCost(LoopVectorizationCostModel &CM) {
+    InstructionCost RTCheckCost = 0;
+    if (SCEVCheckBlock)
+      for (Instruction &I : *SCEVCheckBlock) {
+        if (SCEVCheckBlock->getTerminator() == &I)
+          continue;
+        RTCheckCost +=
+            CM.getInstructionCost(&I, ElementCount::getFixed(1)).first;
+      }
+    if (MemCheckBlock)
+      for (Instruction &I : *MemCheckBlock) {
+        if (MemCheckBlock->getTerminator() == &I)
+          continue;
+        RTCheckCost +=
+            CM.getInstructionCost(&I, ElementCount::getFixed(1)).first;
+      }
+    return RTCheckCost;
+  }
+
   /// Remove the created SCEV & memory runtime check blocks & instructions, if
   /// unused.
   ~GeneratedRTChecks() {
@@ -3304,7 +3323,6 @@
 }
 
 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
-
   BasicBlock *const SCEVCheckBlock =
       RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
   if (!SCEVCheckBlock)
@@ -8164,7 +8182,29 @@
   if (!SelectedVF.Width.isScalar())
     Checks.Create(OrigLoop, *Legal->getLAI(), PSE.getUnionPredicate());
 
+  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
   // Check if it is profitable to vectorize with runtime checks.
+  if (!ForceVectorization && SelectedVF.Width.getKnownMinValue() > 1) {
+    if (auto ExpectedTC = getSmallBestKnownTC(*PSE.getSE(), OrigLoop)) {
+      InstructionCost RTCost = Checks.getCost(CM);
+      // The total scalar cost is ScalarCost * ExpectedTC and the total vector
+      // cost is RTCost + (VectorCost / Width) * ExpectedTC. To avoid dividing
+      // VectorCost by a small Width, we multiply ScalarCost by Width instead,
+      // and divide RTCost by ExpectedTC to avoid scaling by a large trip count.
+      InstructionCost ScalarCost =
+          SelectedVF.ScalarCost * SelectedVF.Width.getKnownMinValue();
+      if (ScalarCost <= (RTCost / double(*ExpectedTC) + SelectedVF.Cost)) {
+        LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial due to "
+                             "runtime check cost (scalar cost ("
+                          << ScalarCost << ") <= runtime check + vector cost ("
+                          << (RTCost / double(*ExpectedTC) + SelectedVF.Cost)
+                          << "))\n");
+
+        return None;
+      }
+    }
+  }
+
   unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
   if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
     bool PragmaThresholdReached =
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll b/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll
@@ -1,4 +1,6 @@
-; RUN: opt -runtime-memory-check-threshold=9 -passes='loop-vectorize' -mtriple=x86_64-unknown-linux -S %s | FileCheck %s
+; REQUIRES: asserts
+
+; RUN: opt -runtime-memory-check-threshold=9 -passes='loop-vectorize' -mtriple=x86_64-unknown-linux -S -debug %s 2>&1 | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
@@ -10,9 +12,13 @@
 ; than running the scalar loop.
 ; TODO: should not be vectorized.
 define void @test(double* nocapture %A, double* nocapture %B, double* nocapture %C, double* nocapture %D, double* nocapture %E) {
+; CHECK: LV: Vectorization is not beneficial due to runtime check cost
+;
 ; CHECK-LABEL: @test(
-; CHECK: vector.memcheck
-; CHECK: vector.body
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   br label %for.body
+; CHECK-NOT: vector.memcheck
+; CHECK-NOT: vector.body
 ;
 entry:
   br label %for.body
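
For reference, the comparison introduced in the LoopVectorize.cpp hunk can be sanity-checked in isolation. The sketch below is not part of the patch: it restates the profitability test with plain integers instead of InstructionCost and VectorizationFactor, uses integer division where the patch divides by double(*ExpectedTC), and all costs and the trip count are made-up example values.

#include <cstdint>
#include <cstdio>

// Returns true when vectorizing is NOT beneficial: the scalar cost scaled by
// the vectorization width is no larger than the amortized runtime-check cost
// plus the vector loop body cost (mirroring ScalarCost * Width <=
// RTCost / ExpectedTC + VectorCost from the patch).
static bool vectorizationNotBeneficial(uint64_t ScalarCost, uint64_t VectorCost,
                                       uint64_t RTCheckCost, uint64_t Width,
                                       uint64_t ExpectedTC) {
  return ScalarCost * Width <= RTCheckCost / ExpectedTC + VectorCost;
}

int main() {
  // Example: the scalar body costs 4 per iteration, the VF=4 vector body
  // costs 20 per vector iteration, the runtime checks cost 40, and the trip
  // count is known to be 8. Then 4 * 4 = 16 <= 40 / 8 + 20 = 25, so the
  // vector loop plus its checks would not pay off.
  bool Skip = vectorizationNotBeneficial(/*ScalarCost=*/4, /*VectorCost=*/20,
                                         /*RTCheckCost=*/40, /*Width=*/4,
                                         /*ExpectedTC=*/8);
  std::printf("skip vectorization: %s\n", Skip ? "yes" : "no");
  return 0;
}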