diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1232,10 +1232,11 @@
                              AssumptionCache *AC,
                              OptimizationRemarkEmitter *ORE, const Function *F,
                              const LoopVectorizeHints *Hints,
-                             InterleavedAccessInfo &IAI)
+                             InterleavedAccessInfo &IAI,
+                             GeneratedRTChecks &RTChecks)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
-        Hints(Hints), InterleaveInfo(IAI) {}
+        Hints(Hints), InterleaveInfo(IAI), RTChecks(RTChecks) {}
 
   /// \return An upper bound for the vectorization factors (both fixed and
   /// scalable). If the factors are 0, vectorization and interleaving should be
@@ -1640,6 +1641,17 @@
     Scalars.clear();
   }
 
+  /// The vectorization cost is a combination of the cost itself and a boolean
+  /// indicating whether any of the contributing operations will actually
+  /// operate on vector values after type legalization in the backend. If this
+  /// latter value is false, then all operations will be scalarized (i.e. no
+  /// vectorization has actually taken place).
+  using VectorizationCostTy = std::pair<InstructionCost, bool>;
+
+  /// Returns the execution time cost of an instruction for a given vector
+  /// width. Vector width of one means scalar.
+  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF) const;
+
 private:
   unsigned NumPredStores = 0;
 
@@ -1668,13 +1680,6 @@
   /// of elements.
   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
 
-  /// The vectorization cost is a combination of the cost itself and a boolean
-  /// indicating whether any of the contributing operations will actually
-  /// operate on vector values after type legalization in the backend. If this
-  /// latter value is false, then all operations will be scalarized (i.e. no
-  /// vectorization has actually taken place).
-  using VectorizationCostTy = std::pair<InstructionCost, bool>;
-
   /// Returns the expected execution cost. The unit of the cost does
   /// not matter because we use the 'cost' units to compare different
   /// vector widths. The cost that is returned is *not* normalized by
@@ -1686,10 +1691,6 @@
   expectedCost(ElementCount VF,
                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr) const;
 
-  /// Returns the execution time cost of an instruction for a given vector
-  /// width. Vector width of one means scalar.
-  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF) const;
-
   /// The cost-computation logic from getInstructionCost which provides
   /// the vector type as an output parameter.
   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
@@ -1910,6 +1911,10 @@
   /// with the same stride and close to each other.
   InterleavedAccessInfo &InterleaveInfo;
 
+  /// Structure to hold information about generated runtime checks, responsible
+  /// for cleaning up the checks if vectorization turns out to be unprofitable.
+  GeneratedRTChecks &RTChecks;
+
   /// Values to ignore in the cost model.
   SmallPtrSet<const Value *, 16> ValuesToIgnore;
 
@@ -1963,6 +1968,8 @@
   /// Tracks whether SCEV expressions have been expanded internally.
   bool IsSCEVChecksExpanded = false;
 
+  // Cached cost of expanded SCEV expressions.
+  InstructionCost RTCheckCost = InstructionCost::getInvalid();
 public:
   GeneratedRTChecks(PredicatedScalarEvolution &PSE,
                     LoopVectorizationLegality &LVL, DominatorTree *DT,
@@ -2073,6 +2080,31 @@
     return MemCheckBlock;
   }
 
+  InstructionCost getCost(const LoopVectorizationCostModel &CM) {
+    if (RTCheckCost.isValid())
+      return RTCheckCost;
+
+    expandSCEVChecks();
+    RTCheckCost.setValid();
+
+    if (SCEVCheckBlock)
+      for (Instruction &I : *SCEVCheckBlock) {
+        if (SCEVCheckBlock->getTerminator() == &I)
+          continue;
+        RTCheckCost +=
+            CM.getInstructionCost(&I, ElementCount::getFixed(1)).first;
+      }
+    if (MemCheckBlock)
+      for (Instruction &I : *MemCheckBlock) {
+        if (MemCheckBlock->getTerminator() == &I)
+          continue;
+        RTCheckCost +=
+            CM.getInstructionCost(&I, ElementCount::getFixed(1)).first;
+      }
+    assert(RTCheckCost.isValid() && "Unexpected invalid runtime checks cost");
+    return RTCheckCost;
+  }
+
 private:
   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
   /// accurately estimate the cost of the runtime checks. The blocks are
@@ -6092,7 +6124,7 @@
   }
 
   if (IsMainLoop) {
-    InstructionCost RTChecksCost = 0;
+    InstructionCost RTChecksCost = RTChecks.getCost(*this);
     InstructionCost FixedCostA = A.Width.isScalar() ? 0 : RTChecksCost;
     InstructionCost FixedCostB = B.Width.isScalar() ? 0 : RTChecksCost;
     FixedCostDiff = {128, (uint64_t)(*(FixedCostB - FixedCostA).getValue()),
@@ -10146,7 +10178,7 @@
   GeneratedRTChecks RTChecks(PSE, *LVL, DT, LI, L,
                              F->getParent()->getDataLayout());
   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
-                                &Hints, IAI);
+                                &Hints, IAI, RTChecks);
   // Use the planner for outer loop vectorization.
   // TODO: CM is not used at this point inside the planner. Turn CM into an
   // optional argument if we don't need it in the future.
@@ -10383,7 +10415,7 @@
                              F->getParent()->getDataLayout());
   // Use the cost model.
   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
-                                F, &Hints, IAI);
+                                F, &Hints, IAI, RTChecks);
   CM.collectValuesToIgnore();
   CM.collectElementTypesForWidening();
 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll b/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll
@@ -0,0 +1,64 @@
+; REQUIRES: asserts
+
+; RUN: opt -runtime-memory-check-threshold=9 -passes='loop-vectorize' -mtriple=x86_64-unknown-linux -S -debug %s 2>&1 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+target triple = "x86_64-unknown-linux"
+
+declare double @llvm.pow.f64(double, double)
+
+; Test case where the memory runtime checks and vector body are more expensive
+; than running the scalar loop.
+define void @test(double* nocapture %A, double* nocapture %B, double* nocapture %C, double* nocapture %D, double* nocapture %E) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label %for.body
+; CHECK-NOT: vector.memcheck
+; CHECK-NOT: vector.body
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %gep.A = getelementptr inbounds double, double* %A, i64 %iv
+  %l.A = load double, double* %gep.A, align 4
+  store double 0.0, double* %gep.A, align 4
+  %p.1 = call double @llvm.pow.f64(double %l.A, double 2.0)
+
+  %gep.B = getelementptr inbounds double, double* %B, i64 %iv
+  %l.B = load double, double* %gep.B, align 4
+  %p.2 = call double @llvm.pow.f64(double %l.B, double %p.1)
+  store double 0.0, double* %gep.B, align 4
+
+  %gep.C = getelementptr inbounds double, double* %C, i64 %iv
+  %l.C = load double, double* %gep.C, align 4, !noalias !5
+  %p.3 = call double @llvm.pow.f64(double %p.1, double %l.C)
+
+  %gep.D = getelementptr inbounds double, double* %D, i64 %iv
+  %l.D = load double, double* %gep.D
+  %p.4 = call double @llvm.pow.f64(double %p.3, double %l.D)
+  %p.5 = call double @llvm.pow.f64(double %p.4, double %p.3)
+  %mul = fmul double 2.0, %p.5
+  %mul.2 = fmul double %mul, 2.0
+  %mul.3 = fmul double %mul, %mul.2
+  %gep.E = getelementptr inbounds double, double* %E, i64 %iv
+  store double %mul.3, double* %gep.E, align 4, !alias.scope !5
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 16
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+!0 = !{!0}
+!1 = !{!1}
+
+; Some scopes in these domains:
+!2 = !{!2, !0}
+!4 = !{!4, !1}
+
+!5 = !{!4} ; A list containing only scope !4
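
For intuition, the hunk at @@ -6092 charges RTChecks.getCost(*this) as a one-off
fixed cost that only vectorized plans pay, so expensive checks can tip a
short-trip-count loop (like the 16-iteration loop in the new test) back to the
scalar plan. The standalone C++ sketch below illustrates that comparison; the
Candidate type, the cost numbers, and this simplified isMoreProfitable are
illustrative stand-ins, not the actual LoopVectorize implementation.

#include <cstdint>
#include <iostream>

// Candidate plan: a width (1 == scalar) and the estimated per-iteration
// cost of the loop body at that width.
struct Candidate {
  unsigned Width;
  double CostPerIter;
};

// Returns true if A beats B over TripCount iterations. RTChecksCost is a
// one-off cost charged only to plans that actually vectorize, mirroring
// FixedCostA/FixedCostB in the hunk at @@ -6092.
static bool isMoreProfitable(const Candidate &A, const Candidate &B,
                             double RTChecksCost, uint64_t TripCount) {
  double FixedCostA = A.Width == 1 ? 0 : RTChecksCost;
  double FixedCostB = B.Width == 1 ? 0 : RTChecksCost;
  return FixedCostA + A.CostPerIter * TripCount <
         FixedCostB + B.CostPerIter * TripCount;
}

int main() {
  Candidate Scalar{/*Width=*/1, /*CostPerIter=*/10.0};
  Candidate Vec4{/*Width=*/4, /*CostPerIter=*/4.0};
  // Cheap checks: the vector plan wins (64 + 5 < 160). Prints 1.
  std::cout << isMoreProfitable(Vec4, Scalar, 5.0, 16) << '\n';
  // Expensive checks on a short 16-iteration loop: the scalar loop wins
  // (64 + 200 > 160) and vectorization is rejected. Prints 0.
  std::cout << isMoreProfitable(Vec4, Scalar, 200.0, 16) << '\n';
}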