diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -202,9 +202,10 @@ Function *F, std::function *GetLAA, LoopInfo *LI, OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R, LoopVectorizeHints *H, DemandedBits *DB, - AssumptionCache *AC) + AssumptionCache *AC, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) : TheLoop(L), LI(LI), PSE(PSE), TTI(TTI), TLI(TLI), DT(DT), - GetLAA(GetLAA), ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC) {} + GetLAA(GetLAA), ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC), + BFI(BFI), PSI(PSI) {} /// ReductionList contains the reduction descriptors for all /// of the reductions that were found in the loop. @@ -478,6 +479,10 @@ /// Assume instructions in predicated blocks must be dropped if the CFG gets /// flattened. SmallPtrSet ConditionalAssumes; + + /// BFI and PSI are used to check for profile guided size optimizations. + BlockFrequencyInfo *BFI; + ProfileSummaryInfo *PSI; }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/Transforms/Utils/SizeOpts.h" #include "llvm/Transforms/Vectorize/LoopVectorize.h" using namespace llvm; @@ -412,7 +413,11 @@ const ValueToValueMap &Strides = getSymbolicStrides() ? 
*getSymbolicStrides() : ValueToValueMap(); - bool CanAddPredicate = !TheLoop->getHeader()->getParent()->hasOptSize(); + Function *F = TheLoop->getHeader()->getParent(); + bool OptForSize = F->hasOptSize() || + llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI, + PGSOQueryType::IRPass); + bool CanAddPredicate = !OptForSize; int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, CanAddPredicate, false); if (Stride == 1 || Stride == -1) return Stride; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -395,11 +395,19 @@ const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, unsigned VecWidth, unsigned UnrollFactor, LoopVectorizationLegality *LVL, - LoopVectorizationCostModel *CM) + LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI) : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), - VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {} + VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM), + BFI(BFI), PSI(PSI) { + // Query this against the original loop and save it here because the profile + // of the original loop header may change as the transformation happens. + OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( + OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); + } + virtual ~InnerLoopVectorizer() = default; /// Create a new empty loop. Unlink the old loop and connect the new one. @@ -779,6 +787,14 @@ // Vector of original scalar PHIs whose corresponding widened PHIs need to be // fixed up at the end of vector code generation. SmallVector OrigPHIsToFix; + + /// BFI and PSI are used to check for profile guided size optimizations. 
+ BlockFrequencyInfo *BFI; + ProfileSummaryInfo *PSI; + + // Whether this loop should be optimized for size based on profile guided size + // optimizations. + bool OptForSizeBasedOnProfile; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -789,9 +805,10 @@ const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, LoopVectorizationLegality *LVL, - LoopVectorizationCostModel *CM) + LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1, - UnrollFactor, LVL, CM) {} + UnrollFactor, LVL, CM, BFI, PSI) {} private: Value *getBroadcastInstrs(Value *V) override; @@ -2754,7 +2771,8 @@ if (C->isZero()) return; - assert(!SCEVCheckBlock->getParent()->hasOptSize() && + assert(!(SCEVCheckBlock->getParent()->hasOptSize() || + OptForSizeBasedOnProfile) && "Cannot SCEV check stride or overflow when optimizing for size"); SCEVCheckBlock->setName("vector.scevcheck"); @@ -2800,7 +2818,7 @@ assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " "claimed checks are required"); - if (MemCheckBlock->getParent()->hasOptSize()) { + if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && "Cannot emit memory checks when optimizing for size, unless forced " "to vectorize."); @@ -7729,7 +7747,7 @@ LVP.setBestPlan(VF.Width, 1); InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, - &CM); + &CM, BFI, PSI); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); LVP.executePlan(LB, DT); @@ -7793,7 +7811,7 @@ // Check if it is legal to vectorize the loop. 
LoopVectorizationRequirements Requirements(*ORE); LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, - &Requirements, &Hints, DB, AC); + &Requirements, &Hints, DB, AC, BFI, PSI); if (!LVL.canVectorize(EnableVPlanNativePath)) { LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); Hints.emitRemarkWithHints(); @@ -7993,8 +8011,8 @@ assert(IC > 1 && "interleave count should not be 1 or 0"); // If we decided that it is not legal to vectorize the loop, then // interleave it. - InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, - &CM); + InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, + BFI, PSI); LVP.executePlan(Unroller, DT); ORE->emit([&]() { @@ -8006,7 +8024,7 @@ } else { // If we decided that it is *legal* to vectorize the loop, then do it. InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, - &LVL, &CM); + &LVL, &CM, BFI, PSI); LVP.executePlan(LB, DT); ++LoopsVectorized; diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll --- a/llvm/test/Transforms/LoopVectorize/optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/optsize.ll @@ -121,6 +121,38 @@ br i1 %cmp26, label %for.body29, label %for.cond.cleanup28 } +define void @pr43371_pgso() !prof !14 { +; +; CHECK-LABEL: @pr43371_pgso +; CHECK-NOT: vector.scevcheck +; +; We do not want to generate SCEV predicates when optimising for size, because +; that will lead to extra code generation such as the SCEV overflow runtime +; checks. 
Not generating SCEV predicates can still result in vectorisation as +; the non-consecutive loads/stores can be scalarized: +; +; CHECK: vector.body: +; CHECK: store i16 0, i16* %{{.*}}, align 1 +; CHECK: store i16 0, i16* %{{.*}}, align 1 +; CHECK: br i1 {{.*}}, label %vector.body +; +entry: + br label %for.body29 + +for.cond.cleanup28: + unreachable + +for.body29: + %i24.0170 = phi i16 [ 0, %entry], [ %inc37, %for.body29] + %add33 = add i16 undef, %i24.0170 + %idxprom34 = zext i16 %add33 to i32 + %arrayidx35 = getelementptr [2592 x i16], [2592 x i16] * @cm_array, i32 0, i32 %idxprom34 + store i16 0, i16 * %arrayidx35, align 1 + %inc37 = add i16 %i24.0170, 1 + %cmp26 = icmp ult i16 %inc37, 756 + br i1 %cmp26, label %for.body29, label %for.cond.cleanup28 +} + ; PR45526: don't vectorize with fold-tail if first-order-recurrence is live-out. ; define i32 @pr45526() optsize { @@ -154,6 +186,37 @@ ret i32 %for } +define i32 @pr45526_pgso() !prof !14 { +; +; CHECK-LABEL: @pr45526_pgso +; CHECK-NEXT: entry: +; CHECK-NEXT: br label %loop +; CHECK-EMPTY: +; CHECK-NEXT: loop: +; CHECK-NEXT: %piv = phi i32 [ 0, %entry ], [ %pivPlus1, %loop ] +; CHECK-NEXT: %for = phi i32 [ 5, %entry ], [ %pivPlus1, %loop ] +; CHECK-NEXT: %pivPlus1 = add nuw nsw i32 %piv, 1 +; CHECK-NEXT: %cond = icmp ult i32 %piv, 510 +; CHECK-NEXT: br i1 %cond, label %loop, label %exit +; CHECK-EMPTY: +; CHECK-NEXT: exit: +; CHECK-NEXT: %for.lcssa = phi i32 [ %for, %loop ] +; CHECK-NEXT: ret i32 %for.lcssa +; +entry: + br label %loop + +loop: + %piv = phi i32 [ 0, %entry ], [ %pivPlus1, %loop ] + %for = phi i32 [ 5, %entry ], [ %pivPlus1, %loop ] + %pivPlus1 = add nuw nsw i32 %piv, 1 + %cond = icmp ult i32 %piv, 510 + br i1 %cond, label %loop, label %exit + +exit: + ret i32 %for +} + ; PR46228: Vectorize w/o versioning for unit stride under optsize and enabled ; vectorization. 
@@ -190,7 +253,7 @@ ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1026 -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !19 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !21 ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: