Index: llvm/lib/Transforms/Utils/LoopPeel.cpp =================================================================== --- llvm/lib/Transforms/Utils/LoopPeel.cpp +++ llvm/lib/Transforms/Utils/LoopPeel.cpp @@ -460,27 +460,30 @@ if (L->getHeader()->getParent()->hasProfileData()) { if (violatesLegacyMultiExitLoopCheck(L)) return; - Optional<unsigned> PeelCount = getLoopEstimatedTripCount(L); - if (!PeelCount) + Optional<unsigned> EstimatedTripCount = getLoopEstimatedTripCount(L); + if (!EstimatedTripCount) return; - LLVM_DEBUG(dbgs() << "Profile-based estimated trip count is " << *PeelCount - << "\n"); - - if (*PeelCount) { - if ((*PeelCount + AlreadyPeeled <= UnrollPeelMaxCount) && - (LoopSize * (*PeelCount + 1) <= Threshold)) { - LLVM_DEBUG(dbgs() << "Peeling first " << *PeelCount - << " iterations.\n"); - PP.PeelCount = *PeelCount; + LLVM_DEBUG(dbgs() << "Profile-based estimated trip count is " + << *EstimatedTripCount << "\n"); + + if (*EstimatedTripCount) { + // Even if we cannot peel all estimated iterations, partial peeling also + // seems beneficial, but only to some extent. 
+ if (*EstimatedTripCount + AlreadyPeeled <= MaxPeelCount * 2 && + AlreadyPeeled < MaxPeelCount) { + unsigned PeelCount = + std::min(*EstimatedTripCount, MaxPeelCount - AlreadyPeeled); + LLVM_DEBUG(dbgs() << "Peeling first " << PeelCount << " iterations.\n"); + PP.PeelCount = PeelCount; return; } - LLVM_DEBUG(dbgs() << "Requested peel count: " << *PeelCount << "\n"); LLVM_DEBUG(dbgs() << "Already peel count: " << AlreadyPeeled << "\n"); LLVM_DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n"); - LLVM_DEBUG(dbgs() << "Peel cost: " << LoopSize * (*PeelCount + 1) - << "\n"); + LLVM_DEBUG(dbgs() << "Loop cost: " << LoopSize << "\n"); LLVM_DEBUG(dbgs() << "Max peel cost: " << Threshold << "\n"); + LLVM_DEBUG(dbgs() << "Max peel count by cost: " + << (Threshold / LoopSize - 1) << "\n"); } } } Index: llvm/test/Transforms/LoopUnroll/peel-loop-conditions-pgo-2.ll =================================================================== --- llvm/test/Transforms/LoopUnroll/peel-loop-conditions-pgo-2.ll +++ llvm/test/Transforms/LoopUnroll/peel-loop-conditions-pgo-2.ll @@ -7,8 +7,8 @@ ; Check that we can peel off iterations that make conditions true. ; The second invocation of loop-unroll will NOT do profile based peeling of -; remained iterations because the total number of peeled iterations exceeds -; threashold specified with -unroll-peel-max-count=7. +; remaining iterations because the total number of iterations exceeds the +; doubled threshold specified with -unroll-peel-max-count=7. define void @test2(i32 %k) !prof !4 { ; CHECK: Loop Unroll: F[test2] Loop %for.body ; CHECK: PEELING loop %for.body with iteration count 2! 
@@ -39,5 +39,5 @@ } !1 = distinct !{!1} -!3 = !{!"branch_weights", i32 8, i32 1} +!3 = !{!"branch_weights", i32 14, i32 1} !4 = !{!"function_entry_count", i64 1} Index: llvm/test/Transforms/LoopUnroll/peel-loop-conditions-pgo-3.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopUnroll/peel-loop-conditions-pgo-3.ll @@ -0,0 +1,43 @@ +; RUN: opt < %s -S -loop-unroll -loop-unroll -verify-dom-info -debug-only=loop-unroll -unroll-peel-max-count=7 2>&1 | FileCheck %s +; REQUIRES: asserts + +declare void @f1() +declare void @f2() + +; Check that we can peel off iterations that make conditions true. +; The second invocation of loop-unroll will do profile-based peeling of remaining +; iterations and will partially peel them so that the total number of peeled +; iterations does not exceed the threshold specified with -unroll-peel-max-count=7. +define void @test2(i32 %k) !prof !4 { +; CHECK: Loop Unroll: F[test2] Loop %for.body +; CHECK: PEELING loop %for.body with iteration count 2! +; CHECK: PEELING loop %for.body with iteration count 5! +; CHECK: llvm.loop.unroll.disable +for.body.lr.ph: + br label %for.body + +for.body: + %i.05 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ] + %cmp1 = icmp ult i32 %i.05, 2 + br i1 %cmp1, label %if.then, label %if.else + +if.then: + call void @f1() + br label %for.inc + +if.else: + call void @f2() + br label %for.inc + +for.inc: + %inc = add nsw i32 %i.05, 1 + %cmp = icmp slt i32 %inc, %k + br i1 %cmp, label %for.body, label %for.end, !llvm.loop !1, !prof !3 + +for.end: + ret void +} + +!1 = distinct !{!1} +!3 = !{!"branch_weights", i32 13, i32 1} +!4 = !{!"function_entry_count", i64 1}