Index: include/llvm/Transforms/Utils/LoopUtils.h =================================================================== --- include/llvm/Transforms/Utils/LoopUtils.h +++ include/llvm/Transforms/Utils/LoopUtils.h @@ -260,6 +260,12 @@ /// estimate can not be made. Optional getLoopEstimatedTripCount(Loop *L); +/// Get a loop's estimated trip count based on branch weight metadata. +/// Only Latch branch is considered. +/// Returns 0 when the count is estimated to be 0, or None when a meaningful +/// estimate can not be made. +Optional getLoopEstimatedTripCountBasingOnLatchOnly(Loop *L); + /// Check inner loop (L) backedge count is known to be invariant on all /// iterations of its outer loop. If the loop has no parent, this is trivially /// true. Index: lib/Transforms/Utils/LoopUnrollPeel.cpp =================================================================== --- lib/Transforms/Utils/LoopUnrollPeel.cpp +++ lib/Transforms/Utils/LoopUnrollPeel.cpp @@ -61,18 +61,48 @@ "unroll-force-peel-count", cl::init(0), cl::Hidden, cl::desc("Force a peel count regardless of profiling information.")); +static cl::opt +UnrollPeelMultiExit("unroll-peel-multi-exit", cl::init(false), cl::Hidden, + cl::desc("Allow peeling of loops with multiple exits.")); + // Designates that a Phi is estimated to become invariant after an "infinite" // number of loop iterations (i.e. only may become an invariant if the loop is // fully unrolled). static const unsigned InfiniteIterationsToInvariance = std::numeric_limits::max(); +// Returns true if all non latch exits ends up with a deopt. +static bool allNonLatchExitsDeoptimize( + const BasicBlock *Latch, + SmallVectorImpl > & + ExitEdges) { + return all_of(ExitEdges, + [Latch]( + std::pair Edge) { + return Latch == Edge.first || Edge.second->getTerminatingDeoptimizeCall(); + }); +} + // Check whether we are capable of peeling this loop. bool llvm::canPeel(Loop *L) { // Make sure the loop is in simplified form if (!L->isLoopSimplifyForm()) return false; + if (UnrollPeelMultiExit) { + SmallVector, 4> ExitEdges; + L->getExitEdges(ExitEdges); + + if (ExitEdges.size() > 1) { + // Latch's terminator is a conditional branch, Latch is exiting and + // all non Latch exits ends up with deoptimize. + const BasicBlock *Latch = L->getLoopLatch(); + const BranchInst *T = dyn_cast(Latch->getTerminator()); + return T && T->isConditional() && L->isLoopExiting(Latch) && + allNonLatchExitsDeoptimize(Latch, ExitEdges); + } + } + // Only peel loops that contain a single exit if (!L->getExitingBlock() || !L->getUniqueExitBlock()) return false; @@ -316,7 +346,8 @@ // We only do this in the presence of profile information, since otherwise // our estimates of the trip count are not reliable enough. if (L->getHeader()->getParent()->hasProfileData()) { - Optional PeelCount = getLoopEstimatedTripCount(L); + Optional PeelCount = + getLoopEstimatedTripCountBasingOnLatchOnly(L); if (!PeelCount) return; Index: lib/Transforms/Utils/LoopUtils.cpp =================================================================== --- lib/Transforms/Utils/LoopUtils.cpp +++ lib/Transforms/Utils/LoopUtils.cpp @@ -624,6 +624,10 @@ // Only support loops with a unique exiting block, and a latch. if (!L->getExitingBlock()) return None; + return getLoopEstimatedTripCountBasingOnLatchOnly(L); +} + +Optional llvm::getLoopEstimatedTripCountBasingOnLatchOnly(Loop *L) { // Get the branch weights for the loop's backedge. BranchInst *LatchBR = Index: test/Transforms/LoopUnroll/peel-loop-pgo-deopt.ll =================================================================== --- /dev/null +++ test/Transforms/LoopUnroll/peel-loop-pgo-deopt.ll @@ -0,0 +1,80 @@ +; RUN: opt < %s -S -debug-only=loop-unroll -loop-unroll -unroll-runtime -unroll-peel-multi-exit 2>&1 | FileCheck %s +; RUN: opt < %s -S -debug-only=loop-unroll -unroll-peel-multi-exit -passes='require,function(require,unroll)' 2>&1 | FileCheck %s + +; Make sure we use the profile information correctly to peel-off 3 iterations +; from the loop, and update the branch weights for the peeled loop properly. + +; CHECK: Loop Unroll: F[basic] +; CHECK: PEELING loop %for.body with iteration count 3! + +; CHECK-LABEL: @basic +; CHECK: br i1 %{{.*}}, label %[[NEXT0:.*]], label %for.cond.for.end_crit_edge, !prof !16 +; CHECK: [[NEXT0]]: +; CHECK: br i1 %{{.*}}, label %[[NEXT1:.*]], label %for.cond.for.end_crit_edge, !prof !17 +; CHECK: [[NEXT1]]: +; CHECK: br i1 %{{.*}}, label %[[NEXT2:.*]], label %for.cond.for.end_crit_edge, !prof !18 +; CHECK: [[NEXT2]]: +; CHECK: br i1 %{{.*}}, label %for.body, label %{{.*}}, !prof !19 + +define i32 @basic(i32* %p, i32 %k, i1 %c) #0 !prof !15 { +entry: + %cmp3 = icmp slt i32 0, %k + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.05 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %continue ] + %p.addr.04 = phi i32* [ %p, %for.body.lr.ph ], [ %incdec.ptr, %continue ] + %incdec.ptr = getelementptr inbounds i32, i32* %p.addr.04, i32 1 + store i32 %i.05, i32* %p.addr.04, align 4 + %inc = add nsw i32 %i.05, 1 + %cmp = icmp slt i32 %inc, %k + br i1 %c, label %continue, label %side_exit, !prof !17 + +continue: + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge, !prof !16 + +for.cond.for.end_crit_edge: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry + %res = phi i32 [ 0, %entry ], [ %inc, %for.cond.for.end_crit_edge ] + ret i32 %res + +side_exit: + %rval = call i32(...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 %inc) ] + ret i32 %rval +} + +declare i32 @llvm.experimental.deoptimize.i32(...) + +attributes #0 = { nounwind } +attributes #1 = { nounwind optsize } + +!llvm.module.flags = !{!1} + +!1 = !{i32 1, !"ProfileSummary", !2} +!2 = !{!3, !4, !5, !6, !7, !8, !9, !10} +!3 = !{!"ProfileFormat", !"InstrProf"} +!4 = !{!"TotalCount", i64 10} +!5 = !{!"MaxCount", i64 3} +!6 = !{!"MaxInternalCount", i64 1} +!7 = !{!"MaxFunctionCount", i64 3} +!8 = !{!"NumCounts", i64 2} +!9 = !{!"NumFunctions", i64 2} +!10 = !{!"DetailedSummary", !11} +!11 = !{!12, !13, !14} +!12 = !{i32 10000, i64 3, i32 2} +!13 = !{i32 999000, i64 1, i32 10} +!14 = !{i32 999999, i64 1, i32 10} +!15 = !{!"function_entry_count", i64 1} +!16 = !{!"branch_weights", i32 3001, i32 1001} +!17 = !{!"branch_weights", i32 1, i32 0} + +;CHECK: !16 = !{!"branch_weights", i32 900, i32 101} +;CHECK: !17 = !{!"branch_weights", i32 540, i32 360} +;CHECK: !18 = !{!"branch_weights", i32 162, i32 378} +;CHECK: !19 = !{!"branch_weights", i32 1399, i32 162} +