Index: include/llvm/Transforms/Utils/LoopUtils.h =================================================================== --- include/llvm/Transforms/Utils/LoopUtils.h +++ include/llvm/Transforms/Utils/LoopUtils.h @@ -260,6 +260,12 @@ /// estimate can not be made. Optional getLoopEstimatedTripCount(Loop *L); +/// Get a loop's estimated trip count based on branch weight metadata. +/// Only Latch branch is considered. +/// Returns 0 when the count is estimated to be 0, or None when a meaningful +/// estimate can not be made. +Optional getLoopEstimatedTripCountBasingOnLatchOnly(Loop *L); + /// Check inner loop (L) backedge count is known to be invariant on all /// iterations of its outer loop. If the loop has no parent, this is trivially /// true. Index: lib/Transforms/Utils/LoopUnrollPeel.cpp =================================================================== --- lib/Transforms/Utils/LoopUnrollPeel.cpp +++ lib/Transforms/Utils/LoopUnrollPeel.cpp @@ -61,6 +61,10 @@ "unroll-force-peel-count", cl::init(0), cl::Hidden, cl::desc("Force a peel count regardless of profiling information.")); +static cl::opt +UnrollPeelMultiExit("unroll-peel-multi-exit", cl::init(true), cl::Hidden, + cl::desc("Allow peeling of loops with multiple exits.")); + // Designates that a Phi is estimated to become invariant after an "infinite" // number of loop iterations (i.e. only may become an invariant if the loop is // fully unrolled). @@ -73,6 +77,27 @@ if (!L->isLoopSimplifyForm()) return false; + if (UnrollPeelMultiExit) { + SmallVector, 4> ExitEdges; + L->getExitEdges(ExitEdges); + + if (ExitEdges.size() > 0) { + bool SawLatch = false; + BasicBlock *Latch = L->getLoopLatch(); + for (auto It : ExitEdges) { + if (It.first == Latch) { + if (SawLatch) + return false; + SawLatch = true; + } else { + if (!It.second->getTerminatingDeoptimizeCall()) + return false; + } + } + return SawLatch; + } + } + // Only peel loops that contain a single exit if (!L->getExitingBlock() || !L->getUniqueExitBlock()) return false; @@ -316,7 +341,8 @@ // We only do this in the presence of profile information, since otherwise // our estimates of the trip count are not reliable enough. if (L->getHeader()->getParent()->hasProfileData()) { - Optional PeelCount = getLoopEstimatedTripCount(L); + Optional PeelCount = + getLoopEstimatedTripCountBasingOnLatchOnly(L); if (!PeelCount) return; Index: lib/Transforms/Utils/LoopUtils.cpp =================================================================== --- lib/Transforms/Utils/LoopUtils.cpp +++ lib/Transforms/Utils/LoopUtils.cpp @@ -624,6 +624,10 @@ // Only support loops with a unique exiting block, and a latch. if (!L->getExitingBlock()) return None; + return getLoopEstimatedTripCountBasingOnLatchOnly(L); +} + +Optional llvm::getLoopEstimatedTripCountBasingOnLatchOnly(Loop *L) { // Get the branch weights for the loop's backedge. BranchInst *LatchBR = Index: test/Transforms/LoopUnroll/peel-loop-pgo-deopt.ll =================================================================== --- /dev/null +++ test/Transforms/LoopUnroll/peel-loop-pgo-deopt.ll @@ -0,0 +1,80 @@ +; RUN: opt < %s -S -debug-only=loop-unroll -loop-unroll -unroll-runtime 2>&1 | FileCheck %s +; RUN: opt < %s -S -debug-only=loop-unroll -passes='require,function(require,unroll)' 2>&1 | FileCheck %s + +; Make sure we use the profile information correctly to peel-off 3 iterations +; from the loop, and update the branch weights for the peeled loop properly. + +; CHECK: Loop Unroll: F[basic] +; CHECK: PEELING loop %for.body with iteration count 3! + +; CHECK-LABEL: @basic +; CHECK: br i1 %{{.*}}, label %[[NEXT0:.*]], label %for.cond.for.end_crit_edge, !prof !16 +; CHECK: [[NEXT0]]: +; CHECK: br i1 %{{.*}}, label %[[NEXT1:.*]], label %for.cond.for.end_crit_edge, !prof !17 +; CHECK: [[NEXT1]]: +; CHECK: br i1 %{{.*}}, label %[[NEXT2:.*]], label %for.cond.for.end_crit_edge, !prof !18 +; CHECK: [[NEXT2]]: +; CHECK: br i1 %{{.*}}, label %for.body, label %{{.*}}, !prof !19 + +define i32 @basic(i32* %p, i32 %k, i1 %c) #0 !prof !15 { +entry: + %cmp3 = icmp slt i32 0, %k + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.05 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %continue ] + %p.addr.04 = phi i32* [ %p, %for.body.lr.ph ], [ %incdec.ptr, %continue ] + %incdec.ptr = getelementptr inbounds i32, i32* %p.addr.04, i32 1 + store i32 %i.05, i32* %p.addr.04, align 4 + %inc = add nsw i32 %i.05, 1 + %cmp = icmp slt i32 %inc, %k + br i1 %c, label %continue, label %side_exit, !prof !17 + +continue: + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge, !prof !16 + +for.cond.for.end_crit_edge: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry + %res = phi i32 [ 0, %entry ], [ %inc, %for.cond.for.end_crit_edge ] + ret i32 %res + +side_exit: + %rval = call i32(...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 %inc) ] + ret i32 %rval +} + +declare i32 @llvm.experimental.deoptimize.i32(...) + +attributes #0 = { nounwind } +attributes #1 = { nounwind optsize } + +!llvm.module.flags = !{!1} + +!1 = !{i32 1, !"ProfileSummary", !2} +!2 = !{!3, !4, !5, !6, !7, !8, !9, !10} +!3 = !{!"ProfileFormat", !"InstrProf"} +!4 = !{!"TotalCount", i64 10} +!5 = !{!"MaxCount", i64 3} +!6 = !{!"MaxInternalCount", i64 1} +!7 = !{!"MaxFunctionCount", i64 3} +!8 = !{!"NumCounts", i64 2} +!9 = !{!"NumFunctions", i64 2} +!10 = !{!"DetailedSummary", !11} +!11 = !{!12, !13, !14} +!12 = !{i32 10000, i64 3, i32 2} +!13 = !{i32 999000, i64 1, i32 10} +!14 = !{i32 999999, i64 1, i32 10} +!15 = !{!"function_entry_count", i64 1} +!16 = !{!"branch_weights", i32 3001, i32 1001} +!17 = !{!"branch_weights", i32 1, i32 0} + +;CHECK: !16 = !{!"branch_weights", i32 901, i32 100} +;CHECK: !17 = !{!"branch_weights", i32 541, i32 360} +;CHECK: !18 = !{!"branch_weights", i32 163, i32 378} +;CHECK: !19 = !{!"branch_weights", i32 1396, i32 163} +