Index: include/llvm/Transforms/Utils/LoopUtils.h =================================================================== --- include/llvm/Transforms/Utils/LoopUtils.h +++ include/llvm/Transforms/Utils/LoopUtils.h @@ -461,6 +461,11 @@ void addStringMetadataToLoop(Loop *TheLoop, const char *MDString, unsigned V = 0); +/// \brief Get a loop's estimated trip count based on branch weight metadata. +/// Returns 0 when the count is estimated to be 0, or None when a meaningful +/// estimate can not be made. +Optional getLoopEstimatedTripCount(Loop *L); + /// Helper to consistently add the set of standard passes to a loop pass's \c /// AnalysisUsage. /// Index: lib/Transforms/Scalar/LoopUnrollPass.cpp =================================================================== --- lib/Transforms/Scalar/LoopUnrollPass.cpp +++ lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -102,6 +102,12 @@ cl::desc("Unrolled size limit for loops with an unroll(full) or " "unroll_count pragma.")); +static cl::opt FlatLoopTripCountThreshold( + "flat-loop-tripcount-threshold", cl::init(5), cl::Hidden, + cl::desc("If the runtime tripcount for the loop is lower than the " + "threshold, the loop is considered as flat and will be less " + "aggressively unrolled.")); + /// A magic value for use with the Threshold parameter to indicate /// that the loop unroll should be performed regardless of how much /// code expansion would result. @@ -748,6 +754,16 @@ bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll || PragmaEnableUnroll || UserUnrollCount; + // Check if the runtime trip count is too small when profile is available. + if (L->getHeader()->getParent()->getEntryCount() && TripCount == 0) { + if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) { + if (*ProfileTripCount < FlatLoopTripCountThreshold) + return false; + else + UP.AllowExpensiveTripCount = true; + } + } + if (ExplicitUnroll && TripCount != 0) { // If the loop has an unrolling pragma, we want to be more aggressive with // unrolling limits. Set thresholds to at least the PragmaThreshold value Index: lib/Transforms/Utils/LoopUtils.cpp =================================================================== --- lib/Transforms/Utils/LoopUtils.cpp +++ lib/Transforms/Utils/LoopUtils.cpp @@ -1067,3 +1067,39 @@ // just a special case of this.) return true; } + +Optional llvm::getLoopEstimatedTripCount(Loop *L) { + // Only support loops with a unique exiting block, and a latch. + if (!L->getExitingBlock()) + return None; + + // Get the branch weights for the the loop's backedge. + BranchInst *LatchBR = + dyn_cast(L->getLoopLatch()->getTerminator()); + if (!LatchBR || LatchBR->getNumSuccessors() != 2) + return None; + + assert((LatchBR->getSuccessor(0) == L->getHeader() || + LatchBR->getSuccessor(1) == L->getHeader()) && + "At least one edge out of the latch must go to the header"); + + // To estimate the number of times the loop body was executed, we want to + // know the number of times the backedge was taken, vs. the number of times + // we exited the loop. + // The branch weights give us almost what we want, since they were adjusted + // from the raw counts to provide a better probability estimate. Remove + // the adjustment by subtracting 1 from both weights. + uint64_t TrueVal, FalseVal; + if (!LatchBR->extractProfMetadata(TrueVal, FalseVal) || (TrueVal <= 1) || + (FalseVal <= 1)) + return None; + + TrueVal -= 1; + FalseVal -= 1; + + // Divide the count of the backedge by the count of the edge exiting the loop. + if (LatchBR->getSuccessor(0) == L->getHeader()) + return TrueVal / FalseVal; + else + return FalseVal / TrueVal; +} Index: test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll =================================================================== --- /dev/null +++ test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll @@ -0,0 +1,59 @@ +; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-threshold=40 -unroll-dynamic-cost-savings-discount=0 | FileCheck %s + +@known_constant = internal unnamed_addr constant [9 x i32] [i32 0, i32 -1, i32 0, i32 -1, i32 5, i32 -1, i32 0, i32 -1, i32 0], align 16 + +; CHECK-LABEL: @bar_prof +; CHECK: loop.prol: +; CHECK: loop: +; CHECK: %mul = mul +; CHECK: %mul.1 = mul +; CHECK: %mul.2 = mul +; CHECK: %mul.3 = mul +define i32 @bar_prof(i32* noalias nocapture readonly %src, i64 %c) !prof !1 { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %inc, %loop ] + %r = phi i32 [ 0, %entry ], [ %add, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %src, i64 %iv + %src_element = load i32, i32* %arrayidx, align 4 + %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv + %const_array_element = load i32, i32* %array_const_idx, align 4 + %mul = mul nsw i32 %src_element, %const_array_element + %add = add nsw i32 %mul, %r + %inc = add nuw nsw i64 %iv, 1 + %exitcond86.i = icmp eq i64 %inc, %c + br i1 %exitcond86.i, label %loop.end, label %loop, !prof !2 + +loop.end: + %r.lcssa = phi i32 [ %r, %loop ] + ret i32 %r.lcssa +} + +; CHECK-LABEL: @bar_prof_flat +; CHECK-NOT: loop.prol +define i32 @bar_prof_flat(i32* noalias nocapture readonly %src, i64 %c) !prof !1 { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %inc, %loop ] + %r = phi i32 [ 0, %entry ], [ %add, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %src, i64 %iv + %src_element = load i32, i32* %arrayidx, align 4 + %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv + %const_array_element = load i32, i32* %array_const_idx, align 4 + %mul = mul nsw i32 %src_element, %const_array_element + %add = add nsw i32 %mul, %r + %inc = add nuw nsw i64 %iv, 1 + %exitcond86.i = icmp eq i64 %inc, %c + br i1 %exitcond86.i, label %loop, label %loop.end, !prof !2 + +loop.end: + %r.lcssa = phi i32 [ %r, %loop ] + ret i32 %r.lcssa +} + +!1 = !{!"function_entry_count", i64 1} +!2 = !{!"branch_weights", i32 1, i32 1000}