Index: lib/Transforms/Scalar/LoopRotation.cpp
===================================================================
--- lib/Transforms/Scalar/LoopRotation.cpp
+++ lib/Transforms/Scalar/LoopRotation.cpp
@@ -28,6 +28,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -39,6 +40,7 @@
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm> // std::max
 using namespace llvm;
 
 #define DEBUG_TYPE "loop-rotate"
@@ -47,6 +49,16 @@
     "rotation-max-header-size", cl::init(16), cl::Hidden,
     cl::desc("The default maximum header size for automatic loop rotation"));
 
+static cl::opt<unsigned> LargeAverageTripCountThreshold(
+    "large-average-tripcount-multiplier-threshold", cl::init(5), cl::Hidden,
+    cl::desc("Loop is considered to have a large average trip count if "
+             "total_tripcount/total_exit > threshold"));
+
+static cl::opt<unsigned> SmallAverageTripCountThreshold(
+    "small-average-tripcount-multiplier-threshold", cl::init(5), cl::Hidden,
+    cl::desc("Loop is considered to have a small average trip count if "
+             "total_exit/total_tripcount > threshold"));
+
 STATISTIC(NumRotated, "Number of loops rotated");
 
 namespace {
@@ -70,6 +82,18 @@
 private:
   bool rotateLoop(Loop *L, bool SimplifiedLatch);
   bool simplifyLoopLatch(Loop *L);
+
+  /// Update the branch_weight metadata for the rotated loop latch and guard
+  /// block.
+  ///
+  /// NOTE: this function assumes that the trip counts of the individual
+  /// executions of the loop are similar or bell-curve-like. PGO does not
+  /// provide a distribution of the loop's trip counts; it merely gives a
+  /// summation of all the times the branch is taken or not taken when the
+  /// loop is hit. If the loop has a more skewed trip count distribution, we
+  /// could end up underestimating the trip count of each execution.
+  void updateLoopEstimatedBranchWeight(Loop *L, BranchInst *GuardBR,
+                                       BranchInst *LatchBR);
 };
 } // end anonymous namespace
 
@@ -363,6 +387,7 @@
   // loop in cases important for nested loops, and it also means we don't have
   // to split as many edges.
   BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
+  BranchInst *GBI = PHBI;
   assert(PHBI->isConditional() && "Should be clone of BI condbr!");
   if (!isa<ConstantInt>(PHBI->getCondition()) ||
       PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) !=
@@ -425,6 +450,7 @@
     BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
     NewBI->setDebugLoc(PHBI->getDebugLoc());
     PHBI->eraseFromParent();
+    GBI = NewBI;
 
     // With our CFG finalized, update DomTree if it is available.
     if (DT) {
@@ -463,6 +489,7 @@
     }
   }
 
+  assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
   assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
 
@@ -472,12 +499,156 @@
   // emitted code isn't too gross in this common case.
   MergeBlockIntoPredecessor(OrigHeader, DT, LI);
 
+  // We have now done all the transformations on the CFG. Adjust the loop
+  // branch metadata in case we have it.
+  assert(L->getLoopLatch()->getTerminator() == BI &&
+         "Unexpected latch terminator");
+  updateLoopEstimatedBranchWeight(L, GBI, BI);
+
   DEBUG(dbgs() << "LoopRotation: into "; L->dump());
 
   ++NumRotated;
   return true;
 }
 
+/// We have now rotated the loop; update the branch weight prof metadata in
+/// case we have it.
+void LoopRotate::updateLoopEstimatedBranchWeight(Loop *L, BranchInst *GuardBR,
+                                                 BranchInst *LatchBR) {
+  // If the loop has multiple exiting blocks, we would need to look through
+  // each of them and reason about their profile metadata to compute the
+  // rotated loop metadata correctly. E.g. if the loop has an early exit, we
+  // cannot tell whether the latch block will ever be reached after rotation,
+  // i.e. the early exit could have been taken. We bail for now.
+  //
+  // FIXME: Handle loops with early exits.
+  if (!L->getExitingBlock())
+    return;
+
+  // Check whether there is loop !prof metadata.
+  uint64_t LoopBodyWeight, LoopExitWeight;
+  if (!LatchBR->extractProfMetadata(LoopBodyWeight, LoopExitWeight))
+    return;
+
+  // At this point, we know we have branch weight metadata; recompute it.
+  //
+  // Update the latch branch metadata. After the loop is rotated, the number of
+  // times the loop body is executed remains the same, and so does the number
+  // of times the exit block is executed. We use this information to compute
+  // the weights for the rest of the branches.
+  //
+  // Make sure the loop body and exit weights get their correct values.
+  bool LoopBodyOnTrue = LatchBR->getSuccessor(0) == L->getHeader();
+  if (!LoopBodyOnTrue)
+    std::swap(LoopBodyWeight, LoopExitWeight);
+
+  uint64_t LoopPHWeight, BackedgeWeight, NotTakenOnceWeight;
+  // This is the minimum edge weight.
+  uint64_t MinimumEdgeWeight = 1;
+  if (LoopBodyWeight >= LoopExitWeight) {
+    bool HasLargeAverageTripCount =
+        LoopBodyWeight >= (LoopExitWeight * LargeAverageTripCountThreshold);
+    // When LoopBodyWeight >> LoopExitWeight, i.e. the runtime trip count is
+    // large, we model the guard branch as heavily biased against exiting and
+    // give it a 5% chance to exit. This is because every time the guard
+    // branch is hit, it will most likely branch to the loop preheader.
+    //
+    // When the loop has a smaller average trip count (but still greater than
+    // or equal to 1), we give it a 50% chance to exit.
+    double NotTakenOnceRatio = HasLargeAverageTripCount ? 0.05 : 0.5;
+
+    // If the guard branch has been turned into an unconditional branch, we
+    // adjust NotTakenOnceRatio to 0.
+    if (GuardBR->isUnconditional())
+      NotTakenOnceRatio = 0;
+
+    // Calculate the not-taken-once weight and the backedge weight.
+    NotTakenOnceWeight = LoopExitWeight * NotTakenOnceRatio;
+    // The number of times the loop is entered is the same as the number of
+    // times the latch branch goes to the exit block.
+    LoopPHWeight = LoopExitWeight - NotTakenOnceWeight;
+    // The backedge weight is the number of times the loop body is executed
+    // minus the number of times the loop body is entered.
+    BackedgeWeight = LoopBodyWeight - LoopPHWeight;
+
+    // We have finished computing the edge weights; normalize them so that
+    // every edge weight is at least as big as the minimum edge weight.
+    LoopPHWeight = std::max(LoopPHWeight, MinimumEdgeWeight);
+    NotTakenOnceWeight = std::max(NotTakenOnceWeight, MinimumEdgeWeight);
+    BackedgeWeight = std::max(BackedgeWeight, MinimumEdgeWeight);
+  } else {
+    bool HasSmallAverageTripCount =
+        (LoopBodyWeight * SmallAverageTripCountThreshold) <= LoopExitWeight;
+    // When LoopBodyWeight << LoopExitWeight, i.e. the runtime trip count is
+    // very small, we model the guard branch as heavily biased towards exiting
+    // and give it a 95% chance to exit. This is because every time the guard
+    // branch is hit, it will most likely branch to the exit.
+    //
+    // When the loop has a larger average trip count (but still smaller than
+    // 1), we give the guard branch a 50% chance to exit.
+    const double NotTakenOnceRatio = HasSmallAverageTripCount ? 0.95 : 0.5;
+
+    // NOTE: We do not check whether the guard branch is unconditional here,
+    // as it should not be. However, it is not impossible that we get an
+    // incorrect loop body weight or loop exit weight.
+
+    // Calculate the not-taken-once weight and the backedge weight.
+    NotTakenOnceWeight = LoopExitWeight * NotTakenOnceRatio;
+    // Make sure that the not-taken-once weight is never smaller than
+    // LoopExitWeight - LoopBodyWeight. LoopBodyWeight is the maximum number
+    // of times the exit can be branched to from the latch, which makes
+    // LoopExitWeight - LoopBodyWeight the minimum number of times the exit
+    // is reached without executing a single iteration of the loop.
+    NotTakenOnceWeight =
+        std::max(NotTakenOnceWeight, LoopExitWeight - LoopBodyWeight);
+    // The number of times the loop is entered is the same as the number of
+    // times the latch branch goes to the exit block.
+    LoopPHWeight = LoopExitWeight - NotTakenOnceWeight;
+    // The backedge weight is the number of times the loop body is executed
+    // minus the number of times the loop body is entered.
+    BackedgeWeight = LoopBodyWeight - LoopPHWeight;
+
+    // We have finished computing the edge weights; normalize them so that
+    // every edge weight is at least as big as the minimum edge weight.
+    LoopPHWeight = std::max(LoopPHWeight, MinimumEdgeWeight);
+    NotTakenOnceWeight = std::max(NotTakenOnceWeight, MinimumEdgeWeight);
+    BackedgeWeight = std::max(BackedgeWeight, MinimumEdgeWeight);
+  }
+
+  // Sanity check ... no branch weight should be smaller than 1.
+  assert(BackedgeWeight >= MinimumEdgeWeight &&
+         LoopExitWeight >= MinimumEdgeWeight &&
+         NotTakenOnceWeight >= MinimumEdgeWeight &&
+         LoopPHWeight >= MinimumEdgeWeight &&
+         "Invalid branch weight detected!");
+
+  MDBuilder MDB(LatchBR->getFunction()->getContext());
+  // Update the latch branch metadata. The loop preheader weight is the same
+  // as the loop exit weight.
+  LatchBR->setMetadata(LLVMContext::MD_prof,
+                       LoopBodyOnTrue
+                           ? MDB.createBranchWeights(BackedgeWeight, LoopPHWeight)
+                           : MDB.createBranchWeights(LoopPHWeight, BackedgeWeight));
+  // Update the guard branch metadata.
+  if (GuardBR->isUnconditional()) {
+    // The guard branch has been simplified to an unconditional branch; it
+    // carries no branch metadata.
+    GuardBR->setMetadata(LLVMContext::MD_prof, nullptr);
+  } else {
+    // Make sure we know which successor the guard branch takes on true.
+    GuardBR->setMetadata(LLVMContext::MD_prof,
+                         GuardBR->getSuccessor(0) == L->getLoopPreheader()
+                             ? MDB.createBranchWeights(LoopPHWeight, NotTakenOnceWeight)
+                             : MDB.createBranchWeights(NotTakenOnceWeight, LoopPHWeight));
+  }
+}
+
 /// Determine whether the instructions in this range may be safely and cheaply
 /// speculated. This is not an important enough situation to develop complex
 /// heuristics. We handle a single arithmetic instruction along with any type
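Aside from the patch itself, here is a minimal standalone walk-through (plain C++, not LLVM code) of the large-average-trip-count arm above, run on the !prof !0 = {35, 7} latch weights from the loops_with_large_tripcount test further down. The variable names mirror the patch; everything else is illustrative only:

  #include <algorithm>
  #include <cstdint>
  #include <cstdio>

  int main() {
    uint64_t LoopBodyWeight = 35, LoopExitWeight = 7, MinimumEdgeWeight = 1;
    // 35 >= 7 * 5, so the loop counts as having a large average trip count and
    // the guard branch is modeled with only a 5% chance of exiting.
    double NotTakenOnceRatio = 0.05;
    uint64_t NotTakenOnceWeight = LoopExitWeight * NotTakenOnceRatio; // 0 after truncation
    uint64_t LoopPHWeight = LoopExitWeight - NotTakenOnceWeight;      // 7
    uint64_t BackedgeWeight = LoopBodyWeight - LoopPHWeight;          // 28
    // Clamp to the minimum edge weight of 1.
    NotTakenOnceWeight = std::max(NotTakenOnceWeight, MinimumEdgeWeight); // 1
    // The guard branch gets {LoopPHWeight, NotTakenOnceWeight} = {7, 1} and the
    // latch gets {BackedgeWeight, LoopPHWeight} = {28, 7}, i.e. !0 and !1 in
    // loop-rotate-pgo.ll below.
    std::printf("guard {%llu, %llu}, latch {%llu, %llu}\n",
                (unsigned long long)LoopPHWeight,
                (unsigned long long)NotTakenOnceWeight,
                (unsigned long long)BackedgeWeight,
                (unsigned long long)LoopPHWeight);
    return 0;
  }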
Index: lib/Transforms/Utils/LoopUtils.cpp
===================================================================
--- lib/Transforms/Utils/LoopUtils.cpp
+++ lib/Transforms/Utils/LoopUtils.cpp
@@ -1105,9 +1105,9 @@
     return 0;
 
   // Divide the count of the backedge by the count of the edge exiting the loop,
-  // rounding to nearest.
+  // rounding to nearest. Trip count == backedge-taken count + 1.
   if (LatchBR->getSuccessor(0) == L->getHeader())
-    return (TrueVal + (FalseVal / 2)) / FalseVal;
+    return (TrueVal + (FalseVal / 2)) / FalseVal + 1;
   else
-    return (FalseVal + (TrueVal / 2)) / TrueVal;
+    return (FalseVal + (TrueVal / 2)) / TrueVal + 1;
 }
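A quick sanity check of the adjusted estimate (my own arithmetic, not part of the patch): with the !1 = {3001, 1001} weights from peel-loop-pgo.ll below and the latch's true successor being the header, the old formula gives (3001 + 500) / 1001 = 3 while the new one gives 3 + 1 = 4, which is why the expected peel iteration count in that test moves from 3 to 4:

  #include <cassert>
  #include <cstdint>

  int main() {
    // Branch weights taken from peel-loop-pgo.ll's !1 metadata; the true
    // successor is assumed to be the loop header.
    uint64_t TrueVal = 3001, FalseVal = 1001;
    uint64_t OldEstimate = (TrueVal + (FalseVal / 2)) / FalseVal;     // rounds to 3
    uint64_t NewEstimate = (TrueVal + (FalseVal / 2)) / FalseVal + 1; // trip count 4
    assert(OldEstimate == 3 && NewEstimate == 4);
    return 0;
  }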
Index: test/Transforms/LoopRotate/loop-rotate-pgo.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopRotate/loop-rotate-pgo.ll
@@ -0,0 +1,154 @@
+; RUN: opt -S -loop-rotate -large-average-tripcount-multiplier-threshold=5 -small-average-tripcount-multiplier-threshold=5 < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.12.0"
+
+; This loop has a very large average trip count, so we bias the guard branch
+; heavily against exiting, i.e. only a 5% chance to exit.
+define i32 @loops_with_large_tripcount(i32 %n) {
+; CHECK: entry
+; CHECK: icmp slt i32 0, %n
+; CHECK: label %for.body.lr.ph, label %for.end, !prof !0
+entry:
+  br label %for.cond
+
+for.cond:
+  %index = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %index, %n
+  br i1 %cmp, label %for.body, label %for.end, !prof !0
+
+for.body:
+  br label %for.inc
+
+; CHECK: for.inc
+; CHECK: icmp slt i32 %inc, %n
+; CHECK-NEXT: label %for.body, label %for.cond.for.end_crit_edge, !prof !1
+for.inc:
+  %inc = add nsw i32 %index, 1
+  br label %for.cond
+
+for.end:
+  ret i32 0
+}
+
+; This loop has an average trip count bigger than 1 but no bigger than 5, so we
+; give the guard branch a 50% chance to exit.
+define i32 @loops_with_moderately_large_tripcount(i32 %n) {
+; CHECK: entry
+; CHECK: icmp slt i32 0, %n
+; CHECK: label %for.body.lr.ph, label %for.end, !prof !2
+entry:
+  br label %for.cond
+
+for.cond:
+  %index = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %index, %n
+  br i1 %cmp, label %for.body, label %for.end, !prof !1
+
+for.body:
+  br label %for.inc
+
+; CHECK: for.inc
+; CHECK: icmp slt i32 %inc, %n
+; CHECK-NEXT: label %for.body, label %for.cond.for.end_crit_edge, !prof !3
+for.inc:
+  %inc = add nsw i32 %index, 1
+  br label %for.cond
+
+for.end:
+  ret i32 0
+}
+
+; This loop has a very small trip count, so we bias the guard branch to have a
+; very high chance to exit - 95%.
+define i32 @loops_with_small_tripcount(i32 %a) {
+; CHECK: icmp slt i32 15, %a
+; CHECK-NEXT: label %for.end, label %for.body.lr.ph, !prof !4
+entry:
+  br label %for.cond
+
+for.cond:
+  %index = phi i32 [ 15, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %index, %a
+  br i1 %cmp, label %for.end, label %for.body, !prof !0
+
+for.body:
+  br label %for.inc
+
+; CHECK: for.inc
+; CHECK: icmp slt i32 %inc, %a
+; CHECK-NEXT: label %for.cond.for.end_crit_edge, label %for.body, !prof !5
+for.inc:
+  %inc = add nsw i32 %index, 1
+  br label %for.cond
+
+for.end:
+  ret i32 0
+}
+
+; This loop has an average trip count smaller than 1 but no smaller than 1/5,
+; so we give the guard branch a 50% chance to exit.
+define i32 @loops_with_moderately_small_tripcount(i32 %a) {
+; CHECK: icmp slt i32 15, %a
+; CHECK-NEXT: label %for.end, label %for.body.lr.ph, !prof !6
+entry:
+  br label %for.cond
+
+for.cond:
+  %index = phi i32 [ 15, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %index, %a
+  br i1 %cmp, label %for.end, label %for.body, !prof !1
+
+for.body:
+  br label %for.inc
+
+; CHECK: for.inc
+; CHECK: icmp slt i32 %inc, %a
+; CHECK-NEXT: label %for.cond.for.end_crit_edge, label %for.body, !prof !0
+for.inc:
+  %inc = add nsw i32 %index, 1
+  br label %for.cond
+
+for.end:
+  ret i32 0
+}
+
+; This is a loop with a known trip count; make sure the guard branch metadata
+; is deleted after the branch is simplified to an unconditional br.
+define i32 @loops_with_known_tripcount() {
+; Make sure entry has a branch with no metadata.
+; CHECK: entry
+; CHECK-NOT: !prof
+; CHECK: for.body:
+entry:
+  br label %for.cond
+
+for.cond:
+  %index = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %index, 4
+  br i1 %cmp, label %for.body, label %for.end, !prof !1
+
+for.body:
+  br label %for.inc
+
+; CHECK: for.inc
+; CHECK: icmp slt i32 %inc, 4
+; CHECK-NEXT: label %for.body, label %for.end, !prof !6
+for.inc:
+  %inc = add nsw i32 %index, 1
+  br label %for.cond
+
+for.end:
+  ret i32 0
+}
+
+
+; CHECK: !0 = !{!"branch_weights", i32 7, i32 1}
+; CHECK: !1 = !{!"branch_weights", i32 28, i32 7}
+; CHECK: !2 = !{!"branch_weights", i32 4, i32 3}
+; CHECK: !3 = !{!"branch_weights", i32 24, i32 4}
+; CHECK: !4 = !{!"branch_weights", i32 33, i32 2}
+; CHECK: !5 = !{!"branch_weights", i32 2, i32 5}
+; CHECK: !6 = !{!"branch_weights", i32 21, i32 7}
+!0 = !{!"branch_weights", i32 35, i32 7}
+!1 = !{!"branch_weights", i32 28, i32 7}
+!2 = !{!"branch_weights", i32 4, i32 1}
Index: test/Transforms/LoopUnroll/peel-loop-pgo.ll
===================================================================
--- test/Transforms/LoopUnroll/peel-loop-pgo.ll
+++ test/Transforms/LoopUnroll/peel-loop-pgo.ll
@@ -1,11 +1,11 @@
 ; RUN: opt < %s -S -debug-only=loop-unroll -loop-unroll -unroll-allow-peeling 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
-; Make sure we use the profile information correctly to peel-off 3 iterations
+; Make sure we use the profile information correctly to peel-off 4 iterations
 ; from the loop, and update the branch weights for the peeled loop properly.
 
 ; CHECK: Loop Unroll: F[basic]
-; CHECK: PEELING loop %for.body with iteration count 3!
+; CHECK: PEELING loop %for.body with iteration count 4!
 
 ; CHECK: Loop Unroll: F[optsize]
 ; CHECK-NOT: PEELING
@@ -16,7 +16,9 @@
 ; CHECK: [[NEXT1]]:
 ; CHECK: br i1 %{{.*}}, label %[[NEXT2:.*]], label %for.cond.for.end_crit_edge, !prof !3
 ; CHECK: [[NEXT2]]:
-; CHECK: br i1 %{{.*}}, label %for.body, label %{{.*}}, !prof !4
+; CHECK: br i1 %{{.*}}, label %[[NEXT3:.*]], label %for.cond.for.end_crit_edge, !prof !4
+; CHECK: [[NEXT3]]:
+; CHECK: br i1 %{{.*}}, label %for.body, label %{{.*}}, !prof !5
 
 define void @basic(i32* %p, i32 %k) #0 !prof !0 {
 entry:
@@ -80,7 +82,7 @@
 !1 = !{!"branch_weights", i32 3001, i32 1001}
 
 ;CHECK: !1 = !{!"branch_weights", i32 900, i32 101}
-;CHECK: !2 = !{!"branch_weights", i32 540, i32 360}
-;CHECK: !3 = !{!"branch_weights", i32 162, i32 378}
-;CHECK: !4 = !{!"branch_weights", i32 1399, i32 162}
-
+;CHECK: !2 = !{!"branch_weights", i32 607, i32 293}
+;CHECK: !3 = !{!"branch_weights", i32 273, i32 334}
+;CHECK: !4 = !{!"branch_weights", i32 61, i32 212}
+;CHECK: !5 = !{!"branch_weights", i32 1160, i32 61}