Index: lib/Transforms/Scalar/LoopRotation.cpp =================================================================== --- lib/Transforms/Scalar/LoopRotation.cpp +++ lib/Transforms/Scalar/LoopRotation.cpp @@ -28,7 +28,10 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" +#include "llvm/Support/BlockFrequency.h" +#include "llvm/Support/BranchProbability.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -39,6 +42,7 @@ #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Transforms/Utils/ValueMapper.h" +#include /// std::max using namespace llvm; #define DEBUG_TYPE "loop-rotate" @@ -71,6 +75,18 @@ private: bool rotateLoop(Loop *L, bool SimplifiedLatch); bool simplifyLoopLatch(Loop *L); + + /// Update the branch_weight metadata for the rotated loop latch and guard + /// block. + /// + /// NOTE: this function assumes that the tripcounts every time the loop is + /// executed are similar or bell curve like. PGO does not provide us with a + /// distribution of the tripcounts of the loop, it merely gives a summation + /// of all the times the branch is taken or not taken when the loop is hit. + /// In case the loop has a more skewed tripcount distribution we could end + /// up underestimating the tripcount each time the loop is executed. + void updateLoopEstimatedBranchWeight(Loop *L, BranchInst *GuardBR, + BranchInst *LatchBR); }; } // end anonymous namespace @@ -403,6 +419,7 @@ // loop in cases important for nested loops, and it also means we don't have // to split as many edges. BranchInst *PHBI = cast(OrigPreheader->getTerminator()); + BranchInst *GBI = PHBI; assert(PHBI->isConditional() && "Should be clone of BI condbr!"); if (!isa(PHBI->getCondition()) || PHBI->getSuccessor(cast(PHBI->getCondition())->isZero()) != @@ -465,6 +482,7 @@ BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI); NewBI->setDebugLoc(PHBI->getDebugLoc()); PHBI->eraseFromParent(); + GBI = NewBI; // With our CFG finalized, update DomTree if it is available. if (DT) { @@ -512,12 +530,161 @@ // emitted code isn't too gross in this common case. MergeBlockIntoPredecessor(OrigHeader, DT, LI); + // We now have done all the transformations on the CFG. Adjust the loop branch + // metadata in case we have them. + assert(L->getLoopLatch()->getTerminator() == BI && + "Unexpected latch terminator"); + updateLoopEstimatedBranchWeight(L, GBI, BI); + DEBUG(dbgs() << "LoopRotation: into "; L->dump()); ++NumRotated; return true; } +/// Based on the loop body weight and the loop exit weight, determine the # of +/// times the loop exits without a single iteration. +static uint64_t computeLoopNotTakenOnceWeight(uint64_t LoopBodyWeight, + uint64_t LoopExitWeight) { + unsigned NotTakenOncePercent = 0; + double ATC = (double)LoopBodyWeight / LoopExitWeight; + // At this point, we have the # of times the loop body and loop exit execute. + // Based on this information, we compute the # of times the loop exits without + // a single iteration, i.e. not-taken-once-weight. + // + // NOTE: ATC does not directly tells us how many times the loop exits without + // executing a single iteration. + // + // We define the following discrete average-trip-count (ATC) thresholds: + // + // 1. Loop with really large ATC (20+). This loop probably always executes + // a few iterations every time its hit. We give it maximum chance to exit + // without a single iteration. + // + // 2. Loop with large ATC (5+). We give it 5% chance to exit without a single + // iteration. This is because everytime the loop is hit, it likely will branch + // to the loop body. + // + // 3. Loop with ATC (~1+-). We give it a 50% chance to exit. This loop may + // have sometimes exit wihtout a single iteration. + // + // 5. Loop with small ATC (~1/5). We give it 95% chance to exit, this + // is because everytime the loop is hit, it most likely will branch + // to the exit without executing a single iteration. + // + // 6. Loop with really small ATC (1/20-). We give it maximum chance to exit + // without a single iteration. This loop probably seldom executes everytime + // it is hit. + if (ATC > 20) + NotTakenOncePercent = 1; + else if (ATC > 5) + NotTakenOncePercent = 5; + else if (ATC > 0.2) + NotTakenOncePercent = 50; + else if (ATC > 0.05) + NotTakenOncePercent = 95; + else + NotTakenOncePercent = 1; + + // We now have the NotTakenOnce percentage, compute its value. + BranchProbability GuardBP(NotTakenOncePercent, 100); + uint64_t NotTakenOnce = + (BlockFrequency(LoopExitWeight) * GuardBP).getFrequency(); + + // In addition to the thresholds, we need to make sure NotTakenOnce is at + // least as large as its minimum, i.e. LoopExitWeight - LoopBodyWeight, in + // case LoopExitWeight > LoopBodyWeight. + if (LoopExitWeight < LoopBodyWeight) + return NotTakenOnce; + return std::max(LoopExitWeight - LoopBodyWeight, NotTakenOnce); +} + +/// We have now rotated the loop, update branch weight prof metadata in case +/// we have it. +void LoopRotate::updateLoopEstimatedBranchWeight(Loop *L, BranchInst *GuardBR, + BranchInst *LatchBR) { + // If loop has multiple exiting blocks, we need to look through each one + // of them and reason about their profile metadata to compute the rotated + // loop metadata correctly. e.g. if we have an early exit in the loop, + // we will not be able to tell whether the latch block will ever be + // reached after rotation, i.e. early exit could have been taken. + // we bail for now. + // + // FIXME: Handle loops with early exits. + if (!L->getExitingBlock()) + return; + + // Check whether there is loop !prof metadata. + uint64_t LoopBodyWeight, LoopExitWeight; + if (!LatchBR->extractProfMetadata(LoopBodyWeight, LoopExitWeight)) + return; + + // At this point, we know we have a branch weight metadata, recompute it. + // + // Update latch branch metadata. After the loop is rotated, the # of times + // the loop body is executed remain the same and the # of times the exit + // block is executed remain the same as well. We use these information to + // compute the weight for the rest of the branches. + // + // Make sure the loop body and exit weight gets its correct value. + bool LoopBodyOnTrue = LatchBR->getSuccessor(0) == L->getHeader(); + if (!LoopBodyOnTrue) + std::swap(LoopBodyWeight, LoopExitWeight); + + // Once we know the # of times the loop exits without a single iteration, + // we can compute other weights after rotating the loop. + uint64_t NotTakenOnceWeight = + computeLoopNotTakenOnceWeight(LoopBodyWeight, LoopExitWeight); + + // Make sure NotTakenOnceWeight is always no larger than LoopExitWeight. + assert(NotTakenOnceWeight <= LoopExitWeight && "Invalid LoopExitWeight"); + // # of times loop is entered is the same as # of times the latch branch + // goes to the exit block. + uint64_t LoopPHWeight = LoopExitWeight - NotTakenOnceWeight; + + // Backedge weight is the # of times the loop body is executed - # of times + // loop body is entered. + assert(LoopBodyWeight >= LoopPHWeight && "Invalid LoopPHWeight"); + uint64_t BackedgeWeight = LoopBodyWeight - LoopPHWeight; + + // We finished computing the edge weights, normalize them a bit to + // make sure every edge weight is at least as big as the minimum + // edge weight. + const uint64_t MinimumEdgeWeight = 1; + LoopPHWeight = std::max(LoopPHWeight, MinimumEdgeWeight); + NotTakenOnceWeight = std::max(NotTakenOnceWeight, MinimumEdgeWeight); + BackedgeWeight = std::max(BackedgeWeight, MinimumEdgeWeight); + + // Sanity check ... no branch weight should be smaller than 1. + assert(BackedgeWeight >= MinimumEdgeWeight && + LoopExitWeight >= MinimumEdgeWeight && + NotTakenOnceWeight >= MinimumEdgeWeight && + LoopPHWeight >= MinimumEdgeWeight && + "Invalid branch weight detected!"); + + MDBuilder MDB(LatchBR->getFunction()->getContext()); + // Update the latch branch metadata. The loop preheader weight is the + // same as the loop exit weight. + LatchBR->setMetadata( + LLVMContext::MD_prof, + LoopBodyOnTrue ? MDB.createBranchWeights(BackedgeWeight, LoopPHWeight) + : MDB.createBranchWeights(LoopPHWeight, BackedgeWeight)); + // Update the guard branch metadata. + if (GuardBR->isUnconditional()) { + // Guard branch has been simplified to an unconditional. It carries no + // branch metadata. + GuardBR->setMetadata(LLVMContext::MD_prof, nullptr); + } else { + GuardBR->setMetadata( + LLVMContext::MD_prof, + // Make sure we know where guard branch goes to on + // true. + GuardBR->getSuccessor(0) == L->getLoopPreheader() + ? MDB.createBranchWeights(LoopPHWeight, NotTakenOnceWeight) + : MDB.createBranchWeights(NotTakenOnceWeight, LoopPHWeight)); + } +} + /// Determine whether the instructions in this range may be safely and cheaply /// speculated. This is not an important enough situation to develop complex /// heuristics. We handle a single arithmetic instruction along with any type Index: lib/Transforms/Utils/LoopUtils.cpp =================================================================== --- lib/Transforms/Utils/LoopUtils.cpp +++ lib/Transforms/Utils/LoopUtils.cpp @@ -1106,9 +1106,9 @@ return 0; // Divide the count of the backedge by the count of the edge exiting the loop, - // rounding to nearest. + // rounding to nearest. Trip count == backedge-taken count + 1. if (LatchBR->getSuccessor(0) == L->getHeader()) - return (TrueVal + (FalseVal / 2)) / FalseVal; + return (TrueVal + (FalseVal / 2)) / FalseVal + 1; else - return (FalseVal + (TrueVal / 2)) / TrueVal; + return (FalseVal + (TrueVal / 2)) / TrueVal + 1; } Index: test/Transforms/LoopRotate/loop-rotate-pgo.ll =================================================================== --- /dev/null +++ test/Transforms/LoopRotate/loop-rotate-pgo.ll @@ -0,0 +1,154 @@ +; RUN: opt -S -loop-rotate < %s | FileCheck %s +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; This loop has a very large average trip count, we bias the guard branch highly +; against exiting, i.e only 5% chance to exit. +define i32 @loops_with_large_tripcount(i32 %n) { +; CHECK: entry +; CHECK: icmp slt i32 0, %n +; CHECK: label %for.body.lr.ph, label %for.end, !prof !0 +entry: + br label %for.cond + +for.cond: + %index = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %index, %n + br i1 %cmp, label %for.body, label %for.end, !prof !0 + +for.body: + br label %for.inc + +; CHECK: for.inc +; CHECK: icmp slt i32 %inc, %n +; CHECK-NEXT: label %for.body, label %for.cond.for.end_crit_edge, !prof !1 +for.inc: + %inc = add nsw i32 %index, 1 + br label %for.cond + +for.end: + ret i32 0 +} + +; This loop has average trip count bigger than 1 but no bigger than 5, we give +; the guard branch 50% chance to exit. +define i32 @loops_with_moderately_large_tripcount(i32 %n) { +; CHECK: entry +; CHECK: icmp slt i32 0, %n +; CHECK: label %for.body.lr.ph, label %for.end, !prof !2 +entry: + br label %for.cond + +for.cond: + %index = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %index, %n + br i1 %cmp, label %for.body, label %for.end, !prof !1 + +for.body: + br label %for.inc + +; CHECK: for.inc +; CHECK: icmp slt i32 %inc, %n +; CHECK-NEXT: label %for.body, label %for.cond.for.end_crit_edge, !prof !3 +for.inc: + %inc = add nsw i32 %index, 1 + br label %for.cond + +for.end: + ret i32 0 +} + +; This loop has a very small trip count, we bias the guard branch to have a +; very high chance to exit - 95%. +define i32 @loops_with_small_tripcount(i32 %a) { +; CHECK: icmp slt i32 15, %a +; CHECK-NEXT: label %for.end, label %for.body.lr.ph, !prof !4 +entry: + br label %for.cond + +for.cond: + %index = phi i32 [ 15, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %index, %a + br i1 %cmp, label %for.end, label %for.body, !prof !0 + +for.body: + br label %for.inc + +; CHECK: for.inc +; CHECK: icmp slt i32 %inc, %a +; CHECK-NEXT: label %for.cond.for.end_crit_edge, label %for.body, !prof !5 +for.inc: + %inc = add nsw i32 %index, 1 + br label %for.cond + +for.end: + ret i32 0 +} + +; This loop has average trip count smaller than 1 but no bigger than 1/5, we give +; the guard branch 50% chance to exit. +define i32 @loops_with_moderately_small_tripcount(i32 %a) { +; CHECK: icmp slt i32 15, %a +; CHECK-NEXT: label %for.end, label %for.body.lr.ph, !prof !6 +entry: + br label %for.cond + +for.cond: + %index = phi i32 [ 15, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %index, %a + br i1 %cmp, label %for.end, label %for.body, !prof !1 + +for.body: + br label %for.inc + +; CHECK: for.inc +; CHECK: icmp slt i32 %inc, %a +; CHECK-NEXT: label %for.cond.for.end_crit_edge, label %for.body, !prof !0 +for.inc: + %inc = add nsw i32 %index, 1 + br label %for.cond + +for.end: + ret i32 0 +} + +; This is a loop with known trip count, make sure guard branch metadata is +; deleted after the branch is simplified to an unconditional br. +define i32 @loops_with_known_tripcount() { +; Make sure entry has a branch with no metadata. +; CHECK: entry +; CHECK-NOT: !prof +; CHECK: for.body: +entry: + br label %for.cond + +for.cond: + %index = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %index, 4 + br i1 %cmp, label %for.body, label %for.end, !prof !1 + +for.body: + br label %for.inc + +; CHECK: for.inc +; CHECK: icmp slt i32 %inc, 4 +; CHECK-NEXT: label %for.body, label %for.end, !prof !6 +for.inc: + %inc = add nsw i32 %index, 1 + br label %for.cond + +for.end: + ret i32 0 +} + + +; CHECK: !0 = !{!"branch_weights", i32 7, i32 1} +; CHECK: !1 = !{!"branch_weights", i32 28, i32 7} +; CHECK: !2 = !{!"branch_weights", i32 4, i32 3} +; CHECK: !3 = !{!"branch_weights", i32 24, i32 4} +; CHECK: !4 = !{!"branch_weights", i32 33, i32 2} +; CHECK: !5 = !{!"branch_weights", i32 2, i32 5} +; CHECK: !6 = !{!"branch_weights", i32 21, i32 7} +!0 = !{!"branch_weights", i32 35, i32 7} +!1 = !{!"branch_weights", i32 28, i32 7} +!2 = !{!"branch_weights", i32 4, i32 1} Index: test/Transforms/LoopUnroll/peel-loop-pgo.ll =================================================================== --- test/Transforms/LoopUnroll/peel-loop-pgo.ll +++ test/Transforms/LoopUnroll/peel-loop-pgo.ll @@ -1,11 +1,11 @@ ; RUN: opt < %s -S -debug-only=loop-unroll -loop-unroll 2>&1 | FileCheck %s ; REQUIRES: asserts -; Make sure we use the profile information correctly to peel-off 3 iterations +; Make sure we use the profile information correctly to peel-off 4 iterations ; from the loop, and update the branch weights for the peeled loop properly. ; CHECK: Loop Unroll: F[basic] -; CHECK: PEELING loop %for.body with iteration count 3! +; CHECK: PEELING loop %for.body with iteration count 4! ; CHECK: Loop Unroll: F[optsize] ; CHECK-NOT: PEELING @@ -16,7 +16,9 @@ ; CHECK: [[NEXT1]]: ; CHECK: br i1 %{{.*}}, label %[[NEXT2:.*]], label %for.cond.for.end_crit_edge, !prof !3 ; CHECK: [[NEXT2]]: -; CHECK: br i1 %{{.*}}, label %for.body, label %{{.*}}, !prof !4 +; CHECK: br i1 %{{.*}}, label %[[NEXT3:.*]], label %for.cond.for.end_crit_edge, !prof !4 +; CHECK: [[NEXT3]]: +; CHECK: br i1 %{{.*}}, label %for.body, label %{{.*}}, !prof !5 define void @basic(i32* %p, i32 %k) #0 !prof !0 { entry: @@ -80,7 +82,7 @@ !1 = !{!"branch_weights", i32 3001, i32 1001} ;CHECK: !1 = !{!"branch_weights", i32 900, i32 101} -;CHECK: !2 = !{!"branch_weights", i32 540, i32 360} -;CHECK: !3 = !{!"branch_weights", i32 162, i32 378} -;CHECK: !4 = !{!"branch_weights", i32 1399, i32 162} - +;CHECK: !2 = !{!"branch_weights", i32 607, i32 293} +;CHECK: !3 = !{!"branch_weights", i32 273, i32 334} +;CHECK: !4 = !{!"branch_weights", i32 61, i32 212} +;CHECK: !5 = !{!"branch_weights", i32 1160, i32 61}