Index: lib/IR/Metadata.cpp
===================================================================
--- lib/IR/Metadata.cpp
+++ lib/IR/Metadata.cpp
@@ -1269,6 +1269,7 @@
   Info.getAll(Result);
 }
+
 
 bool Instruction::extractProfMetadata(uint64_t &TrueVal,
                                       uint64_t &FalseVal) const {
   assert(
Index: lib/Transforms/Scalar/LoopRotation.cpp
===================================================================
--- lib/Transforms/Scalar/LoopRotation.cpp
+++ lib/Transforms/Scalar/LoopRotation.cpp
@@ -29,6 +29,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -70,6 +71,10 @@
 private:
   bool rotateLoop(Loop *L, bool SimplifiedLatch);
   bool simplifyLoopLatch(Loop *L);
+
+  /// Return true if we are able to find and update the branch_weights
+  /// metadata for the rotated loop latch and guard block, false otherwise.
+  bool updateLoopEstimatedBranchWeight(Loop *L, BranchInst *, BranchInst *);
 };
 
 } // end anonymous namespace
@@ -363,6 +368,7 @@
   // loop in cases important for nested loops, and it also means we don't have
   // to split as many edges.
   BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
+  BranchInst *GBI = PHBI;
   assert(PHBI->isConditional() && "Should be clone of BI condbr!");
   if (!isa<ConstantInt>(PHBI->getCondition()) ||
       PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) !=
@@ -425,6 +431,7 @@
     BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
     NewBI->setDebugLoc(PHBI->getDebugLoc());
     PHBI->eraseFromParent();
+    GBI = NewBI;
 
     // With our CFG finalized, update DomTree if it is available.
     if (DT) {
@@ -463,6 +470,14 @@
     }
   }
 
+  // Adjust the loop branch metadata; in case we do not know how to update
+  // it properly, simply drop it altogether. No data is better than
+  // imprecise data.
+  if (!updateLoopEstimatedBranchWeight(L, GBI, BI /* OrigHeader BR */)) {
+    BI->setMetadata(LLVMContext::MD_prof, nullptr);
+    GBI->setMetadata(LLVMContext::MD_prof, nullptr);
+  }
+
   assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
   assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
 
@@ -478,6 +493,88 @@
   return true;
 }
 
+/// We have just rotated the loop; update its branch_weights prof metadata if
+/// it is present.
+bool LoopRotate::updateLoopEstimatedBranchWeight(Loop *L,
+                                                 BranchInst *GuardBR,
+                                                 BranchInst *LatchBR) {
+  // If the loop has multiple exiting blocks, we would need to look through
+  // each one of them and reason about their profile metadata to compute the
+  // rotated loop's metadata correctly. E.g., if the loop has an early exit,
+  // we cannot tell whether the latch block is ever reached after rotation,
+  // because the early exit could have been taken. Bail out for now.
+  //
+  // FIXME: Handle loops with early exits.
+  if (!L->getExitingBlock())
+    return false;
+
+  // Update the latch branch metadata. After the loop is rotated, the number
+  // of times the loop body executes remains the same, and so does the number
+  // of times the exit block executes. We use this information to compute the
+  // weights for the remaining branches.
+  uint64_t LoopBodyWeight, LoopExitWeight;
+  uint64_t BackedgeWeight = 1, NotTakeOnceWeight = 1;
+  if (LatchBR->extractProfMetadata(LoopBodyWeight, LoopExitWeight)) {
+    // Make sure the loop body and exit weights map to the right successors.
+    bool LoopBodyOnTrue = LatchBR->getSuccessor(0) == L->getHeader();
+    if (!LoopBodyOnTrue)
+      std::swap(LoopBodyWeight, LoopExitWeight);
+
+    MDBuilder MDB(LatchBR->getFunction()->getContext());
+    if (LoopBodyWeight >= LoopExitWeight) {
+      // If the profiled LoopBodyWeight >= LoopExitWeight, we assume the
+      // branch-to-exit on the guard branch is never taken, since on average
+      // the loop body executes at least once before the exit is taken.
+      //
+      // The backedge weight is simply the difference between the number of
+      // times the loop body executes and the number of times the loop body
+      // is entered through the forwarding branch, which equals
+      // LoopExitWeight here. We also make sure the backedge weight is at
+      // least 1, i.e. BackedgeWeight is initialized to 1.
+      if (LoopBodyWeight != LoopExitWeight)
+        BackedgeWeight = LoopBodyWeight - LoopExitWeight;
+    } else {
+      // If the profiled LoopBodyWeight < LoopExitWeight, we know that when
+      // the unrotated loop header is hit, the loop body is sometimes not
+      // executed at all, and when it is executed, it probably runs only a
+      // small number of iterations, assuming the loop executes a similar
+      // number of iterations each time it is reached. In this case, we
+      // assume the backedge weight is 1.
+      uint64_t OrigLoopExitWeight = LoopExitWeight;
+      // The exit branch of the latch is taken as many times as the loop
+      // body executes, since we essentially ignore the backedge in this
+      // case.
+      LoopExitWeight = LoopBodyWeight;
+      NotTakeOnceWeight = OrigLoopExitWeight - LoopExitWeight;
+    }
+
+    // Sanity check: no branch weight should be smaller than 1.
+    assert(BackedgeWeight > 0 && LoopExitWeight > 0 &&
+           NotTakeOnceWeight > 0 && "Invalid branch weight detected!");
+
+    // Update the latch branch and guard branch metadata.
+    LatchBR->setMetadata(LLVMContext::MD_prof,
+                         LoopBodyOnTrue
+                             ? MDB.createBranchWeights(BackedgeWeight,
+                                                       LoopExitWeight)
+                             : MDB.createBranchWeights(LoopExitWeight,
+                                                       BackedgeWeight));
+    if (GuardBR->isUnconditional()) {
+      GuardBR->setMetadata(LLVMContext::MD_prof, nullptr);
+    } else {
+      GuardBR->setMetadata(LLVMContext::MD_prof,
+                           GuardBR->getSuccessor(0) == L->getHeader()
+                               ? MDB.createBranchWeights(LoopExitWeight,
+                                                         NotTakeOnceWeight)
+                               : MDB.createBranchWeights(NotTakeOnceWeight,
+                                                         LoopExitWeight));
+    }
+    return true;
+  }
+
+  // We do not know how to update the branch metadata.
+  return false;
+}
+
 /// Determine whether the instructions in this range may be safely and cheaply
 /// speculated. This is not an important enough situation to develop complex
 /// heuristics. We handle a single arithmetic instruction along with any type
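For reference, the weight recomputation above reduces to the following arithmetic. This is a minimal standalone sketch, not part of the patch: the names RotatedWeights and recompute are invented for illustration, and only the two regimes (LoopBodyWeight >= LoopExitWeight and LoopBodyWeight < LoopExitWeight) mirror updateLoopEstimatedBranchWeight.

// Standalone illustration only; not meant to be applied with the patch.
#include <cassert>
#include <cstdint>

struct RotatedWeights {
  uint64_t Backedge;     // rotated latch: weight of the edge back to the header
  uint64_t Exit;         // rotated latch: weight of the edge leaving the loop
  uint64_t NotTakenOnce; // guard: weight of the edge that skips the loop body
};

// LoopBodyWeight / LoopExitWeight are the weights on the original header
// branch, already swapped so that LoopBodyWeight is the edge into the body.
RotatedWeights recompute(uint64_t LoopBodyWeight, uint64_t LoopExitWeight) {
  RotatedWeights W = {1, LoopExitWeight, 1};
  if (LoopBodyWeight >= LoopExitWeight) {
    // Body runs at least once per loop entry on average: the guard never
    // skips the loop, and the backedge count is the body count minus the
    // entries coming from the guard.
    if (LoopBodyWeight != LoopExitWeight)
      W.Backedge = LoopBodyWeight - LoopExitWeight;
  } else {
    // Body sometimes does not run at all: assume a single iteration when it
    // does run, so the latch exit weight equals the body weight and the
    // remainder goes to the guard edge that skips the loop.
    W.Exit = LoopBodyWeight;
    W.NotTakenOnce = LoopExitWeight - LoopBodyWeight;
  }
  assert(W.Backedge > 0 && W.Exit > 0 && W.NotTakenOnce > 0);
  return W;
}

Initializing every weight to 1 in the sketch matches the invariant the patch enforces with its assert: no emitted branch weight is ever zero.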
Index: test/Transforms/LoopRotate/loop-rotate-pgo.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopRotate/loop-rotate-pgo.ll
@@ -0,0 +1,61 @@
+; RUN: opt -S -loop-rotate < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.12.0"
+
+; This loop has a big trip count: for.body is executed
+; 35 times and the exit is taken 7 times. In such a case we
+; can generate non-trivial prof metadata for the latch
+; after rotation.
+define i32 @loops_with_big_tripcount() {
+entry:
+  br label %for.cond
+
+for.cond:
+  %index = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %index, 1024
+  br i1 %cmp, label %for.body, label %for.end, !prof !0
+
+for.body:
+  br label %for.inc
+
+; CHECK: for.inc
+; CHECK: icmp slt i32 %inc, 1024
+; CHECK-NEXT: !prof !0
+for.inc:
+  %inc = add nsw i32 %index, 1
+  br label %for.cond
+
+for.end:
+  ret i32 0
+}
+
+; This loop has a small trip count; most of the time it simply branches
+; to for.end.
+define i32 @loops_with_small_tripcount(i32 %a) {
+; CHECK: icmp slt i32 15, %a
+; CHECK-NEXT: !prof !0
+entry:
+  br label %for.cond
+
+for.cond:
+  %index = phi i32 [ 15, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %index, %a
+  br i1 %cmp, label %for.end, label %for.body, !prof !0
+
+for.body:
+  br label %for.inc
+
+; CHECK: for.inc
+; CHECK: icmp slt i32 %inc, %a
+; CHECK-NEXT: !prof !1
+for.inc:
+  %inc = add nsw i32 %index, 1
+  br label %for.cond
+
+for.end:
+  ret i32 0
+}
+
+; CHECK: !0 = !{!"branch_weights", i32 28, i32 7}
+; CHECK: !1 = !{!"branch_weights", i32 7, i32 1}
+!0 = !{!"branch_weights", i32 35, i32 7}
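For reference, a hand check of the FileCheck expectations above, as a small standalone program that is not part of the patch. It assumes the successor ordering shown in the test: the body is on the true edge in the first loop and on the false edge in the second.

// Standalone hand check of the expected branch_weights; illustration only.
#include <cassert>
#include <cstdint>

int main() {
  // loops_with_big_tripcount: input !{35, 7}, body weight 35 >= exit weight
  // 7, so the latch keeps the exit weight and the backedge gets the
  // difference: !{!"branch_weights", i32 28, i32 7}.
  uint64_t BigBackedge = 35 - 7, BigExit = 7;
  assert(BigBackedge == 28 && BigExit == 7);

  // loops_with_small_tripcount: the successors are swapped, so body weight 7
  // < exit weight 35. Assume one iteration per entry: the latch becomes
  // !{!"branch_weights", i32 7, i32 1}, and the guard's loop-skipping edge
  // gets the remainder 35 - 7 = 28, i.e. !{!"branch_weights", i32 28, i32 7}.
  uint64_t SmallExit = 7, SmallBackedge = 1, GuardSkip = 35 - 7;
  assert(SmallExit == 7 && SmallBackedge == 1 && GuardSkip == 28);
  return 0;
}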