Index: include/llvm/Support/GenericDomTree.h =================================================================== --- include/llvm/Support/GenericDomTree.h +++ include/llvm/Support/GenericDomTree.h @@ -571,9 +571,15 @@ // API to update (Post)DominatorTree information based on modifications to // the CFG... - /// addNewBlock - Add a new node to the dominator tree information. This - /// creates a new node as a child of DomBB dominator node,linking it into - /// the children list of the immediate dominator. + /// Add a new node to the dominator tree information. + /// + /// This creates a new node as a child of DomBB dominator node, linking it + /// into the children list of the immediate dominator. + /// + /// \param BB New node in CFG. + /// \param DomBB CFG node that is dominator for BB. + /// \returns New dominator tree node that represents new CFG node. + /// DomTreeNodeBase *addNewBlock(NodeT *BB, NodeT *DomBB) { assert(getNode(BB) == nullptr && "Block already in dominator tree!"); DomTreeNodeBase *IDomNode = getNode(DomBB); @@ -583,6 +589,31 @@ llvm::make_unique>(BB, IDomNode))).get(); } + /// Add a new node to the forward dominator tree and make it a new root. + /// + /// \param BB New node in CFG. + /// \returns New dominator tree node that represents new CFG node. + /// + DomTreeNodeBase *setNewRoot(NodeT *BB) { + assert(getNode(BB) == nullptr && "Block already in dominator tree!"); + assert(!this->isPostDominator() && + "Cannot change root of post-dominator tree"); + DFSInfoValid = false; + auto &Roots = DominatorBase::Roots; + DomTreeNodeBase *NewNode = (DomTreeNodes[BB] = + llvm::make_unique>(BB, nullptr)).get(); + if (Roots.empty()) { + addRoot(BB); + } else { + assert(Roots.size() == 1); + NodeT *OldRoot = Roots.front(); + DomTreeNodes[OldRoot] = + NewNode->addChild(std::move(DomTreeNodes[OldRoot])); + Roots[0] = BB; + } + return RootNode = NewNode; + } + /// changeImmediateDominator - This method is used to update the dominator /// tree information when a node's immediate dominator changes. /// Index: lib/Transforms/Scalar/IndVarSimplify.cpp =================================================================== --- lib/Transforms/Scalar/IndVarSimplify.cpp +++ lib/Transforms/Scalar/IndVarSimplify.cpp @@ -2096,7 +2096,7 @@ return Builder.CreateGEP(nullptr, GEPBase, GEPOffset, "lftr.limit"); } else { // In any other case, convert both IVInit and IVCount to integers before - // comparing. This may result in SCEV expension of pointers, but in practice + // comparing. This may result in SCEV expansion of pointers, but in practice // SCEV will fold the pointer arithmetic away as such: // BECount = (IVEnd - IVInit - 1) => IVLimit = IVInit (postinc). // Index: lib/Transforms/Scalar/LoopRotation.cpp =================================================================== --- lib/Transforms/Scalar/LoopRotation.cpp +++ lib/Transforms/Scalar/LoopRotation.cpp @@ -29,6 +29,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -70,6 +71,17 @@ private: bool rotateLoop(Loop *L, bool SimplifiedLatch); bool simplifyLoopLatch(Loop *L); + + /// Update the branch_weight metadata for the rotated loop latch and guard + /// block. + /// + /// NOTE: this function assumes that the tripcounts every time the loop is + /// executed are similar or bell curve like. PGO does not provide us with a + /// distribution of the tripcounts of the loop, it merely gives a summation + /// of all the times the branch is taken or not taken when the loop is hit. + /// In case the loop has a more skewed tripcount distribution we could end + /// up underestimating the tripcount each time the loop is executed. + void updateLoopEstimatedBranchWeight(Loop *L, BranchInst *, BranchInst *); }; } // end anonymous namespace @@ -363,6 +375,7 @@ // loop in cases important for nested loops, and it also means we don't have // to split as many edges. BranchInst *PHBI = cast(OrigPreheader->getTerminator()); + BranchInst *GBI = PHBI; assert(PHBI->isConditional() && "Should be clone of BI condbr!"); if (!isa(PHBI->getCondition()) || PHBI->getSuccessor(cast(PHBI->getCondition())->isZero()) != @@ -425,6 +438,7 @@ BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI); NewBI->setDebugLoc(PHBI->getDebugLoc()); PHBI->eraseFromParent(); + GBI = NewBI; // With our CFG finalized, update DomTree if it is available. if (DT) { @@ -463,6 +477,9 @@ } } + // Adjust the loop branch metadata in case we have them. + updateLoopEstimatedBranchWeight(L, GBI, BI /* OrigHeader BR */); + assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation"); assert(L->getLoopLatch() && "Invalid loop latch after loop rotation"); @@ -478,6 +495,84 @@ return true; } +/// We have now rotated the loop, update branch weight prof metadata in case +/// we have it. +void LoopRotate::updateLoopEstimatedBranchWeight(Loop *L, + BranchInst *GuardBR, + BranchInst *LatchBR) { + // If loop has multiple exitting block, we need to look through each one + // of them and reason about their profile metadata to compute the rotated + // loop metadata correctly. e.g. if we have an early exit in the loop, + // we will not be able to tell whether the latch block will ever be + // reached after rotation, i.e. early exit could have been taken. + // we bail for now. + // + // FIXME: Handle loops with early exits. + if (!L->getExitingBlock()) + return; + + // Update latch branch metadata. After the loop is rotated, the # of times + // the loop body is executed remain the same and the # of times the exit + // block is executed remain the same as well. We use these information to + // compute the weight for the rest of the branches. + uint64_t LoopBodyWeight, LoopExitWeight; + uint64_t BackedgeWeight = 1, NotTakeOnceWeight = 1; + if (LatchBR->extractProfMetadata(LoopBodyWeight, LoopExitWeight)) { + // Make sure the loop body and exit weight gets its correct value. + bool LoopBodyOnTrue = LatchBR->getSuccessor(0) == L->getHeader(); + if (!LoopBodyOnTrue) + std::swap(LoopBodyWeight, LoopExitWeight); + + MDBuilder MDB(LatchBR->getFunction()->getContext()); + if (LoopBodyWeight >= LoopExitWeight) { + // In case profiled LoopBodyWeight >= LoopExitWeight, we assume the + // branch-to-exit in the guard branch is never taken, as on average the + // loop body is executed at least once before the exit is taken. + // + // Backedge weight is simply the difference of # of times the loop + // body is executed and the # of times the loop body is entered through + // a forwarding branch, same as LoopExitWeight here. We also make sure + // backedge is at least 1, i.e. BackedgeWeight is initialized to 1. + if (LoopBodyWeight != LoopExitWeight) + BackedgeWeight = LoopBodyWeight - LoopExitWeight; + } else { + // In case of profiled LoopBodyWeight < LoopExitWeight, we know that + // when the unrotated loop header is hit, the loop body is not executed + // at all some of the time. And for times which the loop body is executed + // its probably just a small # of iterations, assuming the loop executes + // similar # of iterations every time its hit. In this case, we assume + // backedge weight is 1. + uint64_t OrigLoopExitWeight = LoopExitWeight; + // The # of times the LoopExitWeight is taken is the same as the # of + // times the loop body is executed, as we basically ignore backedge in + // this case. + LoopExitWeight = LoopBodyWeight; + NotTakeOnceWeight = OrigLoopExitWeight - LoopExitWeight; + } + + // Sanity check ... no branch weight should be smaller than 1. + assert(BackedgeWeight > 0 && LoopExitWeight > 0 && + NotTakeOnceWeight > 0 && "Invalid branch weight detected!"); + + // Update the latch branch and guard branch metadata. + LatchBR->setMetadata(LLVMContext::MD_prof, LoopBodyOnTrue ? + MDB.createBranchWeights(BackedgeWeight, + LoopExitWeight) : + MDB.createBranchWeights(LoopExitWeight, + BackedgeWeight)); + if (GuardBR->isUnconditional()) { + GuardBR->setMetadata(LLVMContext::MD_prof, nullptr); + } else { + GuardBR->setMetadata(LLVMContext::MD_prof, + GuardBR->getSuccessor(0) == L->getHeader() ? + MDB.createBranchWeights(LoopExitWeight, + NotTakeOnceWeight) : + MDB.createBranchWeights(NotTakeOnceWeight, + LoopExitWeight)); + } + } +} + /// Determine whether the instructions in this range may be safely and cheaply /// speculated. This is not an important enough situation to develop complex /// heuristics. We handle a single arithmetic instruction along with any type Index: lib/Transforms/Scalar/StructurizeCFG.cpp =================================================================== --- lib/Transforms/Scalar/StructurizeCFG.cpp +++ lib/Transforms/Scalar/StructurizeCFG.cpp @@ -792,6 +792,7 @@ LoopFunc, LoopStart); BranchInst::Create(LoopStart, NewEntry); + DT->setNewRoot(NewEntry); } // Create an extra loop end node Index: test/Transforms/LoopRotate/loop-rotate-pgo.ll =================================================================== --- /dev/null +++ test/Transforms/LoopRotate/loop-rotate-pgo.ll @@ -0,0 +1,61 @@ +; RUN: opt -S -loop-rotate < %s | FileCheck %s +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; This loop has a big trip count as the for.body is executed +; 35 times and the exit is executed 7 times. In such case we +; can generate a non-trivial prof metadata for the latch after +; rotation. +define i32 @loops_with_big_tripcount() { +entry: + br label %for.cond + +for.cond: + %index = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %index, 1024 + br i1 %cmp, label %for.body, label %for.end, !prof !0 + +for.body: + br label %for.inc + +; CHECK: for.inc +; CHECK: icmp slt i32 %inc, 1024 +; CHECK-NEXT: !prof !0 +for.inc: + %inc = add nsw i32 %index, 1 + br label %for.cond + +for.end: + ret i32 0 +} + +; This loop has a small trip count, most of the time it simply branches +; to for.end. +define i32 @loops_with_small_tripcount(i32 %a) { +; CHECK: icmp slt i32 15, %a +; CHECK-NEXT: !prof !0 +entry: + br label %for.cond + +for.cond: + %index = phi i32 [ 15, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %index, %a + br i1 %cmp, label %for.end, label %for.body, !prof !0 + +for.body: + br label %for.inc + +; CHECK: for.inc +; CHECK: icmp slt i32 %inc, %a +; CHECK-NEXT: !prof !1 +for.inc: + %inc = add nsw i32 %index, 1 + br label %for.cond + +for.end: + ret i32 0 +} + +; CHECK: !0 = !{!"branch_weights", i32 28, i32 7} +; CHECK: !1 = !{!"branch_weights", i32 7, i32 1} +!0 = !{!"branch_weights", i32 35, i32 7} Index: test/Transforms/StructurizeCFG/no-branch-to-entry.ll =================================================================== --- test/Transforms/StructurizeCFG/no-branch-to-entry.ll +++ test/Transforms/StructurizeCFG/no-branch-to-entry.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -o - -structurizecfg < %s | FileCheck %s +; RUN: opt -S -o - -structurizecfg -verify-dom-info < %s | FileCheck %s ; CHECK-LABEL: @no_branch_to_entry_undef( ; CHECK: entry: Index: tools/llvm-xray/xray-converter.cc =================================================================== --- tools/llvm-xray/xray-converter.cc +++ tools/llvm-xray/xray-converter.cc @@ -20,7 +20,6 @@ #include "llvm/Support/FileSystem.h" #include "llvm/Support/YAMLTraits.h" #include "llvm/Support/raw_ostream.h" -#include using namespace llvm; using namespace xray; Index: unittests/IR/DominatorTreeTest.cpp =================================================================== --- unittests/IR/DominatorTreeTest.cpp +++ unittests/IR/DominatorTreeTest.cpp @@ -203,6 +203,16 @@ EXPECT_EQ(DT->getNode(BB4)->getDFSNumIn(), 5UL); EXPECT_EQ(DT->getNode(BB4)->getDFSNumOut(), 6UL); + // Change root node + DT->verifyDomTree(); + BasicBlock *NewEntry = BasicBlock::Create(F.getContext(), "new_entry", + &F, BB0); + BranchInst::Create(BB0, NewEntry); + EXPECT_EQ(F.begin()->getName(), NewEntry->getName()); + EXPECT_TRUE(&F.getEntryBlock() == NewEntry); + DT->setNewRoot(NewEntry); + DT->verifyDomTree(); + return false; } void getAnalysisUsage(AnalysisUsage &AU) const override {