Index: llvm/trunk/include/llvm/Transforms/Scalar/JumpThreading.h =================================================================== --- llvm/trunk/include/llvm/Transforms/Scalar/JumpThreading.h +++ llvm/trunk/include/llvm/Transforms/Scalar/JumpThreading.h @@ -134,6 +134,8 @@ const char *Suffix); void UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB, BasicBlock *BB, BasicBlock *NewBB, BasicBlock *SuccBB); + /// Check if the block has profile metadata for its outgoing edges. + bool doesBlockHaveProfileData(BasicBlock *BB); }; } // end namespace llvm Index: llvm/trunk/lib/Transforms/Scalar/JumpThreading.cpp =================================================================== --- llvm/trunk/lib/Transforms/Scalar/JumpThreading.cpp +++ llvm/trunk/lib/Transforms/Scalar/JumpThreading.cpp @@ -1620,6 +1620,23 @@ return PredBB; } +bool JumpThreadingPass::doesBlockHaveProfileData(BasicBlock *BB) { + const TerminatorInst *TI = BB->getTerminator(); + assert(TI->getNumSuccessors() > 1 && "not a split"); + + MDNode *WeightsNode = TI->getMetadata(LLVMContext::MD_prof); + if (!WeightsNode) + return false; + + MDString *MDName = cast(WeightsNode->getOperand(0)); + if (MDName->getString() != "branch_weights") + return false; + + // Ensure there are weights for all of the successors. Note that the first + // operand to the metadata node is a name, not a weight. + return WeightsNode->getNumOperands() == TI->getNumSuccessors() + 1; +} + /// Update the block frequency of BB and branch weight and the metadata on the /// edge BB->SuccBB. This is done by scaling the weight of BB->SuccBB by 1 - /// Freq(PredBB->BB) / Freq(BB->SuccBB). @@ -1670,7 +1687,41 @@ for (int I = 0, E = BBSuccProbs.size(); I < E; I++) BPI->setEdgeProbability(BB, I, BBSuccProbs[I]); - if (BBSuccProbs.size() >= 2) { + // Update the profile metadata as well. + // + // Don't do this if the profile of the transformed blocks was statically + // estimated. (This could occur despite the function having an entry + // frequency in completely cold parts of the CFG.) + // + // In this case we don't want to suggest to subsequent passes that the + // calculated weights are fully consistent. Consider this graph: + // + // check_1 + // 50% / | + // eq_1 | 50% + // \ | + // check_2 + // 50% / | + // eq_2 | 50% + // \ | + // check_3 + // 50% / | + // eq_3 | 50% + // \ | + // + // Assuming the blocks check_* all compare the same value against 1, 2 and 3, + // the overall probabilities are inconsistent; the total probability that the + // value is either 1, 2 or 3 is 150%. + // + // As a consequence if we thread eq_1 -> check_2 to check_3, check_2->check_3 + // becomes 0%. This is even worse if the edge whose probability becomes 0% is + // the loop exit edge. Then based solely on static estimation we would assume + // the loop was extremely hot. + // + // FIXME this locally as well so that BPI and BFI are consistent as well. We + // shouldn't make edges extremely likely or unlikely based solely on static + // estimation. + if (BBSuccProbs.size() >= 2 && doesBlockHaveProfileData(BB)) { SmallVector Weights; for (auto Prob : BBSuccProbs) Weights.push_back(Prob.getNumerator()); Index: llvm/trunk/test/Transforms/JumpThreading/static-profile.ll =================================================================== --- llvm/trunk/test/Transforms/JumpThreading/static-profile.ll +++ llvm/trunk/test/Transforms/JumpThreading/static-profile.ll @@ -0,0 +1,119 @@ +; RUN: opt -S -jump-threading < %s | FileCheck %s + +; Check that based solely on static profile estimation we don't update the +; branch-weight metadata. Even if the function has an entry frequency, a +; completely cold part of the CFG may be statically estimated. + +; For example in the loop below, jump threading would update the weight of the +; loop-exiting branch to 0, drastically inflating the frequency of the loop +; (in the range of billions). +; +; This is the CFG of the loop. There is no run-time profile info for edges +; inside the loop, so branch and block frequencies are estimated as shown: +; +; check_1 (16) +; (8) / | +; eq_1 | (8) +; \ | +; check_2 (16) +; (8) / | +; eq_2 | (8) +; \ | +; check_3 (16) +; (1) / | +; (loop exit) | (15) +; | +; (back edge) +; +; First we thread eq_1->check_2 to check_3. Frequencies are updated to remove +; the frequency of eq_1 from check_2 and then the false edge leaving check_2 +; (changed frequencies are highlighted with * *): +; +; check_1 (16) +; (8) / | +; eq_1~ | (8) +; / | +; / check_2 (*8*) +; / (8) / | +; \ eq_2 | (*0*) +; \ \ | +; ` --- check_3 (16) +; (1) / | +; (loop exit) | (15) +; | +; (back edge) +; +; Next we thread eq_1->check_3 and eq_2->check_3 to check_1 as new back edges. +; Frequencies are updated to remove the frequency of eq_1 and eq_3 from +; check_3 and then the false edge leaving check_3 (changed frequencies are +; highlighted with * *): +; +; check_1 (16) +; (8) / | +; eq_1~ | (8) +; / | +; / check_2 (*8*) +; / (8) / | +; /-- eq_2~ | (*0*) +; (back edge) | +; check_3 (*0*) +; (*0*) / | +; (loop exit) | (*0*) +; | +; (back edge) +; +; As a result, the loop exit edge ends up with 0 frequency which in turn makes +; the loop header to have maximum frequency. + +declare void @bar() + +define void @foo(i32 *%p, i32 %n) !prof !0 { +entry: + %enter_loop = icmp eq i32 %n, 0 + br i1 %enter_loop, label %exit, label %check_1, !prof !1 +; CHECK: br i1 %enter_loop, label %exit, label %check_1, !prof !1 + +check_1: + %v = load i32, i32* %p + %cond1 = icmp eq i32 %v, 1 + br i1 %cond1, label %eq_1, label %check_2 +; No metadata: +; CHECK: br i1 %cond1, label %check_2.thread, label %check_2{{$}} + +eq_1: + call void @bar() + br label %check_2 +; Verify the new backedge: +; CHECK: check_2.thread: +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: br label %check_1 + +check_2: + %cond2 = icmp eq i32 %v, 2 + br i1 %cond2, label %eq_2, label %check_3 +; No metadata: +; CHECK: br i1 %cond2, label %eq_2, label %check_3{{$}} + +eq_2: + call void @bar() + br label %check_3 +; Verify the new backedge: +; CHECK: eq_2: +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: br label %check_1 + +check_3: + %condE = icmp eq i32 %v, 3 + br i1 %condE, label %exit, label %check_1 +; No metadata: +; CHECK: br i1 %condE, label %exit, label %check_1{{$}} + +exit: + ret void +} + +!0 = !{!"function_entry_count", i64 120} +; CHECK-NOT: branch_weights +!1 = !{!"branch_weights", i32 119, i32 1} +; CHECK: !1 = !{!"branch_weights", i32 119, i32 1} +; CHECK-NOT: branch_weights