Index: lib/Transforms/Scalar/JumpThreading.cpp =================================================================== --- lib/Transforms/Scalar/JumpThreading.cpp +++ lib/Transforms/Scalar/JumpThreading.cpp @@ -126,7 +126,116 @@ BBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T); } -/// runOnFunction - Top level algorithm. +// Update branch probability information according to conditional +// branch probablity. This is usually made possible for cloned branches +// in inline instances by the context specific profile in the caller. +// For instance, +// +// [Block PredBB] +// [Branch PredBr] +// if (t) { +// Block A; +// } else { +// Block B; +// } +// +// [Block BB] +// cond = PN([true, %A], [..., %B]); // PHI node +// [Branch CondBr] +// if (cond) { +// ... // P(cond == true) = 1% +// } +// +// Here we know that when block A is taken, cond must be true, which means +// P(cond == true | A) = 1 +// +// Given that P(cond == true) = P(cond == true | A) * P(A) + +// P(cond == true | B) * P(B) +// we get +// P(cond == true ) = P(A) + P(cond == true | B) * P(B) +// +// which gives us: +// P(A) <= P(c == true), i.e. +// P(t == true) <= P(cond == true) +// +// In other words, if we know P(cond == true), we know that P(t == true) +// can not be greater than 1%. +static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) { + BranchInst *CondBr = dyn_cast(BB->getTerminator()); + if (!CondBr) + return; + + BranchProbability BP; + uint64_t TrueWeight, FalseWeight; + if (!CondBr->extractProfMetadata(TrueWeight, FalseWeight)) + return; + + // Returns the outgoing edge of the dominating predecessor block + // that leads to the PhiNode's incoming block: + auto GetPredOutEdge = + [](BasicBlock *IncomingBB, + BasicBlock *PhiBB) -> std::pair { + auto *PredBB = IncomingBB; + auto *SuccBB = PhiBB; + for (;;) { + BranchInst *PredBr = dyn_cast(PredBB->getTerminator()); + if (PredBr && PredBr->isConditional()) + return {PredBB, SuccBB}; + auto *SinglePredBB = PredBB->getSinglePredecessor(); + if (!SinglePredBB) + return {nullptr, nullptr}; + SuccBB = PredBB; + PredBB = SinglePredBB; + } + }; + + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + Value *PhiOpnd = PN->getIncomingValue(i); + ConstantInt *CI = dyn_cast(PhiOpnd); + + if (!CI || !CI->getType()->isIntegerTy(1)) + continue; + + BP = (CI->isOne() ? BranchProbability::getBranchProbability( + TrueWeight, TrueWeight + FalseWeight) + : BranchProbability::getBranchProbability( + FalseWeight, TrueWeight + FalseWeight)); + + auto PredOutEdge = GetPredOutEdge(PN->getIncomingBlock(i), BB); + if (!PredOutEdge.first) + return; + + BasicBlock *PredBB = PredOutEdge.first; + BranchInst *PredBr = cast(PredBB->getTerminator()); + + uint64_t PredTrueWeight, PredFalseWeight; + // FIXME: We currently only set the profile data when it is missing. + // With PGO, this can be used to refine even existing profile data with + // context information. This needs to be done after more performance + // testing. + if (PredBr->extractProfMetadata(PredTrueWeight, PredFalseWeight)) + continue; + + // We can not infer anything useful when BP >= 50%, because BP is the + // upper bound probability value. + if (BP >= BranchProbability(50, 100)) + continue; + + SmallVector Weights; + if (PredBr->getSuccessor(0) == PredOutEdge.second) { + Weights.push_back(BP.getNumerator()); + Weights.push_back(BP.getCompl().getNumerator()); + } else { + Weights.push_back(BP.getCompl().getNumerator()); + Weights.push_back(BP.getNumerator()); + } + PredBr->setMetadata(LLVMContext::MD_prof, + MDBuilder(PredBr->getParent()->getContext()) + .createBranchWeights(Weights)); + } +} + +/// runOnFunction - Toplevel algorithm. /// bool JumpThreading::runOnFunction(Function &F) { if (skipFunction(F)) @@ -991,6 +1100,11 @@ if (SimplifyPartiallyRedundantLoad(LI)) return true; + // Before threading, try to propagate profile data backwards: + if (PHINode *PN = dyn_cast(CondInst)) + if (PN->getParent() == BB && isa(BB->getTerminator())) + updatePredecessorProfileMetadata(PN, BB); + // Handle a variety of cases where we are branching on something derived from // a PHI node in the current block. If we can prove that any predecessors // compute a predictable value based on a PHI node, thread those predecessors. Index: test/Transforms/JumpThreading/threading_prof1.ll =================================================================== --- test/Transforms/JumpThreading/threading_prof1.ll +++ test/Transforms/JumpThreading/threading_prof1.ll @@ -0,0 +1,99 @@ +; RUN: opt -jump-threading -S < %s | FileCheck %s +; RUN: opt -passes=jump-threading -S < %s | FileCheck %s + +define void @test() { +; CHECK-LABEL: @test() +bb: + %tmp = call i32 @a() + %tmp1 = icmp eq i32 %tmp, 1 + br i1 %tmp1, label %bb5, label %bb2 +; CHECK: br i1 %tmp1,{{.*}} !prof ![[PROF1:[0-9]+]] + +bb2: ; preds = %bb + %tmp3 = call i32 @b() + %tmp4 = icmp ne i32 %tmp3, 1 + br label %bb5 +; CHECK: br i1 %tmp4, {{.*}} !prof ![[PROF2:[0-9]+]] + +bb5: ; preds = %bb2, %bb + %tmp6 = phi i1 [ false, %bb ], [ %tmp4, %bb2 ] + br i1 %tmp6, label %bb8, label %bb7, !prof !0 + +bb7: ; preds = %bb5 + call void @bar() + br label %bb8 + +bb8: ; preds = %bb7, %bb5 + ret void +} + +define void @test_single_pred1() { +; CHECK-LABEL: @test_single_pred1() +bb: + %tmp = call i32 @a() + %tmp1 = icmp eq i32 %tmp, 1 + br i1 %tmp1, label %bb5_1, label %bb2 +; CHECK: br i1 %tmp1,{{.*}} !prof ![[PROF1:[0-9]+]] + +bb5_1: + br label %bb5; + +bb2: + %tmp3 = call i32 @b() + %tmp4 = icmp ne i32 %tmp3, 1 + br label %bb5 +; CHECK: br i1 %tmp4, {{.*}} !prof ![[PROF2:[0-9]+]] + +bb5: + %tmp6 = phi i1 [ false, %bb5_1 ], [ %tmp4, %bb2 ] + br i1 %tmp6, label %bb8, label %bb7, !prof !0 + +bb7: + call void @bar() + br label %bb8 + +bb8: + ret void +} + +define void @test_single_pred2() { +; CHECK-LABEL: @test_single_pred2() +bb: + %tmp = call i32 @a() + %tmp1 = icmp eq i32 %tmp, 1 + br i1 %tmp1, label %bb5_1, label %bb2 +; CHECK: br i1 %tmp1,{{.*}} !prof ![[PROF1:[0-9]+]] + +bb5_1: + br label %bb5_2; + +bb5_2: + br label %bb5; + +bb2: + %tmp3 = call i32 @b() + %tmp4 = icmp ne i32 %tmp3, 1 + br label %bb5 +; CHECK: br i1 %tmp4, {{.*}} !prof ![[PROF2:[0-9]+]] + +bb5: + %tmp6 = phi i1 [ false, %bb5_2 ], [ %tmp4, %bb2 ] + br i1 %tmp6, label %bb8, label %bb7, !prof !0 + +bb7: + call void @bar() + br label %bb8 + +bb8: + ret void +} + +declare void @bar() + +declare i32 @a() + +declare i32 @b() + +!0 = !{!"branch_weights", i32 2146410443, i32 1073205} +;CHECK: ![[PROF1]] = !{!"branch_weights", i32 1073205, i32 2146410443} +;CHECK: ![[PROF2]] = !{!"branch_weights", i32 2146410443, i32 1073205} Index: test/Transforms/JumpThreading/threading_prof2.ll =================================================================== --- test/Transforms/JumpThreading/threading_prof2.ll +++ test/Transforms/JumpThreading/threading_prof2.ll @@ -0,0 +1,42 @@ +; RUN: opt -jump-threading -S < %s | FileCheck %s +; RUN: opt -passes=jump-threading -S < %s | FileCheck %s +define void @test() { +bb: + %tmp = call i32 @a() + %tmp1 = icmp eq i32 %tmp, 1 + br i1 %tmp1, label %bb5, label %bb2 +; CHECK: br i1 %tmp1,{{.*}} !prof ![[PROF1:[0-9]+]] + +bb2: + %tmp3 = call i32 @b() + %tmp4 = icmp ne i32 %tmp3, 1 + br label %bb5 +; CHECK: br i1 %tmp4, {{.*}} !prof ![[PROF2:[0-9]+]] + +bb5: + %tmp6 = phi i1 [ false, %bb ], [ %tmp4, %bb2 ] + br i1 %tmp6, label %bb8, label %bb7, !prof !0 + +bb7: + call void @bar() + br label %bb9 + +bb8: + call void @foo() + br label %bb9 + +bb9: + ret void +} + +declare void @bar() + +declare void @foo() + +declare i32 @a() + +declare i32 @b() + +!0 = !{!"branch_weights", i32 2146410443, i32 1073205} +;CHECK: ![[PROF1]] = !{!"branch_weights", i32 1073205, i32 2146410443} +;CHECK: ![[PROF2]] = !{!"branch_weights", i32 2146410443, i32 1073205}