Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -761,57 +761,65 @@ // Cost in the first case is: P + V // For this calculation, we always assume P > Qout. If Qout > P // The result of this function will be ignored at the caller. - // Cost in the second case is: Qout + Qin * U + P * V + // Let F = SuccFreq - Qin + // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V if (PDom == nullptr || !Succ->isSuccessor(PDom)) { BranchProbability UProb = BestSuccSucc; BranchProbability VProb = AdjustedSuccSumProb - UProb; + BlockFrequency F = SuccFreq - Qin; BlockFrequency V = SuccFreq * VProb; - BlockFrequency QinU = Qin * UProb; + BlockFrequency QinU = std::min(Qin, F) * UProb; BlockFrequency BaseCost = P + V; - BlockFrequency DupCost = Qout + QinU + P * VProb; + BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb; return greaterWithBias(BaseCost, DupCost, EntryFreq); } BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom); BranchProbability VProb = AdjustedSuccSumProb - UProb; BlockFrequency U = SuccFreq * UProb; BlockFrequency V = SuccFreq * VProb; + BlockFrequency F = SuccFreq - Qin; // If there is a post-dominating successor, here is the calculation: // BB BB BB BB - // | \Qout | \ | \Qout | \ - // |P C | = |P C | = - // = C' |P C = C' |P C - // | /Qin | | | /Qin | | - // | / | C' (+Succ) | / | C' (+Succ) - // Succ Succ /| Succ Succ /| - // | \ V | \/ | | \ V | \/ | - // |U \ |U /\ | |U = |U /\ | - // = D = = \= | D | = =| - // | / |/ D | / |/ D - // | / | / | = | / - // |/ | / |/ | = - // Dom Dom Dom Dom + // | \Qout | \ | \Qout | \ + // |P C | = |P C | = + // = C' |P C = C' |P C + // | /Qin | | | /Qin | | + // | / | C' (+Succ) | / | C' (+Succ) + // Succ Succ /| Succ Succ /| + // | \ V | \/ | | \ V | \/ | + // |U \ |U /\ =? |U = |U /\ | + // = D = = =?| | D | = =| + // | / |/ D | / |/ D + // | / | / | = | / + // |/ | / |/ | = + // Dom Dom Dom Dom // '=' : Branch taken for that CFG edge // The cost for taken branches in the first case is P + U + // Let F = SuccFreq - Qin // The cost in the second case (assuming independence), given the layout: - // BB, Succ, (C+Succ), D, Dom - // is Qout + P * V + Qin * U + // BB, Succ, (C+Succ), D, Dom or the layout: + // BB, Succ, D, Dom, (C+Succ) + // is Qout + max(F, Qin) * U + min(F, Qin) // compare P + U vs Qout + P * U + Qin. // // The 3rd and 4th cases cover when Dom would be chosen to follow Succ. // // For the 3rd case, the cost is P + 2 * V - // For the 4th case, the cost is Qout + Qin * U + P * V + V - // We choose 4 over 3 when (P + V) > Qout + Qin * U + P * V + // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V + // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V if (UProb > AdjustedSuccSumProb / 2 && !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb, Chain, BlockFilter)) // Cases 3 & 4 - return greaterWithBias((P + V), (Qout + Qin * UProb + P * VProb), - EntryFreq); + return greaterWithBias( + (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb), + EntryFreq); // Cases 1 & 2 - return greaterWithBias( - (P + U), (Qout + Qin * AdjustedSuccSumProb + P * UProb), EntryFreq); + return greaterWithBias((P + U), + (Qout + std::min(Qin, F) * AdjustedSuccSumProb + + std::max(Qin, F) * UProb), + EntryFreq); } /// Check for a trellis layout. \p BB is the upper part of a trellis if its Index: test/CodeGen/AArch64/combine-comparisons-by-cse.ll =================================================================== --- test/CodeGen/AArch64/combine-comparisons-by-cse.ll +++ test/CodeGen/AArch64/combine-comparisons-by-cse.ll @@ -264,7 +264,7 @@ define i32 @do_nothing_if_resultant_opcodes_would_differ() #0 { ; CHECK-LABEL: do_nothing_if_resultant_opcodes_would_differ ; CHECK: cmn -; CHECK: b.le +; CHECK: b.gt ; CHECK: cmp ; CHECK: b.gt entry: Index: test/CodeGen/X86/block-placement.ll =================================================================== --- test/CodeGen/X86/block-placement.ll +++ test/CodeGen/X86/block-placement.ll @@ -1454,9 +1454,50 @@ ret void } +; Because %endif has a higher frequency than %if, the calculations show we +; shouldn't tail-duplicate %endif so that we can place it after %if. We were +; previously undercounting the cost by ignoring execution frequency that didn't +; come from the %if->%endif path. +; CHECK-LABEL: higher_frequency_succ_tail_dup +; CHECK: %entry +; CHECK: %elseif +; CHECK: %else +; CHECK: %endif +; CHECK: %then +; CHECK: %ret +define void @higher_frequency_succ_tail_dup(i1 %a, i1 %b, i1 %c) { +entry: + br label %if +if: ; preds = %entry + call void @effect(i32 0) + br i1 %a, label %elseif, label %endif, !prof !11 ; even + +elseif: ; preds = %if + call void @effect(i32 1) + br i1 %b, label %else, label %endif, !prof !11 ; even + +else: ; preds = %elseif + call void @effect(i32 2) + br label %endif + +endif: ; preds = %if, %elseif, %else + br i1 %c, label %then, label %ret, !prof !12 ; 5 to 3 + +then: ; preds = %endif + call void @effect(i32 3) + br label %ret + +ret: ; preds = %endif, %then + ret void +} + +declare void @effect(i32) + !5 = !{!"branch_weights", i32 84, i32 16} !6 = !{!"function_entry_count", i32 10} !7 = !{!"branch_weights", i32 60, i32 40} !8 = !{!"branch_weights", i32 5001, i32 4999} !9 = !{!"branch_weights", i32 85, i32 15} !10 = !{!"branch_weights", i32 90, i32 10} +!11 = !{!"branch_weights", i32 1, i32 1} +!12 = !{!"branch_weights", i32 5, i32 3}