Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -40,6 +40,7 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/TailDuplicator.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" @@ -293,9 +294,12 @@ /// \brief A handle to the target's lowering info. const TargetLoweringBase *TLI; - /// \brief A handle to the post dominator tree. + /// \brief A handle to the dominator tree. MachineDominatorTree *MDT; + /// \brief A handle to the post dominator tree. + MachinePostDominatorTree *MPDT; + /// \brief Duplicator used to duplicate tails during placement. /// /// Placement decisions can open up new tail duplication opportunities, but @@ -403,6 +407,14 @@ void buildCFGChains(); void optimizeBranches(); void alignBlocks(); + bool shouldTailDuplicate(MachineBasicBlock *BB); + /// Check the edge frequencies to see if tail duplication will increase + /// fallthroughs. 
+ bool probabilityJustifiesTailDuplicate( + MachineBasicBlock *BB, MachineBasicBlock *Succ); + bool canTailDuplicateUnplacedPreds( + MachineBasicBlock *BB, MachineBasicBlock *Succ, + BlockChain &Chain, const BlockFilterSet *BlockFilter); public: static char ID; // Pass identification, replacement for typeid @@ -416,6 +428,8 @@ AU.addRequired(); AU.addRequired(); AU.addRequired(); + if (TailDupPlacement) + AU.addRequired(); AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); @@ -430,6 +444,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_END(MachineBlockPlacement, "block-placement", "Branch Probability Basic Block Placement", false, false) @@ -561,6 +576,132 @@ return SuccProb; } +/// Check if a block should be tail duplicated. +/// \p BB Block to check. +bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) { + // Blocks with single successors don't create additional fallthrough + // opportunities. Don't duplicate them. TODO: When conditional exits are + // analyzable, allow them to be duplicated. + bool IsSimple = TailDup.isSimpleBB(BB); + + if (BB->succ_size() == 1) + return false; + return TailDup.shouldTailDuplicate(IsSimple, *BB); +} + +/// Check the edge frequencies to see if tail duplication will increase +/// fallthroughs. +bool MachineBlockPlacement::probabilityJustifiesTailDuplicate( + MachineBasicBlock *BB, MachineBasicBlock *Succ) { + // We need to do a probability calculation to make sure this is profitable. + // First: does succ have a successor that post-dominates? This affects the + // calculation. 
The 2 relevant cases are: + // BB BB + // | \ | \ Q + // P| \Q |P \ + // = C = C + // | / | / + // | / | / + // Succ Succ + // / \ | \ V + // U/ =V |U \ + // / \ = D + // D E | / + // | / + // |/ + // Dom + // In the second case, Placing Succ while duplicating it into C prevents the + // fallthrough of Succ into either D or Dom, because they now have C as an + // unplaced predecessor + MachineBasicBlock *Dom = Succ; + auto BestSuccSucc = BranchProbability::getZero(); + for (MachineBasicBlock *SuccSucc : Succ->successors()) { + auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc); + if (Prob > BestSuccSucc) + BestSuccSucc = Prob; + if (Dom == nullptr) break; + Dom = MPDT->findNearestCommonDominator(Dom, SuccSucc); + } + // If it doesn't have a post-dominating successor, here is the calculation: + // BB BB + // | \ | \ + // P| \Q | = + // = C | C + // | / | | + // | / | | + // Succ Succ /| + // / \ | \/ | + // U/ =V = /= = + // / \ | / \| + // D E D E + // Cost in the first case is: P + V + // Cost in the second case is: Q + QV + PU + PV + if (Dom == nullptr || !Succ->isSuccessor(Dom)) { + BranchProbability P = (MBPI->getEdgeProbability(BB, Succ)); + BranchProbability Q = P.getCompl(); + BranchProbability U = BestSuccSucc; + BranchProbability V = U.getCompl(); + BranchProbability QV = Q * V; + uint64_t BaseCost = static_cast(P.getNumerator()) + + static_cast(V.getNumerator()); + uint64_t DupCost = static_cast(Q.getNumerator()) + + static_cast(QV.getNumerator()) + + static_cast(P.getNumerator()); + return (BaseCost > DupCost); + } + BranchProbability U = MBPI->getEdgeProbability(Succ, Dom); + BranchProbability Q = MBPI->getEdgeProbability(BB, Succ).getCompl(); + // If there is a post-dominating successor, here is the calculation: + // BB BB + // | \ Q | \ Q + // |P \ | = + // = C |P C (+Succ) + // | / | | + // | / | | + // Succ Succ /| + // | \ V | \/ | + // |U \ |U /\ | + // = D = = =| + // | / |/ D + // | / | / + // |/ | / + // Dom Dom + // Branches have been marked 
with (=) + // The cost for taken branches in the first case is P + U + // The cost in the second case (assuming independence), given the layout: + // BB, Succ, (C+Succ), D, Dom + // is Q + P*U + P*V + Q*U == Q + P + Q*U. Subtracting P means we need to + // compare U vs Q + Q*U. + return (U > (Q + Q*U)); +} + + +/// When the option TailDupPlacement is on, this method checks if the +/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated +/// into all of its unplaced, unfiltered predecessors, that are not BB. In +/// addition we keep a set of blocks that have been tail-duplicated into and +/// allow those blocks to be unplaced as well. This allows the creation of a +/// second (larger) spine and a short fallthrough spine. +/// We also identify blocks with the CFG that would have been produced by +/// tail-duplication and lay them out in the same manner. +bool MachineBlockPlacement::canTailDuplicateUnplacedPreds( + MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &Chain, + const BlockFilterSet *BlockFilter) { + if (!shouldTailDuplicate(Succ)) + return false; + + for (MachineBasicBlock *Pred : Succ->predecessors()) { + // Make sure all unplaced and unfiltered predecessors can be + // tail-duplicated into. + if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) + || BlockToChain[Pred] == &Chain) + continue; + if (!TailDup.canTailDuplicate(Succ, Pred)) + return false; + } + return true; +} + /// When the option OutlineOptionalBranches is on, this method /// checks if the fallthrough candidate block \p Succ (of block /// \p BB) also has other unscheduled predecessor blocks which @@ -634,6 +775,13 @@ if (SuccChain.UnscheduledPredecessors == 0) return false; + // As a heuristic, if we can duplicate the block into all its unscheduled + // predecessors, we return false.
+ if (TailDupPlacement + && canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter) + && probabilityJustifiesTailDuplicate(BB, Succ)) + return false; + // There are two basic scenarios here: // ------------------------------------- // Case 1: triangular shape CFG (if-then): @@ -1908,13 +2056,8 @@ DuplicatedToLPred = false; DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber() << "\n"); - bool IsSimple = TailDup.isSimpleBB(BB); - // Blocks with single successors don't create additional fallthrough - // opportunities. Don't duplicate them. TODO: When conditional exits are - // analyzable, allow them to be duplicated. - if (!IsSimple && BB->succ_size() == 1) - return false; - if (!TailDup.shouldTailDuplicate(IsSimple, *BB)) + + if (!shouldTailDuplicate(BB)) return false; // This has to be a callback because none of it can be done after // BB is deleted. @@ -1967,6 +2110,7 @@ llvm::function_ref(RemovalCallback); SmallVector DuplicatedPreds; + bool IsSimple = TailDup.isSimpleBB(BB); TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds, &RemovalCallbackRef); @@ -2007,12 +2151,14 @@ TII = MF.getSubtarget().getInstrInfo(); TLI = MF.getSubtarget().getTargetLowering(); MDT = &getAnalysis(); + MPDT = nullptr; // Initialize PreferredLoopExit to nullptr here since it may never be set if // there are no MachineLoops. PreferredLoopExit = nullptr; if (TailDupPlacement) { + MPDT = &getAnalysis(); unsigned TailDupSize = TailDuplicatePlacementThreshold; if (MF.getFunction()->optForSize()) TailDupSize = 1; @@ -2043,6 +2189,8 @@ BlockToChain.clear(); // Must redo the dominator tree if blocks were changed. 
MDT->runOnMachineFunction(MF); + if (MPDT) + MPDT->runOnMachineFunction(MF); ChainAllocator.DestroyAll(); buildCFGChains(); } Index: test/CodeGen/AMDGPU/branch-relaxation.ll =================================================================== --- test/CodeGen/AMDGPU/branch-relaxation.ll +++ test/CodeGen/AMDGPU/branch-relaxation.ll @@ -335,6 +335,12 @@ ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: [[BB3]]: ; %bb3 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_endpgm define void @expand_requires_expand(i32 %cond0) #0 { bb0: @@ -356,6 +362,12 @@ br label %bb3 bb3: +; These NOPs prevent tail-duplication-based outlining +; from firing, which defeats the need to expand the branches and this test. + call void asm sideeffect + "v_nop_e64", ""() #0 + call void asm sideeffect + "v_nop_e64", ""() #0 ret void } @@ -385,6 +397,7 @@ ; GCN-NEXT: [[ENDIF]]: ; %endif ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]] +; GCN-NEXT: s_sleep 5 ; GCN-NEXT: s_endpgm define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 { entry: @@ -402,6 +415,9 @@ br label %endif endif: + ; layout can remove the split branch if it can copy the return block. + ; This call makes the return block long enough that it doesn't get copied. 
+ call void @llvm.amdgcn.s.sleep(i32 5); ret void } Index: test/CodeGen/ARM/cmpxchg-weak.ll =================================================================== --- test/CodeGen/ARM/cmpxchg-weak.ll +++ test/CodeGen/ARM/cmpxchg-weak.ll @@ -13,14 +13,16 @@ ; CHECK-NEXT: dmb ish ; CHECK-NEXT: strex [[SUCCESS:r[0-9]+]], r2, [r0] ; CHECK-NEXT: cmp [[SUCCESS]], #0 -; CHECK-NEXT: bne [[FAILBB:LBB[0-9]+_[0-9]+]] +; CHECK-NEXT: beq [[SUCCESSBB:LBB[0-9]+_[0-9]+]] ; CHECK-NEXT: BB#2: -; CHECK-NEXT: dmb ish ; CHECK-NEXT: str r3, [r0] ; CHECK-NEXT: bx lr ; CHECK-NEXT: [[LDFAILBB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[FAILBB]]: +; CHECK-NEXT: str r3, [r0] +; CHECK-NEXT: bx lr +; CHECK-NEXT: [[SUCCESSBB]]: +; CHECK-NEXT: dmb ish ; CHECK-NEXT: str r3, [r0] ; CHECK-NEXT: bx lr Index: test/CodeGen/Mips/brconeq.ll =================================================================== --- test/CodeGen/Mips/brconeq.ll +++ test/CodeGen/Mips/brconeq.ll @@ -8,11 +8,11 @@ entry: %0 = load i32, i32* @i, align 4 %1 = load i32, i32* @j, align 4 - %cmp = icmp eq i32 %0, %1 + %cmp = icmp ne i32 %0, %1 ; 16: cmp ${{[0-9]+}}, ${{[0-9]+}} ; 16: bteqz $[[LABEL:[0-9A-Ba-b_]+]] ; 16: $[[LABEL]]: - br i1 %cmp, label %if.end, label %if.then + br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry store i32 1, i32* @result, align 4 Index: test/CodeGen/Mips/brconeqk.ll =================================================================== --- test/CodeGen/Mips/brconeqk.ll +++ test/CodeGen/Mips/brconeqk.ll @@ -6,8 +6,8 @@ define void @test() nounwind { entry: %0 = load i32, i32* @i, align 4 - %cmp = icmp eq i32 %0, 10 - br i1 %cmp, label %if.end, label %if.then + %cmp = icmp ne i32 %0, 10 + br i1 %cmp, label %if.then, label %if.end ; 16: cmpi ${{[0-9]+}}, {{[0-9]+}} ; 16: bteqz $[[LABEL:[0-9A-Ba-b_]+]] ; 16: $[[LABEL]]: Index: test/CodeGen/Mips/brcongt.ll =================================================================== --- test/CodeGen/Mips/brcongt.ll +++ test/CodeGen/Mips/brcongt.ll @@ -9,8 
+9,8 @@ entry: %0 = load i32, i32* @i, align 4 %1 = load i32, i32* @j, align 4 - %cmp = icmp sgt i32 %0, %1 - br i1 %cmp, label %if.end, label %if.then + %cmp = icmp sle i32 %0, %1 + br i1 %cmp, label %if.then, label %if.end ; 16: slt ${{[0-9]+}}, ${{[0-9]+}} ; 16: btnez $[[LABEL:[0-9A-Ba-b_]+]] ; 16: $[[LABEL]]: Index: test/CodeGen/Mips/brconlt.ll =================================================================== --- test/CodeGen/Mips/brconlt.ll +++ test/CodeGen/Mips/brconlt.ll @@ -10,8 +10,8 @@ entry: %0 = load i32, i32* @j, align 4 %1 = load i32, i32* @i, align 4 - %cmp = icmp slt i32 %0, %1 - br i1 %cmp, label %if.end, label %if.then + %cmp = icmp sge i32 %0, %1 + br i1 %cmp, label %if.then, label %if.end ; 16: slt ${{[0-9]+}}, ${{[0-9]+}} ; MM32R6: slt ${{[0-9]+}}, ${{[0-9]+}} Index: test/CodeGen/Mips/brconnez.ll =================================================================== --- test/CodeGen/Mips/brconnez.ll +++ test/CodeGen/Mips/brconnez.ll @@ -7,7 +7,7 @@ entry: %0 = load i32, i32* @j, align 4 %cmp = icmp eq i32 %0, 0 - br i1 %cmp, label %if.then, label %if.end + br i1 %cmp, label %if.then, label %if.end, !prof !1 ; 16: bnez ${{[0-9]+}}, $[[LABEL:[0-9A-Ba-b_]+]] ; 16: lw ${{[0-9]+}}, %got(result)(${{[0-9]+}}) @@ -21,4 +21,4 @@ ret void } - +!1 = !{!"branch_weights", i32 2, i32 1} Index: test/CodeGen/Mips/micromips-compact-branches.ll =================================================================== --- test/CodeGen/Mips/micromips-compact-branches.ll +++ test/CodeGen/Mips/micromips-compact-branches.ll @@ -6,7 +6,7 @@ %x = alloca i32, align 4 %0 = load i32, i32* %x, align 4 %cmp = icmp eq i32 %0, 0 - br i1 %cmp, label %if.then, label %if.end + br i1 %cmp, label %if.then, label %if.end, !prof !1 if.then: store i32 10, i32* %x, align 4 @@ -17,3 +17,4 @@ } ; CHECK: bnezc +!1 = !{!"branch_weights", i32 2, i32 1} Index: test/CodeGen/PowerPC/misched-inorder-latency.ll =================================================================== --- 
test/CodeGen/PowerPC/misched-inorder-latency.ll +++ test/CodeGen/PowerPC/misched-inorder-latency.ll @@ -17,7 +17,7 @@ %sum1 = add i32 %sumin, 1 %val1 = load i32, i32* %ptr %p = icmp eq i32 %sumin, 0 - br i1 %p, label %true, label %end + br i1 %p, label %true, label %end, !prof !1 true: %sum2 = add i32 %sum1, 1 %ptr2 = getelementptr i32, i32* %ptr, i32 1 @@ -53,3 +53,5 @@ ret i32 %valmerge } declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind + +!1 = !{!"branch_weights", i32 2, i32 1} Index: test/CodeGen/PowerPC/tail-dup-break-cfg.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/tail-dup-break-cfg.ll @@ -0,0 +1,97 @@ +; RUN: llc -O2 -o - %s | FileCheck %s +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-grtev4-linux-gnu" + +; Intended layout: +; The code for tail-duplication during layout will produce the layout: +; test1 +; test2 +; body1 (with copy of test2) +; body2 +; exit + +;CHECK-LABEL: tail_dup_break_cfg: +;CHECK: mr [[TAGREG:[0-9]+]], 3 +;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1 +;CHECK-NEXT: bc 12, 1, [[BODY1LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: # %test2 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: beq 0, [[EXITLABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: b [[BODY2LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[BODY1LABEL]] +;CHECK: rlwinm. 
{{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: beq 0, [[EXITLABEL]] +;CHECK-NEXT: [[BODY2LABEL]] +;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit +;CHECK: blr +define void @tail_dup_break_cfg(i32 %tag) { +entry: + br label %test1 +test1: + %tagbit1 = and i32 %tag, 1 + %tagbit1eq0 = icmp eq i32 %tagbit1, 0 + br i1 %tagbit1eq0, label %test2, label %body1, !prof !1 ; %test2 more likely +body1: + call void @a() + call void @a() + call void @a() + call void @a() + br label %test2 +test2: + %tagbit2 = and i32 %tag, 2 + %tagbit2eq0 = icmp eq i32 %tagbit2, 0 + br i1 %tagbit2eq0, label %exit, label %body2, !prof !1 ; %exit more likely +body2: + call void @b() + call void @b() + call void @b() + call void @b() + br label %exit +exit: + ret void +} + +; The branch weights here hint that we shouldn't tail duplicate in this case. +;CHECK-LABEL: tail_dup_dont_break_cfg: +;CHECK: mr [[TAGREG:[0-9]+]], 3 +;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1 +;CHECK-NEXT: bc 4, 1, [[TEST2LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: # %body1 +;CHECK: [[TEST2LABEL]]: # %test2 +;CHECK-NEXT: rlwinm. 
{{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: beq 0, [[EXITLABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: # %body2 +;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit +;CHECK: blr +define void @tail_dup_dont_break_cfg(i32 %tag) { +entry: + br label %test1 +test1: + %tagbit1 = and i32 %tag, 1 + %tagbit1eq0 = icmp eq i32 %tagbit1, 0 + br i1 %tagbit1eq0, label %test2, label %body1, !prof !1 ; %test2 more likely +body1: + call void @a() + call void @a() + call void @a() + call void @a() + br label %test2 +test2: + %tagbit2 = and i32 %tag, 2 + %tagbit2eq0 = icmp ne i32 %tagbit2, 0 + br i1 %tagbit2eq0, label %body2, label %exit, !prof !1 ; %body2 more likely +body2: + call void @b() + call void @b() + call void @b() + call void @b() + br label %exit +exit: + ret void +} +declare void @a() +declare void @b() +declare void @c() +declare void @d() + +!1 = !{!"branch_weights", i32 5, i32 3} Index: test/CodeGen/PowerPC/tail-dup-layout.ll =================================================================== --- test/CodeGen/PowerPC/tail-dup-layout.ll +++ test/CodeGen/PowerPC/tail-dup-layout.ll @@ -19,7 +19,7 @@ ; The CHECK statements check for the whole string of tests and exit block, ; and then check that the correct test has been duplicated into the end of ; the optional blocks and that the optional blocks are in the correct order. -;CHECK-LABEL: f: +;CHECK-LABEL: straight_test: ; test1 may have been merged with entry ;CHECK: mr [[TAGREG:[0-9]+]], 3 ;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1 @@ -47,7 +47,7 @@ ;CHECK-NEXT: [[OPT4LABEL]] ;CHECK: b [[EXITLABEL]] -define void @f(i32 %tag) { +define void @straight_test(i32 %tag) { entry: br label %test1 test1: @@ -94,7 +94,57 @@ ret void } +; The block then2 is not unavoidable, but since it can be tail-duplicated, it +; should be placed as a fallthrough from test2 and copied. +; CHECK-LABEL: avoidable_test: +; CHECK: # %entry +; CHECK: andi. 
+; CHECK: # %test2 +; Make sure then2 falls through from test2 +; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}} +; CHECK: # %then2 +; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29 +; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}} +; CHECK: # %end2 +; CHECK: # %else1 +; CHECK: bl a +; CHECK: bl a +; Make sure then2 was copied into else1 +; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29 +; CHECK: # %else2 +; CHECK: bl c +define void @avoidable_test(i32 %tag) { +entry: + br label %test1 +test1: + %tagbit1 = and i32 %tag, 1 + %tagbit1eq0 = icmp eq i32 %tagbit1, 0 + br i1 %tagbit1eq0, label %test2, label %else1, !prof !1 ; %test2 more likely +else1: + call void @a() + call void @a() + br label %then2 +test2: + %tagbit2 = and i32 %tag, 2 + %tagbit2eq0 = icmp eq i32 %tagbit2, 0 + br i1 %tagbit2eq0, label %then2, label %else2, !prof !1 ; %then2 more likely +then2: + %tagbit3 = and i32 %tag, 4 + %tagbit3eq0 = icmp eq i32 %tagbit3, 0 + br i1 %tagbit3eq0, label %end2, label %end1, !prof !1 ; %end2 more likely +else2: + call void @c() + br label %end2 +end2: + ret void +end1: + call void @d() + ret void +} + declare void @a() declare void @b() declare void @c() declare void @d() + +!1 = !{!"branch_weights", i32 2, i32 1} Index: test/CodeGen/SPARC/sjlj.ll =================================================================== --- test/CodeGen/SPARC/sjlj.ll +++ test/CodeGen/SPARC/sjlj.ll @@ -66,14 +66,15 @@ ; CHECK: ba .LBB1_1 ; CHECK: nop ; CHECK:.LBB1_1: ! %entry -; CHECK: ba .LBB1_3 ; CHECK: mov %g0, %i0 +; CHECK: cmp %i0, 0 +; CHECK: bne .LBB1_4 +; CHECK: ba .LBB1_5 ; CHECK:.LBB1_2: ! Block address taken ; CHECK: mov 1, %i0 -; CHECK:.LBB1_3: ! 
%entry -; CHECK: cmp %i0, 0 ; CHECK: be .LBB1_5 -; CHECK: nop +; CHECK:.LBB1_4: +; CHECK: ba .LBB1_6 } declare i8* @llvm.frameaddress(i32) #2 Index: test/CodeGen/SystemZ/int-cmp-37.ll =================================================================== --- test/CodeGen/SystemZ/int-cmp-37.ll +++ test/CodeGen/SystemZ/int-cmp-37.ll @@ -15,8 +15,8 @@ entry: %val = load i16 , i16 *@g %src2 = zext i16 %val to i32 - %cond = icmp ult i32 %src1, %src2 - br i1 %cond, label %exit, label %mulb + %cond = icmp uge i32 %src1, %src2 + br i1 %cond, label %mulb, label %exit mulb: %mul = mul i32 %src1, %src1 br label %exit @@ -34,8 +34,8 @@ entry: %val = load i16 , i16 *@g %src2 = zext i16 %val to i32 - %cond = icmp slt i32 %src1, %src2 - br i1 %cond, label %exit, label %mulb + %cond = icmp sge i32 %src1, %src2 + br i1 %cond, label %mulb, label %exit mulb: %mul = mul i32 %src1, %src1 br label %exit @@ -54,8 +54,8 @@ entry: %val = load i16 , i16 *@g %src2 = zext i16 %val to i32 - %cond = icmp eq i32 %src1, %src2 - br i1 %cond, label %exit, label %mulb + %cond = icmp ne i32 %src1, %src2 + br i1 %cond, label %mulb, label %exit mulb: %mul = mul i32 %src1, %src1 br label %exit @@ -74,8 +74,8 @@ entry: %val = load i16 , i16 *@g %src2 = zext i16 %val to i32 - %cond = icmp ne i32 %src1, %src2 - br i1 %cond, label %exit, label %mulb + %cond = icmp eq i32 %src1, %src2 + br i1 %cond, label %mulb, label %exit mulb: %mul = mul i32 %src1, %src1 br label %exit @@ -95,8 +95,8 @@ entry: %val = load i16 , i16 *@h, align 1 %src2 = zext i16 %val to i32 - %cond = icmp ult i32 %src1, %src2 - br i1 %cond, label %exit, label %mulb + %cond = icmp uge i32 %src1, %src2 + br i1 %cond, label %mulb, label %exit mulb: %mul = mul i32 %src1, %src1 br label %exit @@ -115,8 +115,8 @@ entry: %val = load i16 , i16 *@g %src1 = zext i16 %val to i32 - %cond = icmp ult i32 %src1, %src2 - br i1 %cond, label %exit, label %mulb + %cond = icmp uge i32 %src1, %src2 + br i1 %cond, label %mulb, label %exit mulb: %mul = mul i32 
%src2, %src2 br label %exit Index: test/CodeGen/SystemZ/int-cmp-40.ll =================================================================== --- test/CodeGen/SystemZ/int-cmp-40.ll +++ test/CodeGen/SystemZ/int-cmp-40.ll @@ -15,8 +15,8 @@ entry: %val = load i16 , i16 *@g %src2 = zext i16 %val to i64 - %cond = icmp ult i64 %src1, %src2 - br i1 %cond, label %exit, label %mulb + %cond = icmp uge i64 %src1, %src2 + br i1 %cond, label %mulb, label %exit mulb: %mul = mul i64 %src1, %src1 br label %exit @@ -54,8 +54,8 @@ entry: %val = load i16 , i16 *@g %src2 = zext i16 %val to i64 - %cond = icmp eq i64 %src1, %src2 - br i1 %cond, label %exit, label %mulb + %cond = icmp ne i64 %src1, %src2 + br i1 %cond, label %mulb, label %exit mulb: %mul = mul i64 %src1, %src1 br label %exit @@ -74,8 +74,8 @@ entry: %val = load i16 , i16 *@g %src2 = zext i16 %val to i64 - %cond = icmp ne i64 %src1, %src2 - br i1 %cond, label %exit, label %mulb + %cond = icmp eq i64 %src1, %src2 + br i1 %cond, label %mulb, label %exit mulb: %mul = mul i64 %src1, %src1 br label %exit @@ -95,8 +95,8 @@ entry: %val = load i16 , i16 *@h, align 1 %src2 = zext i16 %val to i64 - %cond = icmp ult i64 %src1, %src2 - br i1 %cond, label %exit, label %mulb + %cond = icmp uge i64 %src1, %src2 + br i1 %cond, label %mulb, label %exit mulb: %mul = mul i64 %src1, %src1 br label %exit @@ -115,8 +115,8 @@ entry: %val = load i16 , i16 *@g %src1 = zext i16 %val to i64 - %cond = icmp ult i64 %src1, %src2 - br i1 %cond, label %exit, label %mulb + %cond = icmp uge i64 %src1, %src2 + br i1 %cond, label %mulb, label %exit mulb: %mul = mul i64 %src2, %src2 br label %exit Index: test/CodeGen/SystemZ/int-cmp-44.ll =================================================================== --- test/CodeGen/SystemZ/int-cmp-44.ll +++ test/CodeGen/SystemZ/int-cmp-44.ll @@ -473,8 +473,8 @@ %xor = xor i32 %val, 1 %add = add i32 %xor, 1000000 call void @foo() - %cmp = icmp ne i32 %add, 0 - br i1 %cmp, label %exit, label %store + %cmp = icmp eq i32 
%add, 0 + br i1 %cmp, label %store, label %exit, !prof !1 store: store i32 %add, i32 *%ptr @@ -888,3 +888,5 @@ exit: ret i64 %res } + +!1 = !{!"branch_weights", i32 2, i32 1} Index: test/CodeGen/Thumb/thumb-shrink-wrapping.ll =================================================================== --- test/CodeGen/Thumb/thumb-shrink-wrapping.ll +++ test/CodeGen/Thumb/thumb-shrink-wrapping.ll @@ -1,11 +1,12 @@ -; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumb-macho \ +; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumb-macho \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE --check-prefix=ENABLE-V4T -; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumbv5-macho \ +; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumbv5-macho \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE --check-prefix=ENABLE-V5T -; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumb-macho \ +; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumb-macho \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE --check-prefix=DISABLE-V4T -; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumbv5-macho \ +; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumbv5-macho \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE --check-prefix=DISABLE-V5T + ; ; Note: Lots of tests use inline asm instead of regular calls. ; This allows to have a better control on what the allocation will do. @@ -15,6 +16,8 @@ ; edges. ; Also disable the late if-converter as it makes harder to reason on ; the diffs. 
+; Disable tail-duplication during placement, as v4t vs v5t get different +; results due to branches not being analyzable under v5 ; Initial motivating example: Simple diamond with a call just on one side. ; CHECK-LABEL: foo: Index: test/CodeGen/Thumb2/cbnz.ll =================================================================== --- test/CodeGen/Thumb2/cbnz.ll +++ test/CodeGen/Thumb2/cbnz.ll @@ -26,7 +26,7 @@ call void @x() call void @x() call void @x() - ; CHECK: cbnz + ; CHECK: cbz %q = icmp eq i32 %y, 0 br i1 %q, label %t2, label %f Index: test/CodeGen/Thumb2/ifcvt-compare.ll =================================================================== --- test/CodeGen/Thumb2/ifcvt-compare.ll +++ test/CodeGen/Thumb2/ifcvt-compare.ll @@ -4,7 +4,7 @@ define void @f0(i32 %x) optsize { ; CHECK-LABEL: f0: - ; CHECK: cbnz + ; CHECK: cbz %p = icmp eq i32 %x, 0 br i1 %p, label %t, label %f Index: test/CodeGen/Thumb2/v8_IT_4.ll =================================================================== --- test/CodeGen/Thumb2/v8_IT_4.ll +++ test/CodeGen/Thumb2/v8_IT_4.ll @@ -12,10 +12,11 @@ define weak arm_aapcs_vfpcc i32 @_ZNKSs7compareERKSs(%"struct.std::basic_string,std::allocator >"* %this, %"struct.std::basic_string,std::allocator >"* %__str) { ; CHECK-LABEL: _ZNKSs7compareERKSs: -; CHECK: cbnz r0, +; CHECK: cbz r0, +; CHECK-NEXT: %bb1 +; CHECK-NEXT: pop.w ; CHECK-NEXT: %bb ; CHECK-NEXT: sub{{(.w)?}} r0, r{{[0-9]+}}, r{{[0-9]+}} -; CHECK-NEXT: %bb1 ; CHECK-NEXT: pop.w entry: %0 = tail call arm_aapcs_vfpcc i32 @_ZNKSs4sizeEv(%"struct.std::basic_string,std::allocator >"* %this) ; [#uses=3] Index: test/CodeGen/WebAssembly/phi.ll =================================================================== --- test/CodeGen/WebAssembly/phi.ll +++ test/CodeGen/WebAssembly/phi.ll @@ -8,8 +8,9 @@ ; Basic phi triangle. 
; CHECK-LABEL: test0: -; CHECK: div_s $[[NUM0:[0-9]+]]=, $0, $pop[[NUM1:[0-9]+]]{{$}} -; CHECK: return $[[NUM0]]{{$}} +; CHECK: return $0 +; CHECK: div_s $push[[NUM0:[0-9]+]]=, $0, $pop[[NUM1:[0-9]+]]{{$}} +; CHECK: return $pop[[NUM0]]{{$}} define i32 @test0(i32 %p) { entry: %t = icmp slt i32 %p, 0 Index: test/CodeGen/X86/2008-11-29-ULT-Sign.ll =================================================================== --- test/CodeGen/X86/2008-11-29-ULT-Sign.ll +++ test/CodeGen/X86/2008-11-29-ULT-Sign.ll @@ -4,8 +4,8 @@ define i32 @a(i32 %x) nounwind { entry: - %cmp = icmp ult i32 %x, -2147483648 ; [#uses=1] - br i1 %cmp, label %if.end, label %if.then + %cmp = icmp uge i32 %x, -2147483648 ; [#uses=1] + br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry %call = call i32 (...) @b() ; [#uses=0] Index: test/CodeGen/X86/add.ll =================================================================== --- test/CodeGen/X86/add.ll +++ test/CodeGen/X86/add.ll @@ -30,7 +30,8 @@ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) %sum = extractvalue {i32, i1} %t, 0 %obit = extractvalue {i32, i1} %t, 1 - br i1 %obit, label %overflow, label %normal + %notobit = xor i1 1, %obit + br i1 %notobit, label %normal, label %overflow normal: store i32 0, i32* %X @@ -53,7 +54,8 @@ %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2) %sum = extractvalue {i32, i1} %t, 0 %obit = extractvalue {i32, i1} %t, 1 - br i1 %obit, label %carry, label %normal + %notobit = xor i1 1, %obit + br i1 %notobit, label %normal, label %carry normal: store i32 0, i32* %X Index: test/CodeGen/X86/avx512-cmp.ll =================================================================== --- test/CodeGen/X86/avx512-cmp.ll +++ test/CodeGen/X86/avx512-cmp.ll @@ -69,13 +69,14 @@ ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; ALL-NEXT: vucomiss %xmm1, %xmm0 ; ALL-NEXT: jne LBB3_1 -; ALL-NEXT: jnp LBB3_2 +; ALL-NEXT: jp LBB3_1 +; ALL-NEXT: ## BB#2: ## %return +; ALL-NEXT: retq ; ALL-NEXT: LBB3_1: ## 
%if.end ; ALL-NEXT: seta %al ; ALL-NEXT: movzbl %al, %eax ; ALL-NEXT: leaq {{.*}}(%rip), %rcx ; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; ALL-NEXT: LBB3_2: ## %return ; ALL-NEXT: retq entry: %cmp = fcmp oeq float %p, 0.000000e+00 Index: test/CodeGen/X86/bt.ll =================================================================== --- test/CodeGen/X86/bt.ll +++ test/CodeGen/X86/bt.ll @@ -49,7 +49,7 @@ %tmp29 = lshr i32 %x, %n %tmp3 = and i32 1, %tmp29 %tmp4 = icmp eq i32 %tmp3, 0 - br i1 %tmp4, label %bb, label %UnifiedReturnBlock + br i1 %tmp4, label %bb, label %UnifiedReturnBlock, !prof !1 bb: call void @foo() @@ -89,7 +89,7 @@ %tmp29 = ashr i32 %x, %n %tmp3 = and i32 1, %tmp29 %tmp4 = icmp eq i32 %tmp3, 0 - br i1 %tmp4, label %bb, label %UnifiedReturnBlock + br i1 %tmp4, label %bb, label %UnifiedReturnBlock, !prof !1 bb: call void @foo() @@ -109,7 +109,7 @@ %tmp29 = shl i32 1, %n %tmp3 = and i32 %tmp29, %x %tmp4 = icmp eq i32 %tmp3, 0 - br i1 %tmp4, label %bb, label %UnifiedReturnBlock + br i1 %tmp4, label %bb, label %UnifiedReturnBlock, !prof !1 bb: call void @foo() @@ -129,7 +129,7 @@ %tmp29 = shl i32 1, %n %tmp3 = and i32 %x, %tmp29 %tmp4 = icmp eq i32 %tmp3, 0 - br i1 %tmp4, label %bb, label %UnifiedReturnBlock + br i1 %tmp4, label %bb, label %UnifiedReturnBlock, !prof !1 bb: call void @foo() @@ -608,3 +608,5 @@ %tobool = icmp ne i64 %and1, 0 ret i1 %tobool } + +!1 = !{!"branch_weights", i32 2, i32 1} Index: test/CodeGen/X86/fp-une-cmp.ll =================================================================== --- test/CodeGen/X86/fp-une-cmp.ll +++ test/CodeGen/X86/fp-une-cmp.ll @@ -36,8 +36,8 @@ entry: %mul = fmul double %x, %y - %cmp = fcmp une double %mul, 0.000000e+00 - br i1 %cmp, label %bb2, label %bb1 + %cmp = fcmp oeq double %mul, 0.000000e+00 + br i1 %cmp, label %bb1, label %bb2 bb1: %add = fadd double %mul, -1.000000e+00 Index: test/CodeGen/X86/jump_sign.ll =================================================================== --- 
test/CodeGen/X86/jump_sign.ll +++ test/CodeGen/X86/jump_sign.ll @@ -6,7 +6,7 @@ ; CHECK: jns %tmp1 = add i32 %X, 1 ; [#uses=1] %tmp = icmp slt i32 %tmp1, 0 ; [#uses=1] - br i1 %tmp, label %cond_true, label %cond_next + br i1 %tmp, label %cond_true, label %cond_next, !prof !1 cond_true: ; preds = %entry %tmp2 = tail call i32 (...) @bar( ) ; [#uses=0] @@ -303,3 +303,5 @@ if.end: ret i32 undef } + +!1 = !{!"branch_weights", i32 2, i32 1} Index: test/CodeGen/X86/machine-cse.ll =================================================================== --- test/CodeGen/X86/machine-cse.ll +++ test/CodeGen/X86/machine-cse.ll @@ -86,8 +86,8 @@ ; CHECK-LABEL: cross_mbb_phys_cse: ; CHECK: cmpl ; CHECK: ja - %cmp = icmp ugt i32 %a, %b - br i1 %cmp, label %return, label %if.end + %cmp = icmp ule i32 %a, %b + br i1 %cmp, label %if.end, label %return if.end: ; preds = %entry ; CHECK-NOT: cmpl Index: test/CodeGen/X86/testb-je-fusion.ll =================================================================== --- test/CodeGen/X86/testb-je-fusion.ll +++ test/CodeGen/X86/testb-je-fusion.ll @@ -9,7 +9,7 @@ entry: %and = and i32 %flags, 512 %tobool = icmp eq i32 %and, 0 - br i1 %tobool, label %if.end, label %if.then + br i1 %tobool, label %if.end, label %if.then, !prof !1 if.then: br label %if.end @@ -18,3 +18,4 @@ %hasflag = phi i32 [ 1, %if.then ], [ 0, %entry ] ret i32 %hasflag } +!1 = !{!"branch_weights", i32 1, i32 2}