Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -51,6 +51,7 @@ #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetSubtargetInfo.h" #include +#include #include #include using namespace llvm; @@ -478,6 +479,9 @@ bool canTailDuplicateUnplacedPreds( const MachineBasicBlock *BB, MachineBasicBlock *Succ, const BlockChain &Chain, const BlockFilterSet *BlockFilter); + /// Find chains of triangles to tail-duplicate where a global analysis works, + /// but a local analysis would not find them. + void precomputeTriangleChains(); public: static char ID; // Pass identification, replacement for typeid @@ -1065,6 +1069,116 @@ return true; } +/// Find chains of triangles where we believe it would be profitable to +/// tail-duplicate them all, but a local analysis would not find them. +void MachineBlockPlacement::precomputeTriangleChains() { + struct TriangleChain { + int Count; + std::forward_list Edges; + TriangleChain(MachineBasicBlock* src, MachineBasicBlock *dst) { + Edges.push_front(src); + Edges.push_front(dst); + Count = 1; + DEBUG(dbgs() << "Saving Triangle: " << + getBlockName(src) << "->" << getBlockName(dst) << "\n"); + } + + void append(MachineBasicBlock *dst) { + DEBUG(dbgs() << "Extending Triangle: " << + getBlockName(Edges.front()) << "->" << getBlockName(dst) << + " " << (Count + 1) << "\n"); + assert(!Edges.empty() && Edges.front()->isSuccessor(dst) && + "Attempting to append a block that is not a successor."); + Edges.push_front(dst); + ++Count; + } + + MachineBasicBlock *getKey() { + return Edges.front(); + } + }; + + DEBUG(dbgs() << "Pre-computing triangle chains.\n"); + // Map from last block to the chain that contains it. This allows us to extend + // chains as we find new triangles. + DenseMap TriangleChainMap; + for (MachineBasicBlock &BB : *F) { + // If BB doesn't have 2 successors, it doesn't start a triangle. + if (BB.succ_size() != 2) + continue; + MachineBasicBlock *PDom = nullptr; + for (MachineBasicBlock *Succ : BB.successors()) { + if (!MPDT->dominates(Succ, &BB)) + continue; + PDom = Succ; + break; + } + // If BB doesn't have a post-dominating successor, it doesn't form a + // triangle. + if (PDom == nullptr) + continue; + DEBUG(dbgs() << "Found Triangle: " << + getBlockName(&BB) << "->" << getBlockName(PDom) << "\n"); + // If PDom has a hint that it is low probability, skip this triangle. + if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100)) + continue; + // If PDom isn't eligible for duplication, this isn't the kind of triangle + // we're looking for. + if (!shouldTailDuplicate(PDom)) + continue; + bool CanTailDuplicate = true; + // If PDom can't tail-duplicate into it's non-BB predecessors, then this + // isn't the kind of triangle we're looking for. + for (MachineBasicBlock* Pred : PDom->predecessors()) { + if (Pred == &BB) + continue; + if (!TailDup.canTailDuplicate(PDom, Pred)) { + CanTailDuplicate = false; + break; + } + } + // If we can't tail-duplicate PDom to its predecessors, then skip this + // triangle. + if (!CanTailDuplicate) + continue; + + // Now we have an interesting triangle. Insert it if it's not part of an + // existing chain + auto Found = TriangleChainMap.find(&BB); + // If it is, remove the chain from the map, grow it, and put it back in the + // map with the end as the new key. + if (Found != TriangleChainMap.end()) { + TriangleChain Chain = std::move(Found->second); + TriangleChainMap.erase(Found); + Chain.append(PDom); + TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain))); + } else { + auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom); + assert (InsertResult.second && "Block seen twice."); + (void) InsertResult; + } + } + + for (auto &ChainPair : TriangleChainMap) { + TriangleChain &Chain = ChainPair.second; + // Benchmarking has shown that due to branch correlation duplicating 2 or + // more triangles is profitable, despite the calculations assuming + // independence. + DEBUG(dbgs() << "After scanning, found a chain of size: " << Chain.Count << + "\n"); + if (Chain.Count < 2) + continue; + MachineBasicBlock *dst = Chain.Edges.front(); + Chain.Edges.pop_front(); + for (MachineBasicBlock *src : Chain.Edges) { + DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->" << + getBlockName(dst) << " as pre-computed.\n"); + ComputedEdges[src] = { dst, true }; + dst = src; + } + } +} + /// When the option OutlineOptionalBranches is on, this method /// checks if the fallthrough candidate block \p Succ (of block /// \p BB) also has other unscheduled predecessor blocks which @@ -2600,6 +2714,7 @@ if (MF.getFunction()->optForSize()) TailDupSize = 1; TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize); + precomputeTriangleChains(); } assert(BlockToChain.empty()); Index: test/CodeGen/Mips/llvm-ir/ashr.ll =================================================================== --- test/CodeGen/Mips/llvm-ir/ashr.ll +++ test/CodeGen/Mips/llvm-ir/ashr.ll @@ -83,20 +83,23 @@ ; M2: srav $[[T0:[0-9]+]], $4, $7 ; M2: andi $[[T1:[0-9]+]], $7, 32 - ; M2: bnez $[[T1]], $[[BB0:BB[0-9_]+]] + ; M2: beqz $[[T1]], $[[BB0:BB[0-9_]+]] ; M2: move $3, $[[T0]] + ; M2: bnez $[[T1]], $[[BB1:BB[0-9_]+]] + ; M2: nop + ; M2: $[[EXIT:BB[0-9_]+]]: + ; M2: jr $ra + ; M2: nop + ; M2: $[[BB0]]: ; M2: srlv $[[T2:[0-9]+]], $5, $7 ; M2: not $[[T3:[0-9]+]], $7 ; M2: sll $[[T4:[0-9]+]], $4, 1 ; M2: sllv $[[T5:[0-9]+]], $[[T4]], $[[T3]] + ; M2: beqz $[[T1]], $[[EXIT]] ; M2: or $3, $[[T3]], $[[T2]] - ; M2: $[[BB0]]: - ; M2: beqz $[[T1]], $[[BB1:BB[0-9_]+]] - ; M2: nop - ; M2: sra $2, $4, 31 ; M2: $[[BB1]]: ; M2: jr $ra - ; M2: nop + ; M2: sra $2, $4, 31 ; 32R1-R5: srlv $[[T0:[0-9]+]], $5, $7 ; 32R1-R5: not $[[T1:[0-9]+]], $7 @@ -169,20 +172,23 @@ ; M3: sll $[[T0:[0-9]+]], $7, 0 ; M3: dsrav $[[T1:[0-9]+]], $4, $7 ; M3: andi $[[T2:[0-9]+]], $[[T0]], 64 - ; M3: bnez $[[T3:[0-9]+]], [[BB0:.LBB[0-9_]+]] + ; M3: beqz $[[T3:[0-9]+]], [[BB0:.LBB[0-9_]+]] ; M3: move $3, $[[T1]] + ; M3: bnez $[[T3]], [[BB1:.LBB[0-9_]+]] + ; M3: nop + ; M3: [[EXIT:.LBB[0-9_]+]]: + ; M3: jr $ra + ; M3: nop + ; M3: [[BB0]]: ; M3: dsrlv $[[T4:[0-9]+]], $5, $7 ; M3: dsll $[[T5:[0-9]+]], $4, 1 ; M3: not $[[T6:[0-9]+]], $[[T0]] ; M3: dsllv $[[T7:[0-9]+]], $[[T5]], $[[T6]] + ; M3: beqz $[[T3]], [[EXIT]] ; M3: or $3, $[[T7]], $[[T4]] - ; M3: [[BB0]]: - ; M3: beqz $[[T3]], [[BB1:.LBB[0-9_]+]] - ; M3: nop - ; M3: dsra $2, $4, 63 ; M3: [[BB1]]: ; M3: jr $ra - ; M3: nop + ; M3: dsra $2, $4, 63 ; GP64-NOT-R6: dsrlv $[[T0:[0-9]+]], $5, $7 ; GP64-NOT-R6: dsll $[[T1:[0-9]+]], $4, 1 Index: test/CodeGen/Mips/llvm-ir/lshr.ll =================================================================== --- test/CodeGen/Mips/llvm-ir/lshr.ll +++ test/CodeGen/Mips/llvm-ir/lshr.ll @@ -81,20 +81,24 @@ ; M2: srlv $[[T0:[0-9]+]], $4, $7 ; M2: andi $[[T1:[0-9]+]], $7, 32 - ; M2: bnez $[[T1]], $[[BB0:BB[0-9_]+]] + ; M2: beqz $[[T1]], $[[BB0:BB[0-9_]+]] ; M2: move $3, $[[T0]] + ; M2: beqz $[[T1]], $[[BB1:BB[0-9_]+]] + ; M2: addiu $2, $zero, 0 + ; M2: $[[EXIT:BB[0-9_]+]]: + ; M2: jr $ra + ; M2: nop + ; M2: $[[BB0]]: ; M2: srlv $[[T2:[0-9]+]], $5, $7 ; M2: not $[[T3:[0-9]+]], $7 ; M2: sll $[[T4:[0-9]+]], $4, 1 ; M2: sllv $[[T5:[0-9]+]], $[[T4]], $[[T3]] ; M2: or $3, $[[T3]], $[[T2]] - ; M2: $[[BB0]]: - ; M2: bnez $[[T1]], $[[BB1:BB[0-9_]+]] + ; M2: bnez $[[T1]], $[[EXIT:BB[0-9_]+]] ; M2: addiu $2, $zero, 0 - ; M2: move $2, $[[T0]] ; M2: $[[BB1]]: ; M2: jr $ra - ; M2: nop + ; M2: move $2, $[[T0]] ; 32R1-R5: srlv $[[T0:[0-9]+]], $5, $7 ; 32R1-R5: not $[[T1:[0-9]+]], $7 @@ -160,20 +164,24 @@ ; M3: sll $[[T0:[0-9]+]], $7, 0 ; M3: dsrlv $[[T1:[0-9]+]], $4, $7 ; M3: andi $[[T2:[0-9]+]], $[[T0]], 64 - ; M3: bnez $[[T3:[0-9]+]], [[BB0:\.LBB[0-9_]+]] + ; M3: beqz $[[T3:[0-9]+]], [[BB0:\.LBB[0-9_]+]] ; M3: move $3, $[[T1]] + ; M3: beqz $[[T3]], [[BB1:\.LBB[0-9_]+]] + ; M3: daddiu $2, $zero, 0 + ; M3: [[EXIT:\.LBB[0-9_]+]]: + ; M3: jr $ra + ; M3: nop + ; M3: [[BB0]]: ; M3: dsrlv $[[T4:[0-9]+]], $5, $7 ; M3: dsll $[[T5:[0-9]+]], $4, 1 ; M3: not $[[T6:[0-9]+]], $[[T0]] ; M3: dsllv $[[T7:[0-9]+]], $[[T5]], $[[T6]] ; M3: or $3, $[[T7]], $[[T4]] - ; M3: [[BB0]]: - ; M3: bnez $[[T3]], [[BB1:\.LBB[0-9_]+]] + ; M3: bnez $[[T3]], [[EXIT]] ; M3: daddiu $2, $zero, 0 - ; M3: move $2, $[[T1]] ; M3: [[BB1]]: ; M3: jr $ra - ; M3: nop + ; M3: move $2, $[[T1]] ; GP64-NOT-R6: dsrlv $[[T0:[0-9]+]], $5, $7 ; GP64-NOT-R6: dsll $[[T1:[0-9]+]], $4, 1 Index: test/CodeGen/Mips/llvm-ir/shl.ll =================================================================== --- test/CodeGen/Mips/llvm-ir/shl.ll +++ test/CodeGen/Mips/llvm-ir/shl.ll @@ -97,20 +97,24 @@ ; M2: sllv $[[T0:[0-9]+]], $5, $7 ; M2: andi $[[T1:[0-9]+]], $7, 32 - ; M2: bnez $[[T1]], $[[BB0:BB[0-9_]+]] + ; M2: beqz $[[T1]], $[[BB0:BB[0-9_]+]] ; M2: move $2, $[[T0]] + ; M2: beqz $[[T1]], $[[BB1:BB[0-9_]+]] + ; M2: addiu $3, $zero, 0 + ; M2: $[[EXIT:BB[0-9_]+]]: + ; M2: jr $ra + ; M2: nop + ; M2: $[[BB0]]: ; M2: sllv $[[T2:[0-9]+]], $4, $7 ; M2: not $[[T3:[0-9]+]], $7 ; M2: srl $[[T4:[0-9]+]], $5, 1 ; M2: srlv $[[T5:[0-9]+]], $[[T4]], $[[T3]] ; M2: or $2, $[[T2]], $[[T3]] - ; M2: $[[BB0]]: - ; M2: bnez $[[T1]], $[[BB1:BB[0-9_]+]] + ; M2: bnez $[[T1]], $[[EXIT]] ; M2: addiu $3, $zero, 0 - ; M2: move $3, $[[T0]] ; M2: $[[BB1]]: ; M2: jr $ra - ; M2: nop + ; M2: move $3, $[[T0]] ; 32R1-R5: sllv $[[T0:[0-9]+]], $4, $7 ; 32R1-R5: not $[[T1:[0-9]+]], $7 @@ -176,20 +180,24 @@ ; M3: sll $[[T0:[0-9]+]], $7, 0 ; M3: dsllv $[[T1:[0-9]+]], $5, $7 ; M3: andi $[[T2:[0-9]+]], $[[T0]], 64 - ; M3: bnez $[[T3:[0-9]+]], [[BB0:\.LBB[0-9_]+]] + ; M3: beqz $[[T3:[0-9]+]], [[BB0:\.LBB[0-9_]+]] ; M3: move $2, $[[T1]] + ; M3: beqz $[[T3]], [[BB1:\.LBB[0-9_]+]] + ; M3: daddiu $3, $zero, 0 + ; M3: [[EXIT:\.LBB[0-9_]+]]: + ; M3: jr $ra + ; M3: nop + ; M3: [[BB0]]: ; M3: dsllv $[[T4:[0-9]+]], $4, $7 ; M3: dsrl $[[T5:[0-9]+]], $5, 1 ; M3: not $[[T6:[0-9]+]], $[[T0]] ; M3: dsrlv $[[T7:[0-9]+]], $[[T5]], $[[T6]] ; M3: or $2, $[[T4]], $[[T7]] - ; M3: [[BB0]]: - ; M3: bnez $[[T3]], [[BB1:\.LBB[0-9_]+]] + ; M3: bnez $[[T3]], [[EXIT]] ; M3: daddiu $3, $zero, 0 - ; M3: move $3, $[[T1]] ; M3: [[BB1]]: ; M3: jr $ra - ; M3: nop + ; M3: move $3, $[[T1]] ; GP64-NOT-R6: dsllv $[[T0:[0-9]+]], $4, $7 ; GP64-NOT-R6: dsrl $[[T1:[0-9]+]], $5, 1 Index: test/CodeGen/PowerPC/tail-dup-layout.ll =================================================================== --- test/CodeGen/PowerPC/tail-dup-layout.ll +++ test/CodeGen/PowerPC/tail-dup-layout.ll @@ -56,9 +56,6 @@ br i1 %tagbit1eq0, label %test2, label %optional1, !prof !1 optional1: call void @a() - call void @a() - call void @a() - call void @a() br label %test2 test2: %tagbit2 = and i32 %tag, 2 @@ -66,9 +63,6 @@ br i1 %tagbit2eq0, label %test3, label %optional2, !prof !1 optional2: call void @b() - call void @b() - call void @b() - call void @b() br label %test3 test3: %tagbit3 = and i32 %tag, 4 @@ -76,9 +70,6 @@ br i1 %tagbit3eq0, label %test4, label %optional3, !prof !1 optional3: call void @c() - call void @c() - call void @c() - call void @c() br label %test4 test4: %tagbit4 = and i32 %tag, 8 @@ -86,8 +77,86 @@ br i1 %tagbit4eq0, label %exit, label %optional4, !prof !1 optional4: call void @d() - call void @d() - call void @d() + br label %exit +exit: + ret void +} + +; Intended layout: +; The chain-of-triangles based duplicating produces the layout +; test1 +; test2 +; test3 +; test4 +; optional1 +; optional2 +; optional3 +; optional4 +; exit +; even for 50/50 branches. +; Tail duplication puts test n+1 at the end of optional n +; so optional1 includes a copy of test2 at the end, and branches +; to test3 (at the top) or falls through to optional 2. +; The CHECK statements check for the whole string of tests +; and then check that the correct test has been duplicated into the end of +; the optional blocks and that the optional blocks are in the correct order. +;CHECK-LABEL: straight_test_50: +; test1 may have been merged with entry +;CHECK: mr [[TAGREG:[0-9]+]], 3 +;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1 +;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[_0-9A-Za-z]+]] +;CHECK-NEXT: # %test2 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: bne 0, .[[OPT2LABEL:[_0-9A-Za-z]+]] +;CHECK-NEXT: .[[TEST3LABEL:[_0-9A-Za-z]+]]: # %test3 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 +;CHECK-NEXT: bne 0, .[[OPT3LABEL:[_0-9A-Za-z]+]] +;CHECK-NEXT: .[[TEST4LABEL:[_0-9A-Za-z]+]]: # %test4 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28 +;CHECK-NEXT: bne 0, .[[OPT4LABEL:[_0-9A-Za-z]+]] +;CHECK-NEXT: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit +;CHECK: blr +;CHECK-NEXT: .[[OPT1LABEL]]: +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: beq 0, .[[TEST3LABEL]] +;CHECK-NEXT: .[[OPT2LABEL]]: +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 +;CHECK-NEXT: beq 0, .[[TEST4LABEL]] +;CHECK-NEXT: .[[OPT3LABEL]]: +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28 +;CHECK-NEXT: beq 0, .[[EXITLABEL]] +;CHECK-NEXT: .[[OPT4LABEL]]: +;CHECK: b .[[EXITLABEL]] + +define void @straight_test_50(i32 %tag) { +entry: + br label %test1 +test1: + %tagbit1 = and i32 %tag, 1 + %tagbit1eq0 = icmp eq i32 %tagbit1, 0 + br i1 %tagbit1eq0, label %test2, label %optional1, !prof !2 +optional1: + call void @a() + br label %test2 +test2: + %tagbit2 = and i32 %tag, 2 + %tagbit2eq0 = icmp eq i32 %tagbit2, 0 + br i1 %tagbit2eq0, label %test3, label %optional2, !prof !2 +optional2: + call void @b() + br label %test3 +test3: + %tagbit3 = and i32 %tag, 4 + %tagbit3eq0 = icmp eq i32 %tagbit3, 0 + br i1 %tagbit3eq0, label %test4, label %optional3, !prof !2 +optional3: + call void @c() + br label %test4 +test4: + %tagbit4 = and i32 %tag, 8 + %tagbit4eq0 = icmp eq i32 %tagbit4, 0 + br i1 %tagbit4eq0, label %exit, label %optional4, !prof !1 +optional4: call void @d() br label %exit exit: Index: test/CodeGen/X86/cmovcmov.ll =================================================================== --- test/CodeGen/X86/cmovcmov.ll +++ test/CodeGen/X86/cmovcmov.ll @@ -249,16 +249,23 @@ ; CMOV-DAG: cmpl %edx, %esi ; CMOV-DAG: movb $20, %al ; CMOV-DAG: movb $20, %dl -; CMOV: jl [[BB0:.LBB[0-9_]+]] +; CMOV: jge [[BB2:.LBB[0-9_]+]] +; CMOV: jle [[BB3:.LBB[0-9_]+]] +; CMOV: [[BB0:.LBB[0-9_]+]] +; CMOV: testl %edi, %edi +; CMOV: jne [[BB4:.LBB[0-9_]+]] +; CMOV: [[BB1:.LBB[0-9_]+]] +; CMOV: movb %al, g8(%rip) +; CMOV: retq +; CMOV: [[BB2]]: ; CMOV: movl %ecx, %edx -; CMOV: [[BB0]]: -; CMOV: jg [[BB1:.LBB[0-9_]+]] +; CMOV: jg [[BB0]] +; CMOV: [[BB3]]: ; CMOV: movl %edx, %eax -; CMOV: [[BB1]]: ; CMOV: testl %edi, %edi -; CMOV: je [[BB2:.LBB[0-9_]+]] +; CMOV: je [[BB1]] +; CMOV: [[BB4]]: ; CMOV: movl %edx, %eax -; CMOV: [[BB2]]: ; CMOV: movb %al, g8(%rip) ; CMOV: retq define void @no_cascade_opt(i32 %v0, i32 %v1, i32 %v2, i32 %v3) {