Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -50,6 +50,7 @@ #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetSubtargetInfo.h" #include +#include #include #include using namespace llvm; @@ -143,6 +144,14 @@ cl::init(2), cl::Hidden); +// Heuristic for triangle chains. +static cl::opt TriangleChainCount( + "triangle-chain-count", + cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " + "triangle tail duplication heuristic to kick in. 0 to disable."), + cl::init(3), + cl::Hidden); + extern cl::opt StaticLikelyProb; extern cl::opt ProfileLikelyProb; @@ -456,6 +465,9 @@ bool canTailDuplicateUnplacedPreds( const MachineBasicBlock *BB, MachineBasicBlock *Succ, const BlockChain &Chain, const BlockFilterSet *BlockFilter); + /// Find chains of triangles to tail-duplicate where a global analysis works, + /// but a local analysis would not find them. + void precomputeTriangleChains(); public: static char ID; // Pass identification, replacement for typeid @@ -1041,6 +1053,127 @@ return true; } +/// Find chains of triangles where we believe it would be profitable to +/// tail-duplicate them all, but a local analysis would not find them. +/// There are 3 ways this can be profitable: +/// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with +/// longer chains) +/// 2) The chains are statically correlated. Branch probabilities have a very +/// U-shaped distribution. +/// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805] +/// If the branches in a chain are likely to be from the same side of the +/// distribution as their predecessor, but are independent at runtime, this +/// transformation is profitable. (Because the cost of being wrong is a small +/// fixed cost, unlike the standard triangle layout where the cost of being +/// wrong scales with the # of triangles.) +/// 3) The chains are dynamically correlated. If the probability that a previous +/// branch was taken positively influences whether the next branch will be +/// taken +/// We believe that 2 and 3 are common enough to justify the small margin in 1. +void MachineBlockPlacement::precomputeTriangleChains() { + struct TriangleChain { + unsigned Count; + std::forward_list Edges; + TriangleChain(MachineBasicBlock* src, MachineBasicBlock *dst) { + Edges.push_front(src); + Edges.push_front(dst); + Count = 1; + } + + void append(MachineBasicBlock *dst) { + assert(!Edges.empty() && Edges.front()->isSuccessor(dst) && + "Attempting to append a block that is not a successor."); + Edges.push_front(dst); + ++Count; + } + + MachineBasicBlock *getKey() { + return Edges.front(); + } + }; + + if (TriangleChainCount == 0) + return; + + DEBUG(dbgs() << "Pre-computing triangle chains.\n"); + // Map from last block to the chain that contains it. This allows us to extend + // chains as we find new triangles. + DenseMap TriangleChainMap; + for (MachineBasicBlock &BB : *F) { + // If BB doesn't have 2 successors, it doesn't start a triangle. + if (BB.succ_size() != 2) + continue; + MachineBasicBlock *PDom = nullptr; + for (MachineBasicBlock *Succ : BB.successors()) { + if (!MPDT->dominates(Succ, &BB)) + continue; + PDom = Succ; + break; + } + // If BB doesn't have a post-dominating successor, it doesn't form a + // triangle. + if (PDom == nullptr) + continue; + // If PDom has a hint that it is low probability, skip this triangle. + if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100)) + continue; + // If PDom isn't eligible for duplication, this isn't the kind of triangle + // we're looking for. + if (!shouldTailDuplicate(PDom)) + continue; + bool CanTailDuplicate = true; + // If PDom can't tail-duplicate into it's non-BB predecessors, then this + // isn't the kind of triangle we're looking for. + for (MachineBasicBlock* Pred : PDom->predecessors()) { + if (Pred == &BB) + continue; + if (!TailDup.canTailDuplicate(PDom, Pred)) { + CanTailDuplicate = false; + break; + } + } + // If we can't tail-duplicate PDom to its predecessors, then skip this + // triangle. + if (!CanTailDuplicate) + continue; + + // Now we have an interesting triangle. Insert it if it's not part of an + // existing chain + // Note: This cannot be replaced with a call insert() or emplace() because + // the find key is BB, but the insert/emplace key is PDom. + auto Found = TriangleChainMap.find(&BB); + // If it is, remove the chain from the map, grow it, and put it back in the + // map with the end as the new key. + if (Found != TriangleChainMap.end()) { + TriangleChain Chain = std::move(Found->second); + TriangleChainMap.erase(Found); + Chain.append(PDom); + TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain))); + } else { + auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom); + assert (InsertResult.second && "Block seen twice."); + (void) InsertResult; + } + } + + for (auto &ChainPair : TriangleChainMap) { + TriangleChain &Chain = ChainPair.second; + // Benchmarking has shown that due to branch correlation duplicating 2 or + // more triangles is profitable, despite the calculations assuming + // independence. + if (Chain.Count < TriangleChainCount) + continue; + MachineBasicBlock *dst = Chain.Edges.front(); + Chain.Edges.pop_front(); + for (MachineBasicBlock *src : Chain.Edges) { + DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->" << + getBlockName(dst) << " as pre-computed based on triangles.\n"); + ComputedEdges[src] = { dst, true }; + dst = src; + } + } +} + // When profile is not present, return the StaticLikelyProb. // When profile is available, we need to handle the triangle-shape CFG. static BranchProbability getLayoutSuccessorProbThreshold( @@ -2504,6 +2637,7 @@ if (MF.getFunction()->optForSize()) TailDupSize = 1; TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize); + precomputeTriangleChains(); } assert(BlockToChain.empty()); Index: test/CodeGen/PowerPC/tail-dup-layout.ll =================================================================== --- test/CodeGen/PowerPC/tail-dup-layout.ll +++ test/CodeGen/PowerPC/tail-dup-layout.ll @@ -95,6 +95,87 @@ } ; Intended layout: +; The chain-of-triangles based duplicating produces the layout +; test1 +; test2 +; test3 +; test4 +; optional1 +; optional2 +; optional3 +; optional4 +; exit +; even for 50/50 branches. +; Tail duplication puts test n+1 at the end of optional n +; so optional1 includes a copy of test2 at the end, and branches +; to test3 (at the top) or falls through to optional 2. +; The CHECK statements check for the whole string of tests +; and then check that the correct test has been duplicated into the end of +; the optional blocks and that the optional blocks are in the correct order. +;CHECK-LABEL: straight_test_50: +; test1 may have been merged with entry +;CHECK: mr [[TAGREG:[0-9]+]], 3 +;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1 +;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[_0-9A-Za-z]+]] +;CHECK-NEXT: # %test2 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: bne 0, .[[OPT2LABEL:[_0-9A-Za-z]+]] +;CHECK-NEXT: .[[TEST3LABEL:[_0-9A-Za-z]+]]: # %test3 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 +;CHECK-NEXT: bne 0, .[[OPT3LABEL:[_0-9A-Za-z]+]] +;CHECK-NEXT: .[[TEST4LABEL:[_0-9A-Za-z]+]]: # %test4 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28 +;CHECK-NEXT: bne 0, .[[OPT4LABEL:[_0-9A-Za-z]+]] +;CHECK-NEXT: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit +;CHECK: blr +;CHECK-NEXT: .[[OPT1LABEL]]: +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: beq 0, .[[TEST3LABEL]] +;CHECK-NEXT: .[[OPT2LABEL]]: +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 +;CHECK-NEXT: beq 0, .[[TEST4LABEL]] +;CHECK-NEXT: .[[OPT3LABEL]]: +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28 +;CHECK-NEXT: beq 0, .[[EXITLABEL]] +;CHECK-NEXT: .[[OPT4LABEL]]: +;CHECK: b .[[EXITLABEL]] + +define void @straight_test_50(i32 %tag) { +entry: + br label %test1 +test1: + %tagbit1 = and i32 %tag, 1 + %tagbit1eq0 = icmp eq i32 %tagbit1, 0 + br i1 %tagbit1eq0, label %test2, label %optional1, !prof !2 +optional1: + call void @a() + br label %test2 +test2: + %tagbit2 = and i32 %tag, 2 + %tagbit2eq0 = icmp eq i32 %tagbit2, 0 + br i1 %tagbit2eq0, label %test3, label %optional2, !prof !2 +optional2: + call void @b() + br label %test3 +test3: + %tagbit3 = and i32 %tag, 4 + %tagbit3eq0 = icmp eq i32 %tagbit3, 0 + br i1 %tagbit3eq0, label %test4, label %optional3, !prof !2 +optional3: + call void @c() + br label %test4 +test4: + %tagbit4 = and i32 %tag, 8 + %tagbit4eq0 = icmp eq i32 %tagbit4, 0 + br i1 %tagbit4eq0, label %exit, label %optional4, !prof !1 +optional4: + call void @d() + br label %exit +exit: + ret void +} + +; Intended layout: ; The chain-based outlining produces the layout ; entry ; --- Begin loop --- Index: test/CodeGen/X86/cmovcmov.ll =================================================================== --- test/CodeGen/X86/cmovcmov.ll +++ test/CodeGen/X86/cmovcmov.ll @@ -249,16 +249,23 @@ ; CMOV-DAG: cmpl %edx, %esi ; CMOV-DAG: movb $20, %al ; CMOV-DAG: movb $20, %dl -; CMOV: jl [[BB0:.LBB[0-9_]+]] +; CMOV: jge [[BB2:.LBB[0-9_]+]] +; CMOV: jle [[BB3:.LBB[0-9_]+]] +; CMOV: [[BB0:.LBB[0-9_]+]] +; CMOV: testl %edi, %edi +; CMOV: jne [[BB4:.LBB[0-9_]+]] +; CMOV: [[BB1:.LBB[0-9_]+]] +; CMOV: movb %al, g8(%rip) +; CMOV: retq +; CMOV: [[BB2]]: ; CMOV: movl %ecx, %edx -; CMOV: [[BB0]]: -; CMOV: jg [[BB1:.LBB[0-9_]+]] +; CMOV: jg [[BB0]] +; CMOV: [[BB3]]: ; CMOV: movl %edx, %eax -; CMOV: [[BB1]]: ; CMOV: testl %edi, %edi -; CMOV: je [[BB2:.LBB[0-9_]+]] +; CMOV: je [[BB1]] +; CMOV: [[BB4]]: ; CMOV: movl %edx, %eax -; CMOV: [[BB2]]: ; CMOV: movb %al, g8(%rip) ; CMOV: retq define void @no_cascade_opt(i32 %v0, i32 %v1, i32 %v2, i32 %v3) {