Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -74,6 +74,13 @@ "post dominator, out of line."), cl::init(false), cl::Hidden); +static cl::opt PlaceLastSuccessor( + "place-last-successor", + cl::desc("When selecting a non-successor block, choose the last block to " + "have been a successor. This represents the block whose " + "predecessor was most recently placed."), + cl::init(false), cl::Hidden); + namespace { class BlockChain; /// \brief Type for our function-wide basic block -> block chain mapping. @@ -223,11 +230,11 @@ const BlockFilterSet *BlockFilter = nullptr); MachineBasicBlock *selectBestSuccessor(MachineBasicBlock *BB, BlockChain &Chain, - const BlockFilterSet *BlockFilter); - MachineBasicBlock * - selectBestCandidateBlock(BlockChain &Chain, - SmallVectorImpl &WorkList, - const BlockFilterSet *BlockFilter); + const BlockFilterSet *BlockFilter, + bool &HasUnmergedSuccessors); + MachineBasicBlock *selectBestCandidateBlock( + BlockChain &Chain, SmallVectorImpl &WorkList, + const BlockFilterSet *BlockFilter, bool HasUnmergedSuccessors); MachineBasicBlock * getFirstUnplacedBlock(MachineFunction &F, const BlockChain &PlacedChain, MachineFunction::iterator &PrevUnplacedBlockIt, @@ -343,7 +350,8 @@ MachineBasicBlock * MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, BlockChain &Chain, - const BlockFilterSet *BlockFilter) { + const BlockFilterSet *BlockFilter, + bool &HasUnmergedSuccessors) { const BranchProbability HotProb(4, 5); // 80% MachineBasicBlock *BestSucc = nullptr; @@ -370,6 +378,8 @@ continue; } + HasUnmergedSuccessors = true; + uint32_t SuccWeight = MBPI->getEdgeWeight(BB, Succ); BranchProbability SuccProb(SuccWeight / WeightScale, SumWeight); @@ -436,7 +446,25 @@ /// \returns The best block found, or null if none are viable. MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock( BlockChain &Chain, SmallVectorImpl &WorkList, - const BlockFilterSet *BlockFilter) { + const BlockFilterSet *BlockFilter, bool HasUnmergedSuccessors) { + if (PlaceLastSuccessor && HasUnmergedSuccessors) { + // If we're just placing the last successor as the best candidate, the + // logic is super simple. We skip the already placed entries on the + // worklist and return the most recently added entry that isn't placed. + while (!WorkList.empty()) { + MachineBasicBlock *SuccBB = WorkList.pop_back_val(); + BlockChain &SuccChain = *BlockToChain.lookup(SuccBB); + if (&SuccChain == &Chain) { + DEBUG(dbgs() << " " << getBlockName(SuccBB) + << " -> Already merged!\n"); + continue; + } + assert(SuccChain.LoopPredecessors == 0 && "Found CFG-violating block"); + return SuccBB; + } + + return nullptr; + } // Once we need to walk the worklist looking for a candidate, cleanup the // worklist of already placed entries. // FIXME: If this shows up on profiles, it could be folded (at the cost of @@ -513,13 +541,16 @@ // Look for the best viable successor if there is one to place immediately // after this block. - MachineBasicBlock *BestSucc = selectBestSuccessor(BB, Chain, BlockFilter); + bool HasUnmergedSuccessors = false; + MachineBasicBlock *BestSucc = + selectBestSuccessor(BB, Chain, BlockFilter, HasUnmergedSuccessors); // If an immediate successor isn't available, look for the best viable // block among those we've identified as not violating the loop's CFG at // this point. This won't be a fallthrough, but it will increase locality. if (!BestSucc) - BestSucc = selectBestCandidateBlock(Chain, BlockWorkList, BlockFilter); + BestSucc = selectBestCandidateBlock(Chain, BlockWorkList, BlockFilter, + HasUnmergedSuccessors); if (!BestSucc) { BestSucc = Index: test/CodeGen/X86/block-placement.ll =================================================================== --- test/CodeGen/X86/block-placement.ll +++ test/CodeGen/X86/block-placement.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=i686-linux -pre-RA-sched=source < %s | FileCheck %s +; RUN: llc -mtriple=i686-linux -place-last-successor -pre-RA-sched=source < %s | FileCheck %s declare void @error(i32 %i, i32 %a, i32 %b) @@ -1083,3 +1083,87 @@ %ret = phi i32 [ %val1, %then ], [ %val2, %else ] ret i32 %ret } + +define void @test_outlined() { +; This test ends up with diamond control flow in outlined optional regions. +; These diamonds should still be locally cohensive even when out-of-line due to +; being cold. +; CHECK-LABEL: test_outlined: +; CHECK: %a1 +; CHECK: %a2 +; CHECK: %done +; CHECK: %b1 +; CHECK: %c1 +; CHECK: %d1 +; CHECK: %f1 +; CHECK: %b2 +; CHECK: %c2 +; CHECK: %d2 +; CHECK: %f2 + +a1: + %call.a1 = call i1 @a1() + br i1 %call.a1, label %b1, label %a2, !prof !0 + +b1: + %call.b1 = call i1 @b1() + br i1 %call.b1, label %c1, label %d1 + +c1: + call void @c1() + br label %f1 + +d1: + call void @d1() + br label %f1 + +f1: + call void @f1() + br label %a2 + +a2: + %call.a2 = call i1 @a2() + br i1 %call.a2, label %b2, label %done, !prof !0 + +b2: + %call.b2 = call i1 @b2() + br i1 %call.b2, label %c2, label %d2 + +c2: + call void @c2() + br label %f2 + +d2: + call void @d2() + br label %f2 + +f2: + call void @f2() + br label %done + +done: + call void @done() + ret void +} + +declare i1 @a1() + +declare i1 @b1() + +declare void @c1() + +declare void @d1() + +declare void @f1() + +declare i1 @a2() + +declare i1 @b2() + +declare void @c2() + +declare void @d2() + +declare void @f2() + +declare void @done()