Index: include/llvm/CodeGen/TailDuplicator.h =================================================================== --- include/llvm/CodeGen/TailDuplicator.h +++ include/llvm/CodeGen/TailDuplicator.h @@ -34,6 +34,7 @@ const MachineModuleInfo *MMI; MachineRegisterInfo *MRI; bool PreRegAlloc; + bool LayoutMode; // A list of virtual registers for which to update SSA form. SmallVector SSAUpdateVRs; @@ -45,14 +46,18 @@ DenseMap SSAUpdateVals; public: + /// Prepare to run on a specific machine function. + /// LayoutMode - When true, don't use the existing layout to make decisions, + /// and don't remove blocks. void initMF(MachineFunction &MF, const MachineModuleInfo *MMI, - const MachineBranchProbabilityInfo *MBPI); + const MachineBranchProbabilityInfo *MBPI, bool LayoutMode); bool tailDuplicateBlocks(MachineFunction &MF); static bool isSimpleBB(MachineBasicBlock *TailBB); bool shouldTailDuplicate(const MachineFunction &MF, bool IsSimple, MachineBasicBlock &TailBB); bool tailDuplicateAndUpdate(MachineFunction &MF, bool IsSimple, - MachineBasicBlock *MBB); + MachineBasicBlock *MBB, + MachineBasicBlock *ForcedLayoutPred); private: typedef TargetInstrInfo::RegSubRegPair RegSubRegPair; @@ -78,6 +83,7 @@ SmallVectorImpl &Copies); bool tailDuplicate(MachineFunction &MF, bool IsSimple, MachineBasicBlock *TailBB, + MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl &TDBBs, SmallVectorImpl &Copies); void appendCopies(MachineBasicBlock *MBB, Index: lib/CodeGen/BranchFolding.cpp =================================================================== --- lib/CodeGen/BranchFolding.cpp +++ lib/CodeGen/BranchFolding.cpp @@ -591,13 +591,21 @@ /// and decide if it would be profitable to merge those tails. Return the /// length of the common tail and iterators to the first common instruction /// in each block. +/// MBB1, MBB2 The blocks to check +/// I1, I2 Iterator references that will be changed to point to the first +/// instruction in the common tail shared by MBB1,MBB2 +/// SuccBB A common successor of MBB1, MBB2 which are in a canonical form +/// relative to SuccBB +/// PredBB The layout predecessor of SuccBB, if any. +/// AfterPlacement Flag to indicate that block placement has already run. static bool ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2, unsigned minCommonTailLength, unsigned &CommonTailLen, MachineBasicBlock::iterator &I1, MachineBasicBlock::iterator &I2, MachineBasicBlock *SuccBB, MachineBasicBlock *PredBB, - DenseMap &FuncletMembership) { + DenseMap &FuncletMembership, + bool AfterPlacement) { // It is never profitable to tail-merge blocks from two different funclets. if (!FuncletMembership.empty()) { auto Funclet1 = FuncletMembership.find(MBB1); @@ -616,8 +624,10 @@ << '\n'); // It's almost always profitable to merge any number of non-terminator - // instructions with the block that falls through into the common successor. - if (MBB1 == PredBB || MBB2 == PredBB) { + // instructions with the block that falls through into the common successor, + // unless the other block has its own fallthrough. + if ((MBB1 == PredBB && !MBB2->canFallThrough()) + || (MBB2 == PredBB && !MBB1->canFallThrough())) { MachineBasicBlock::iterator I; unsigned NumTerms = CountTerminators(MBB1 == PredBB ? MBB2 : MBB1, I); if (CommonTailLen > NumTerms) @@ -637,10 +647,17 @@ // count that as an additional common instruction for the following // heuristics. unsigned EffectiveTailLen = CommonTailLen; - if (SuccBB && MBB1 != PredBB && MBB2 != PredBB && - !MBB1->back().isBarrier() && - !MBB2->back().isBarrier()) - ++EffectiveTailLen; + // We can't do this during layout because it undoes tail duplication, despite + // non-overlapping thresholds. What happens is that a block with a + // conditional branch and fallthrough gets tail-duplicated and this heuristic + // gets run with the non-fallthrough successor as SuccBB, and counts an extra + // branch, despite the block having only 2 instructions when it was + // duplicated. + if (!AfterPlacement) + if (SuccBB && MBB1 != PredBB && MBB2 != PredBB && + !MBB1->back().isBarrier() && + !MBB2->back().isBarrier()) + ++EffectiveTailLen; // Check if the common tail is long enough to be worthwhile. if (EffectiveTailLen >= minCommonTailLength) @@ -682,7 +699,8 @@ minCommonTailLength, CommonTailLen, TrialBBI1, TrialBBI2, SuccBB, PredBB, - FuncletMembership)) { + FuncletMembership, + AfterBlockPlacement)) { if (CommonTailLen > maxCommonTailLength) { SameTails.clear(); maxCommonTailLength = CommonTailLen; Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -40,6 +40,7 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/TailDuplicator.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -117,6 +118,12 @@ static cl::opt JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden); +static cl::opt +TailDupPlacement("tail-dup-placement", + cl::desc("Perform tail duplication during placement. " + "Creates more fallthrough opportunites in " + "outline branches."), + cl::init(true), cl::Hidden); static cl::opt BranchFoldPlacement("branch-fold-placement", @@ -262,6 +269,13 @@ /// \brief A handle to the post dominator tree. MachineDominatorTree *MDT; + /// \brief Duplicator used to duplicate tails during placement. + /// + /// Placement decisions can open up new tail duplication opportunities, but + /// since tail duplication affects placement decisions of later blocks, it + /// must be done inline. + TailDuplicator TailDup; + /// \brief A set of blocks that are unavoidably execute, i.e. they dominate /// all terminators of the MachineFunction. SmallPtrSet UnavoidableBlocks; @@ -855,6 +869,16 @@ // after this block. MachineBasicBlock *BestSucc = selectBestSuccessor(BB, Chain, BlockFilter); + // Placing an actual successor may have changed tail duplication + // opportunities. Check for that now. + if (TailDupPlacement && BestSucc) { + DEBUG(dbgs() << "Redoing tail duplication for BestSucc#" + << BestSucc->getNumber() << "\n"); + bool IsSimple = TailDup.isSimpleBB(BestSucc); + if (TailDup.shouldTailDuplicate(*F, IsSimple, *BestSucc)) + TailDup.tailDuplicateAndUpdate(*F, IsSimple, BestSucc, BB); + } + // If an immediate successor isn't available, look for the best viable // block among those we've identified as not violating the loop's CFG at // this point. This won't be a fallthrough, but it will increase locality. @@ -1654,6 +1678,10 @@ TII = MF.getSubtarget().getInstrInfo(); TLI = MF.getSubtarget().getTargetLowering(); MDT = &getAnalysis(); + auto MMI = getAnalysisIfAvailable(); + if (TailDupPlacement) + TailDup.initMF(MF, MMI, MBPI, /* LayoutMode */ true); + assert(BlockToChain.empty()); buildCFGChains(); @@ -1675,6 +1703,8 @@ /*AfterBlockPlacement=*/true)) { // Redo the layout if tail merging creates/removes/moves blocks. BlockToChain.clear(); + // Must redo the dominator tree if blocks were changed. + MDT->runOnMachineFunction(MF); ChainAllocator.DestroyAll(); buildCFGChains(); } Index: lib/CodeGen/TailDuplication.cpp =================================================================== --- lib/CodeGen/TailDuplication.cpp +++ lib/CodeGen/TailDuplication.cpp @@ -50,7 +50,7 @@ auto MMI = getAnalysisIfAvailable(); auto MBPI = &getAnalysis(); - Duplicator.initMF(MF, MMI, MBPI); + Duplicator.initMF(MF, MMI, MBPI, /* LayoutMode */ false); bool MadeChange = false; while (Duplicator.tailDuplicateBlocks(MF)) Index: lib/CodeGen/TailDuplicator.cpp =================================================================== --- lib/CodeGen/TailDuplicator.cpp +++ lib/CodeGen/TailDuplicator.cpp @@ -57,7 +57,8 @@ namespace llvm { void TailDuplicator::initMF(MachineFunction &MF, const MachineModuleInfo *MMIin, - const MachineBranchProbabilityInfo *MBPIin) { + const MachineBranchProbabilityInfo *MBPIin, + bool LayoutModeIn) { TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); @@ -66,6 +67,7 @@ assert(MBPI != nullptr && "Machine Branch Probability Info required"); + LayoutMode = LayoutModeIn; PreRegAlloc = MRI->isSSA(); } @@ -119,15 +121,16 @@ } /// Tail duplicate the block and cleanup. -bool TailDuplicator::tailDuplicateAndUpdate(MachineFunction &MF, bool IsSimple, - MachineBasicBlock *MBB) { +bool TailDuplicator::tailDuplicateAndUpdate( + MachineFunction &MF, bool IsSimple, MachineBasicBlock *MBB, + MachineBasicBlock *ForcedLayoutPred) { // Save the successors list. SmallSetVector Succs(MBB->succ_begin(), MBB->succ_end()); SmallVector TDBBs; SmallVector Copies; - if (!tailDuplicate(MF, IsSimple, MBB, TDBBs, Copies)) + if (!tailDuplicate(MF, IsSimple, MBB, ForcedLayoutPred, TDBBs, Copies)) return false; ++NumTails; @@ -241,7 +244,7 @@ if (!shouldTailDuplicate(MF, IsSimple, *MBB)) continue; - MadeChange |= tailDuplicateAndUpdate(MF, IsSimple, MBB); + MadeChange |= tailDuplicateAndUpdate(MF, IsSimple, MBB, nullptr); } if (PreRegAlloc && TailDupVerify) @@ -506,8 +509,18 @@ bool TailDuplicator::shouldTailDuplicate(const MachineFunction &MF, bool IsSimple, MachineBasicBlock &TailBB) { - // Only duplicate blocks that end with unconditional branches. - if (TailBB.canFallThrough()) + // When doing tail-duplication during layout, the block ordering is in flux, + // so canFallThrough returns a result based on incorrect information and + // should just be ignored. + if (!LayoutMode && TailBB.canFallThrough()) + return false; + + // If the block to be duplicated ends in an unanalyzable fallthrough, don't + // duplicate it. + MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr; + SmallVector PredCond; + if (TailBB.canFallThrough() && + TII->AnalyzeBranch(TailBB, PredTBB, PredFBB, PredCond, true)) return false; // Don't try to tail-duplicate single-block loops. @@ -719,8 +732,14 @@ /// If it is profitable, duplicate TailBB's contents in each /// of its predecessors. +/// ForcedLayoutPred When non-null, use this block as the layout predecessor +/// instead of the previous block in MF's order. +/// TDBBs A vector to keep track of all blocks tail-duplicated into. +/// Copies A vector of copy instructions inserted. Used later to walk +/// all the inserted copies and remove redundant ones. bool TailDuplicator::tailDuplicate(MachineFunction &MF, bool IsSimple, MachineBasicBlock *TailBB, + MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl &TDBBs, SmallVectorImpl &Copies) { DEBUG(dbgs() << "\n*** Tail-duplicating BB#" << TailBB->getNumber() << '\n'); @@ -755,7 +774,12 @@ if (!PredCond.empty()) continue; // Don't duplicate into a fall-through predecessor (at least for now). - if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough()) + bool IsLayoutSuccessor = false; + if (ForcedLayoutPred) + IsLayoutSuccessor = (ForcedLayoutPred == PredBB); + else if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough()) + IsLayoutSuccessor = true; + if (IsLayoutSuccessor) continue; DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB @@ -807,13 +831,14 @@ // If TailBB was duplicated into all its predecessors except for the prior // block, which falls through unconditionally, move the contents of this - // block into the prior block. + // block into the prior block. Don't do this when LayoutMode is + // true, as there is no point in removing the block during layout. MachineBasicBlock *PrevBB = &*std::prev(TailBB->getIterator()); MachineBasicBlock *PriorTBB = nullptr, *PriorFBB = nullptr; SmallVector PriorCond; // This has to check PrevBB->succ_size() because EH edges are ignored by // AnalyzeBranch. - if (PrevBB->succ_size() == 1 && + if (!LayoutMode && PrevBB->succ_size() == 1 && !TII->AnalyzeBranch(*PrevBB, PriorTBB, PriorFBB, PriorCond, true) && PriorCond.empty() && !PriorTBB && TailBB->pred_size() == 1 && !TailBB->hasAddressTaken()) { Index: test/CodeGen/AArch64/machine_cse.ll =================================================================== --- test/CodeGen/AArch64/machine_cse.ll +++ test/CodeGen/AArch64/machine_cse.ll @@ -1,4 +1,8 @@ -; RUN: llc < %s -mtriple=aarch64-linux-gnuabi -O2 | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-linux-gnuabi -O2 -tail-dup-placement=0 | FileCheck %s +; -tail-dup-placement causes tail duplication during layout. This breaks the +; assumptions of the test case as written (specifically, it creates an +; additional cmp instruction, creating a false positive), so we pass +; -tail-dup-placement=0 to restore the original behavior ; marked as external to prevent possible optimizations @a = external global i32 Index: test/CodeGen/Hexagon/rdf-copy.ll =================================================================== --- test/CodeGen/Hexagon/rdf-copy.ll +++ test/CodeGen/Hexagon/rdf-copy.ll @@ -17,7 +17,7 @@ ; CHECK: [[DST:r[0-9]+]] = [[SRC:r[0-9]+]] ; CHECK-DAG: memw([[SRC]] ; CHECK-NOT: memw([[DST]] -; CHECK-LABEL: LBB0_2 +; CHECK: %if.end target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32" target triple = "hexagon" Index: test/CodeGen/PowerPC/branch-opt.ll =================================================================== --- test/CodeGen/PowerPC/branch-opt.ll +++ test/CodeGen/PowerPC/branch-opt.ll @@ -1,9 +1,18 @@ -; RUN: llc < %s -march=ppc32 | \ -; RUN: grep "b LBB.*" | count 4 +; RUN: llc < %s -march=ppc32 | FileCheck %s target datalayout = "E-p:32:32" target triple = "powerpc-apple-darwin8.7.0" +;CHECK-LABEL: foo: +; There are 3 inner loops (%bb, %bb12, %bb25) that all exit to %cond_next48 +; The last (whichever it is) should have a fallthrough exit, and the other two +; need an unconditional branch. No other block should have an unconditional +; branch to cond_next48 +;CHECK: b LBB0_13 +;CHECK: b LBB0_13 +;CHECK-NOT: b LBB0_13 +;CHECK: LBB0_13: ; %cond_next48 + define void @foo(i32 %W, i32 %X, i32 %Y, i32 %Z) { entry: %tmp1 = and i32 %W, 1 ; [#uses=1] Index: test/CodeGen/PowerPC/tail-dup-layout.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/tail-dup-layout.ll @@ -0,0 +1,100 @@ +; RUN: llc -outline-optional-branches -O2 < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-grtev4-linux-gnu" + +; Intended layout: +; The outlining flag produces the layout +; test1 +; test2 +; test3 +; test4 +; exit +; optional1 +; optional2 +; optional3 +; optional4 +; Tail duplication puts test n+1 at the end of optional n +; so optional1 includes a copy of test2 at the end, and branches +; to test3 (at the top) or falls through to optional 2. +; The CHECK statements check for the whole string of tests and exit block, +; and then check that the correct test has been duplicated into the end of +; the optional blocks and that the optional blocks are in the correct order. +;CHECK-LABEL: f: +; test1 may have been merged with entry +;CHECK: mr [[TAGREG:[0-9]+]], 3 +;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1 +;CHECK-NEXT: bc 12, 1, [[OPT1LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[TEST2LABEL:[._0-9A-Za-z]+]]: # %test2 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: bne 0, [[OPT2LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 +;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %test4 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28 +;CHECK-NEXT: bne 0, .[[OPT4LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit +;CHECK: blr +;CHECK-NEXT: [[OPT1LABEL]] +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: beq 0, [[TEST3LABEL]] +;CHECK-NEXT: [[OPT2LABEL]] +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 +;CHECK-NEXT: beq 0, [[TEST4LABEL]] +;CHECK-NEXT: [[OPT3LABEL]] +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28 +;CHECK-NEXT: beq 0, [[EXITLABEL]] +;CHECK-NEXT: [[OPT4LABEL]] +;CHECK: b [[EXITLABEL]] + +define void @f(i32 %tag) { +entry: + br label %test1 +test1: + %tagbit1 = and i32 %tag, 1 + %tagbit1eq0 = icmp eq i32 %tagbit1, 0 + br i1 %tagbit1eq0, label %test2, label %optional1 +optional1: + call void @a() + call void @a() + call void @a() + call void @a() + br label %test2 +test2: + %tagbit2 = and i32 %tag, 2 + %tagbit2eq0 = icmp eq i32 %tagbit2, 0 + br i1 %tagbit2eq0, label %test3, label %optional2 +optional2: + call void @b() + call void @b() + call void @b() + call void @b() + br label %test3 +test3: + %tagbit3 = and i32 %tag, 4 + %tagbit3eq0 = icmp eq i32 %tagbit3, 0 + br i1 %tagbit3eq0, label %test4, label %optional3 +optional3: + call void @c() + call void @c() + call void @c() + call void @c() + br label %test4 +test4: + %tagbit4 = and i32 %tag, 8 + %tagbit4eq0 = icmp eq i32 %tagbit4, 0 + br i1 %tagbit4eq0, label %exit, label %optional4 +optional4: + call void @d() + call void @d() + call void @d() + call void @d() + br label %exit +exit: + ret void +} + +declare void @a() +declare void @b() +declare void @c() +declare void @d() Index: test/CodeGen/WebAssembly/cfg-stackify.ll =================================================================== --- test/CodeGen/WebAssembly/cfg-stackify.ll +++ test/CodeGen/WebAssembly/cfg-stackify.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false | FileCheck %s -; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -verify-machineinstrs -fast-isel=false | FileCheck -check-prefix=OPT %s +; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0 -verify-machineinstrs -fast-isel=false | FileCheck -check-prefix=OPT %s ; Test the CFG stackifier pass. Index: test/CodeGen/WebAssembly/mem-intrinsics.ll =================================================================== --- test/CodeGen/WebAssembly/mem-intrinsics.ll +++ test/CodeGen/WebAssembly/mem-intrinsics.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s +; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0| FileCheck %s ; Test memcpy, memmove, and memset intrinsics. Index: test/CodeGen/X86/fp-une-cmp.ll =================================================================== --- test/CodeGen/X86/fp-une-cmp.ll +++ test/CodeGen/X86/fp-une-cmp.ll @@ -56,11 +56,11 @@ ; CHECK-NEXT: ucomisd %xmm1, %xmm0 ; CHECK-NEXT: jne .LBB1_1 ; CHECK-NEXT: jp .LBB1_1 -; CHECK-NEXT: .LBB1_2: # %bb2 +; CHECK-NEXT: # %bb2 ; CHECK-NEXT: retq ; CHECK-NEXT: .LBB1_1: # %bb1 ; CHECK-NEXT: addsd {{.*}}(%rip), %xmm0 -; CHECK-NEXT: jmp .LBB1_2 +; CHECK-NEXT: retq entry: %mul = fmul double %x, %y Index: test/CodeGen/X86/ragreedy-bug.ll =================================================================== --- test/CodeGen/X86/ragreedy-bug.ll +++ test/CodeGen/X86/ragreedy-bug.ll @@ -7,12 +7,21 @@ ; CHECK-NEXT: in Loop ; CHECK-NEXT: testl ; CHECK-NEXT: jne -; CHECK: isupper.exit +; isupper.exit223 gets tail-duplicated into all of its predecessors. Check them instead +; CHECK: cond.true.i.i217 ; CHECK-NEXT: in Loop +; Mem-move +; CHECK-NEXT: movl +; CHECK-NEXT: andl ; CHECK-NEXT: testl ; CHECK-NEXT: je ; CHECK: maskrune +; CHECK: cond.false.i.i219 ; CHECK: maskrune +; CHECK-NEXT: movzbl +; CHECK-NEXT: movzbl +; CHECK-NEXT: testl +; CHECK-NEXT: jne %struct.List_o_links_struct = type { i32, i32, i32, %struct.List_o_links_struct* } %struct.Connector_struct = type { i16, i16, i8, i8, %struct.Connector_struct*, i8* }