Index: include/llvm/CodeGen/TailDuplicator.h =================================================================== --- include/llvm/CodeGen/TailDuplicator.h +++ include/llvm/CodeGen/TailDuplicator.h @@ -51,9 +51,10 @@ bool tailDuplicateBlocks(MachineFunction &MF); static bool isSimpleBB(MachineBasicBlock *TailBB); bool shouldTailDuplicate(const MachineFunction &MF, bool IsSimple, - MachineBasicBlock &TailBB); + MachineBasicBlock &TailBB, bool IgnoreFallthrough); bool tailDuplicateAndUpdate(MachineFunction &MF, bool IsSimple, - MachineBasicBlock *MBB); + MachineBasicBlock *MBB, + MachineBasicBlock *ForcedLayoutPred); private: void addSSAUpdateEntry(unsigned OrigReg, unsigned NewReg, @@ -77,6 +78,7 @@ SmallVectorImpl &Copies); bool tailDuplicate(MachineFunction &MF, bool IsSimple, MachineBasicBlock *TailBB, + MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl &TDBBs, SmallVectorImpl &Copies); Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -38,6 +38,7 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/TailDuplicator.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -110,6 +111,12 @@ static cl::opt JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden); +static cl::opt +TailDupPlacement("tail-dup-placement", + cl::desc("Perform tail duplication during placement. " + "Creates more fallthrough opportunites in " + "outline branches."), + cl::init(true), cl::Hidden); namespace { class BlockChain; @@ -239,6 +246,13 @@ /// \brief A handle to the post dominator tree. MachineDominatorTree *MDT; + /// \brief Duplicator used to duplicate tails during placement. + /// + /// Placement decisions can open up new tail duplication opportunities, but + /// since tail duplication affects placement decisions of later blocks, it + /// must be done inline. + TailDuplicator TailDup; + /// \brief A set of blocks that are unavoidably execute, i.e. they dominate /// all terminators of the MachineFunction. SmallPtrSet UnavoidableBlocks; @@ -672,6 +686,24 @@ // after this block. MachineBasicBlock *BestSucc = selectBestSuccessor(BB, Chain, BlockFilter); + // Placing an actual successor may have changed tail duplication + // opportunities. Check for that now. + if (TailDupPlacement && BestSucc) { + DEBUG(dbgs() << "Redoing tail duplication for BestSucc#" + << BestSucc->getNumber() << "\n"); + bool IsSimple = TailDup.isSimpleBB(BestSucc); + bool IgnoreFallthrough = true; + // Simple blocks should just fallthrough, so only worry about non-simple + // ones. + if (!IsSimple && TailDup.shouldTailDuplicate(F, IsSimple, + *BestSucc, IgnoreFallthrough)) { + SmallVector Cond; + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + if (!TII->AnalyzeBranch(*BestSucc, TBB, FBB, Cond)) + TailDup.tailDuplicateAndUpdate(F, IsSimple, BestSucc, BB); + } + } + // If an immediate successor isn't available, look for the best viable // block among those we've identified as not violating the loop's CFG at // this point. This won't be a fallthrough, but it will increase locality. @@ -1439,6 +1471,10 @@ TII = F.getSubtarget().getInstrInfo(); TLI = F.getSubtarget().getTargetLowering(); MDT = &getAnalysis(); + auto MMI = getAnalysisIfAvailable(); + if (TailDupPlacement) + TailDup.initMF(F, MMI, MBPI); + assert(BlockToChain.empty()); buildCFGChains(F); Index: lib/CodeGen/TailDuplicator.cpp =================================================================== --- lib/CodeGen/TailDuplicator.cpp +++ lib/CodeGen/TailDuplicator.cpp @@ -120,15 +120,16 @@ } /// Tail duplicate the block and cleanup. -bool TailDuplicator::tailDuplicateAndUpdate(MachineFunction &MF, bool IsSimple, - MachineBasicBlock *MBB) { +bool TailDuplicator::tailDuplicateAndUpdate( + MachineFunction &MF, bool IsSimple, MachineBasicBlock *MBB, + MachineBasicBlock *ForcedLayoutPred) { // Save the successors list. SmallSetVector Succs(MBB->succ_begin(), MBB->succ_end()); SmallVector TDBBs; SmallVector Copies; - if (!tailDuplicate(MF, IsSimple, MBB, TDBBs, Copies)) + if (!tailDuplicate(MF, IsSimple, MBB, ForcedLayoutPred, TDBBs, Copies)) return false; ++NumTails; @@ -239,10 +240,11 @@ bool IsSimple = isSimpleBB(MBB); - if (!shouldTailDuplicate(MF, IsSimple, *MBB)) + bool IgnoreFallthrough = false; + if (!shouldTailDuplicate(MF, IsSimple, *MBB, IgnoreFallthrough)) continue; - MadeChange |= tailDuplicateAndUpdate(MF, IsSimple, MBB); + MadeChange |= tailDuplicateAndUpdate(MF, IsSimple, MBB, nullptr); } if (PreRegAlloc && TailDupVerify) @@ -458,9 +460,12 @@ /// Determine if it is profitable to duplicate this block. bool TailDuplicator::shouldTailDuplicate(const MachineFunction &MF, bool IsSimple, - MachineBasicBlock &TailBB) { - // Only duplicate blocks that end with unconditional branches. - if (TailBB.canFallThrough()) + MachineBasicBlock &TailBB, + bool IgnoreFallthrough) { + // IgnoreFallthrough is set when considering duplication during layout. + // Because the ultimate layout may change, it is better to consider + // duplicating blocks that can't fall through. + if (TailBB.canFallThrough() && !IgnoreFallthrough) return false; // Don't try to tail-duplicate single-block loops. @@ -674,6 +679,7 @@ /// of its predecessors. bool TailDuplicator::tailDuplicate(MachineFunction &MF, bool IsSimple, MachineBasicBlock *TailBB, + MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl &TDBBs, SmallVectorImpl &Copies) { DEBUG(dbgs() << "\n*** Tail-duplicating BB#" << TailBB->getNumber() << '\n'); @@ -708,7 +714,12 @@ if (!PredCond.empty()) continue; // Don't duplicate into a fall-through predecessor (at least for now). - if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough()) + bool IsLayoutSuccessor = false; + if (ForcedLayoutPred) + IsLayoutSuccessor = (ForcedLayoutPred == PredBB); + else if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough()) + IsLayoutSuccessor = true; + if (IsLayoutSuccessor) continue; DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB @@ -779,13 +790,14 @@ // If TailBB was duplicated into all its predecessors except for the prior // block, which falls through unconditionally, move the contents of this - // block into the prior block. + // block into the prior block. Don't do this when ForcedLayoutPred is + // non-null, as it can break layout to remove blocks. MachineBasicBlock *PrevBB = &*std::prev(TailBB->getIterator()); MachineBasicBlock *PriorTBB = nullptr, *PriorFBB = nullptr; SmallVector PriorCond; // This has to check PrevBB->succ_size() because EH edges are ignored by // AnalyzeBranch. - if (PrevBB->succ_size() == 1 && + if (ForcedLayoutPred == nullptr && PrevBB->succ_size() == 1 && !TII->AnalyzeBranch(*PrevBB, PriorTBB, PriorFBB, PriorCond, true) && PriorCond.empty() && !PriorTBB && TailBB->pred_size() == 1 && !TailBB->hasAddressTaken()) { Index: test/CodeGen/AArch64/machine_cse.ll =================================================================== --- test/CodeGen/AArch64/machine_cse.ll +++ test/CodeGen/AArch64/machine_cse.ll @@ -1,4 +1,8 @@ -; RUN: llc < %s -mtriple=aarch64-linux-gnuabi -O2 | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-linux-gnuabi -O2 -tail-dup-placement=0 | FileCheck %s +; -tail-dup-placement causes tail duplication during layout. This breaks the +; assumptions of the test case as written (specifically, it creates an +; additional cmp instruction, creating a false positive), so we pass +; -tail-dup-placement=0 to restore the original behavior ; marked as external to prevent possible optimizations @a = external global i32 Index: test/CodeGen/PowerPC/branch-opt.ll =================================================================== --- test/CodeGen/PowerPC/branch-opt.ll +++ test/CodeGen/PowerPC/branch-opt.ll @@ -1,9 +1,14 @@ -; RUN: llc < %s -march=ppc32 | \ -; RUN: grep "b LBB.*" | count 4 +; RUN: llc < %s -march=ppc32 | FileCheck %s target datalayout = "E-p:32:32" target triple = "powerpc-apple-darwin8.7.0" +;CHECK-LABEL: foo: +;CHECK: b LBB0_16 +;CHECK: b LBB0_14 +;CHECK: b LBB0_14 +;CHECK-NOT: b LBB + define void @foo(i32 %W, i32 %X, i32 %Y, i32 %Z) { entry: %tmp1 = and i32 %W, 1 ; [#uses=1] Index: test/CodeGen/PowerPC/tail-dup-layout.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/tail-dup-layout.ll @@ -0,0 +1,100 @@ +; RUN: llc -outline-optional-branches -O2 < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-grtev4-linux-gnu" + +%struct.ptrpair = type { i8*, i8* } +;CHECK-LABEL: f: +;CHECK: # %test1 +;CHECK-NEXT: andi. {{[0-9]+}}, 4, 1 +;CHECK-NEXT: bc 12, 1, [[OPT1LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[TEST2LABEL:[._0-9A-Za-z]+]]: # %test2 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, 4, 0, 30, 30 +;CHECK-NEXT: bne 0, [[OPT2LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, 4, 0, 29, 29 +;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %test4 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, 4, 0, 28, 28 +;CHECK-NEXT: bne 0, .[[OPT4LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit +;CHECK-NEXT: std 5, 0(3) +;CHECK-NEXT: std 6, 8(3) +;CHECK-NEXT: blr +;CHECK-NEXT: [[OPT1LABEL]] +;CHECK: rlwinm. {{[0-9]+}}, 4, 0, 30, 30 +;CHECK-NEXT: beq 0, [[TEST3LABEL]] +;CHECK-NEXT: [[OPT2LABEL]] +;CHECK: rlwinm. {{[0-9]+}}, 4, 0, 29, 29 +;CHECK-NEXT: beq 0, [[TEST4LABEL]] +;CHECK-NEXT: [[OPT3LABEL]] +;CHECK: rlwinm. {{[0-9]+}}, 4, 0, 28, 28 +;CHECK-NEXT: beq 0, [[EXITLABEL]] +;CHECK-NEXT: [[OPT4LABEL]] +;CHECK: b [[EXITLABEL]] + +define void @f(%struct.ptrpair* noalias nocapture sret %result, i32 %tag, i8* %source1, i8* %sink1) { +entry: + br label %test1 +test1: + %tagbit1 = and i32 %tag, 1 + %tagbit1eq0 = icmp eq i32 %tagbit1, 0 + br i1 %tagbit1eq0, label %test2, label %optional1 +optional1: + store i8 97, i8* %sink1 + %read1 = load i8, i8* %source1 + %sink_store1 = getelementptr inbounds i8, i8* %sink1, i64 1 + store i8 %read1, i8* %sink_store1 + %sink_bump1 = getelementptr inbounds i8, i8* %sink1, i64 2 + %source_bump1 = getelementptr inbounds i8, i8* %source1, i64 1 + br label %test2 +test2: + %source2 = phi i8*[%source1, %test1], [%source_bump1, %optional1] + %sink2 = phi i8*[%sink1, %test1], [%sink_bump1, %optional1] + %tagbit2 = and i32 %tag, 2 + %tagbit2eq0 = icmp eq i32 %tagbit2, 0 + br i1 %tagbit2eq0, label %test3, label %optional2 +optional2: + store i8 98, i8* %sink2 + %read2 = load i8, i8* %source2 + %sink_store2 = getelementptr inbounds i8, i8* %sink2, i64 1 + store i8 %read2, i8* %sink_store2 + %sink_bump2 = getelementptr inbounds i8, i8* %sink2, i64 2 + %source_bump2 = getelementptr inbounds i8, i8* %source2, i64 1 + br label %test3 +test3: + %source3 = phi i8*[%source2, %test2], [%source_bump2, %optional2] + %sink3 = phi i8*[%sink2, %test2], [%sink_bump2, %optional2] + %tagbit3 = and i32 %tag, 4 + %tagbit3eq0 = icmp eq i32 %tagbit3, 0 + br i1 %tagbit3eq0, label %test4, label %optional3 +optional3: + store i8 99, i8* %sink3 + %read3 = load i8, i8* %source3 + %sink_store3 = getelementptr inbounds i8, i8* %sink3, i64 1 + store i8 %read3, i8* %sink_store3 + %sink_bump3 = getelementptr inbounds i8, i8* %sink3, i64 2 + %source_bump3 = getelementptr inbounds i8, i8* %source3, i64 1 + br label %test4 +test4: + %source4 = phi i8*[%source3, %test3], [%source_bump3, %optional3] + %sink4 = phi i8*[%sink3, %test3], [%sink_bump3, %optional3] + %tagbit4 = and i32 %tag, 8 + %tagbit4eq0 = icmp eq i32 %tagbit4, 0 + br i1 %tagbit4eq0, label %exit, label %optional4 +optional4: + store i8 100, i8* %sink4 + %read4 = load i8, i8* %source4 + %sink_store4 = getelementptr inbounds i8, i8* %sink4, i64 1 + store i8 %read4, i8* %sink_store4 + %sink_bump4 = getelementptr inbounds i8, i8* %sink4, i64 2 + %source_bump4 = getelementptr inbounds i8, i8* %source4, i64 1 + br label %exit +exit: + %sourceout = phi i8*[%source4, %test4], [%source_bump4, %optional4] + %sinkout = phi i8*[%sink4, %test4], [%sink_bump4, %optional4] + %sourceret = getelementptr inbounds %struct.ptrpair, %struct.ptrpair* %result, i64 0, i32 0 + %sinkret = getelementptr inbounds %struct.ptrpair, %struct.ptrpair* %result, i64 0, i32 1 + store i8* %sourceout, i8 ** %sourceret + store i8* %sinkout, i8 ** %sinkret + ret void +} Index: test/CodeGen/WebAssembly/cfg-stackify.ll =================================================================== --- test/CodeGen/WebAssembly/cfg-stackify.ll +++ test/CodeGen/WebAssembly/cfg-stackify.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -asm-verbose=false -disable-block-placement -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs | FileCheck -check-prefix=OPT %s +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -tail-dup-placement=0 | FileCheck -check-prefix=OPT %s ; Test the CFG stackifier pass.