Index: include/llvm/CodeGen/TailDuplicator.h =================================================================== --- include/llvm/CodeGen/TailDuplicator.h +++ include/llvm/CodeGen/TailDuplicator.h @@ -51,9 +51,10 @@ bool tailDuplicateBlocks(MachineFunction &MF); static bool isSimpleBB(MachineBasicBlock *TailBB); bool shouldTailDuplicate(const MachineFunction &MF, bool IsSimple, - MachineBasicBlock &TailBB); + MachineBasicBlock &TailBB, bool IgnoreFallthrough); bool tailDuplicateAndUpdate(MachineFunction &MF, bool IsSimple, - MachineBasicBlock *MBB); + MachineBasicBlock *MBB, + MachineBasicBlock *ForcedLayoutPred); private: typedef TargetInstrInfo::RegSubRegPair RegSubRegPair; @@ -79,6 +80,7 @@ SmallVectorImpl &Copies); bool tailDuplicate(MachineFunction &MF, bool IsSimple, MachineBasicBlock *TailBB, + MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl &TDBBs, SmallVectorImpl &Copies); void appendCopies(MachineBasicBlock *MBB, Index: lib/CodeGen/BranchFolding.cpp =================================================================== --- lib/CodeGen/BranchFolding.cpp +++ lib/CodeGen/BranchFolding.cpp @@ -597,7 +597,8 @@ MachineBasicBlock::iterator &I1, MachineBasicBlock::iterator &I2, MachineBasicBlock *SuccBB, MachineBasicBlock *PredBB, - DenseMap &FuncletMembership) { + DenseMap &FuncletMembership, + bool AfterPlacement) { // It is never profitable to tail-merge blocks from two different funclets. if (!FuncletMembership.empty()) { auto Funclet1 = FuncletMembership.find(MBB1); @@ -620,7 +621,8 @@ if (MBB1 == PredBB || MBB2 == PredBB) { MachineBasicBlock::iterator I; unsigned NumTerms = CountTerminators(MBB1 == PredBB ? MBB2 : MBB1, I); - if (CommonTailLen > NumTerms) + // Don't undo tail-duplication during layout. + if (!AfterPlacement && CommonTailLen > NumTerms) return true; } @@ -637,10 +639,17 @@ // count that as an additional common instruction for the following // heuristics. unsigned EffectiveTailLen = CommonTailLen; - if (SuccBB && MBB1 != PredBB && MBB2 != PredBB && - !MBB1->back().isBarrier() && - !MBB2->back().isBarrier()) - ++EffectiveTailLen; + // We can't do this during layout because it undoes tail duplication, despite + // non-overlapping thresholds. What happens is that a block with a + // conditional branch and fallthrough gets tail-duplicated and this heuristic + // gets run with the non-fallthrough successor as SuccBB, and counts an extra + // branch, despite the block having only 2 instructions when it was + // duplicated. + if (!AfterPlacement) + if (SuccBB && MBB1 != PredBB && MBB2 != PredBB && + !MBB1->back().isBarrier() && + !MBB2->back().isBarrier()) + ++EffectiveTailLen; // Check if the common tail is long enough to be worthwhile. if (EffectiveTailLen >= minCommonTailLength) @@ -682,7 +691,8 @@ minCommonTailLength, CommonTailLen, TrialBBI1, TrialBBI2, SuccBB, PredBB, - FuncletMembership)) { + FuncletMembership, + AfterBlockPlacement)) { if (CommonTailLen > maxCommonTailLength) { SameTails.clear(); maxCommonTailLength = CommonTailLen; Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -40,6 +40,7 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/TailDuplicator.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -117,6 +118,12 @@ static cl::opt JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden); +static cl::opt +TailDupPlacement("tail-dup-placement", + cl::desc("Perform tail duplication during placement. " + "Creates more fallthrough opportunites in " + "outline branches."), + cl::init(true), cl::Hidden); static cl::opt BranchFoldPlacement("branch-fold-placement", @@ -258,6 +265,13 @@ /// \brief A handle to the post dominator tree. MachineDominatorTree *MDT; + /// \brief Duplicator used to duplicate tails during placement. + /// + /// Placement decisions can open up new tail duplication opportunities, but + /// since tail duplication affects placement decisions of later blocks, it + /// must be done inline. + TailDuplicator TailDup; + /// \brief A set of blocks that are unavoidably execute, i.e. they dominate /// all terminators of the MachineFunction. SmallPtrSet UnavoidableBlocks; @@ -864,6 +878,20 @@ // after this block. MachineBasicBlock *BestSucc = selectBestSuccessor(BB, Chain, BlockFilter); + // Placing an actual successor may have changed tail duplication + // opportunities. Check for that now. + if (TailDupPlacement && BestSucc) { + DEBUG(dbgs() << "Redoing tail duplication for BestSucc#" + << BestSucc->getNumber() << "\n"); + bool IsSimple = TailDup.isSimpleBB(BestSucc); + bool IgnoreFallthrough = true; + // Simple blocks should just fallthrough, so only worry about non-simple + // ones. + if (!IsSimple && TailDup.shouldTailDuplicate(*F, IsSimple, + *BestSucc, IgnoreFallthrough)) + TailDup.tailDuplicateAndUpdate(*F, IsSimple, BestSucc, BB); + } + // If an immediate successor isn't available, look for the best viable // block among those we've identified as not violating the loop's CFG at // this point. This won't be a fallthrough, but it will increase locality. @@ -1659,6 +1687,10 @@ TII = MF.getSubtarget().getInstrInfo(); TLI = MF.getSubtarget().getTargetLowering(); MDT = &getAnalysis(); + auto MMI = getAnalysisIfAvailable(); + if (TailDupPlacement) + TailDup.initMF(MF, MMI, MBPI); + assert(BlockToChain.empty()); buildCFGChains(); @@ -1680,6 +1712,8 @@ /*AfterBlockPlacement=*/true)) { // Redo the layout if tail merging creates/removes/moves blocks. BlockToChain.clear(); + // Must redo the dominator tree if blocks were changed. + MDT->runOnMachineFunction(MF); ChainAllocator.DestroyAll(); buildCFGChains(); } Index: lib/CodeGen/TailDuplicator.cpp =================================================================== --- lib/CodeGen/TailDuplicator.cpp +++ lib/CodeGen/TailDuplicator.cpp @@ -123,15 +123,16 @@ } /// Tail duplicate the block and cleanup. -bool TailDuplicator::tailDuplicateAndUpdate(MachineFunction &MF, bool IsSimple, - MachineBasicBlock *MBB) { +bool TailDuplicator::tailDuplicateAndUpdate( + MachineFunction &MF, bool IsSimple, MachineBasicBlock *MBB, + MachineBasicBlock *ForcedLayoutPred) { // Save the successors list. SmallSetVector Succs(MBB->succ_begin(), MBB->succ_end()); SmallVector TDBBs; SmallVector Copies; - if (!tailDuplicate(MF, IsSimple, MBB, TDBBs, Copies)) + if (!tailDuplicate(MF, IsSimple, MBB, ForcedLayoutPred, TDBBs, Copies)) return false; ++NumTails; @@ -242,10 +243,11 @@ bool IsSimple = isSimpleBB(MBB); - if (!shouldTailDuplicate(MF, IsSimple, *MBB)) + bool IgnoreFallthrough = false; + if (!shouldTailDuplicate(MF, IsSimple, *MBB, IgnoreFallthrough)) continue; - MadeChange |= tailDuplicateAndUpdate(MF, IsSimple, MBB); + MadeChange |= tailDuplicateAndUpdate(MF, IsSimple, MBB, nullptr); } if (PreRegAlloc && TailDupVerify) @@ -509,9 +511,20 @@ /// Determine if it is profitable to duplicate this block. bool TailDuplicator::shouldTailDuplicate(const MachineFunction &MF, bool IsSimple, - MachineBasicBlock &TailBB) { - // Only duplicate blocks that end with unconditional branches. - if (TailBB.canFallThrough()) + MachineBasicBlock &TailBB, + bool IgnoreFallthrough) { + // IgnoreFallthrough is set when considering duplication during layout. + // Because the ultimate layout may change, it is better to consider + // duplicating blocks that can't fall through. + if (TailBB.canFallThrough() && !IgnoreFallthrough) + return false; + + // If the block to be duplicated ends in an unanalyzable fallthrough, don't + // duplicate it. + MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr; + SmallVector PredCond; + if (TailBB.canFallThrough() && + TII->AnalyzeBranch(TailBB, PredTBB, PredFBB, PredCond, true)) return false; // Don't try to tail-duplicate single-block loops. @@ -725,6 +738,7 @@ /// of its predecessors. bool TailDuplicator::tailDuplicate(MachineFunction &MF, bool IsSimple, MachineBasicBlock *TailBB, + MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl &TDBBs, SmallVectorImpl &Copies) { DEBUG(dbgs() << "\n*** Tail-duplicating BB#" << TailBB->getNumber() << '\n'); @@ -759,7 +773,12 @@ if (!PredCond.empty()) continue; // Don't duplicate into a fall-through predecessor (at least for now). - if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough()) + bool IsLayoutSuccessor = false; + if (ForcedLayoutPred) + IsLayoutSuccessor = (ForcedLayoutPred == PredBB); + else if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough()) + IsLayoutSuccessor = true; + if (IsLayoutSuccessor) continue; DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB @@ -825,13 +844,14 @@ // If TailBB was duplicated into all its predecessors except for the prior // block, which falls through unconditionally, move the contents of this - // block into the prior block. + // block into the prior block. Don't do this when ForcedLayoutPred is + // non-null, as it can break layout to remove blocks. MachineBasicBlock *PrevBB = &*std::prev(TailBB->getIterator()); MachineBasicBlock *PriorTBB = nullptr, *PriorFBB = nullptr; SmallVector PriorCond; // This has to check PrevBB->succ_size() because EH edges are ignored by // AnalyzeBranch. - if (PrevBB->succ_size() == 1 && + if (ForcedLayoutPred == nullptr && PrevBB->succ_size() == 1 && !TII->AnalyzeBranch(*PrevBB, PriorTBB, PriorFBB, PriorCond, true) && PriorCond.empty() && !PriorTBB && TailBB->pred_size() == 1 && !TailBB->hasAddressTaken()) { Index: test/CodeGen/AArch64/machine_cse.ll =================================================================== --- test/CodeGen/AArch64/machine_cse.ll +++ test/CodeGen/AArch64/machine_cse.ll @@ -1,4 +1,8 @@ -; RUN: llc < %s -mtriple=aarch64-linux-gnuabi -O2 | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-linux-gnuabi -O2 -tail-dup-placement=0 | FileCheck %s +; -tail-dup-placement causes tail duplication during layout. This breaks the +; assumptions of the test case as written (specifically, it creates an +; additional cmp instruction, creating a false positive), so we pass +; -tail-dup-placement=0 to restore the original behavior ; marked as external to prevent possible optimizations @a = external global i32 Index: test/CodeGen/PowerPC/branch-opt.ll =================================================================== --- test/CodeGen/PowerPC/branch-opt.ll +++ test/CodeGen/PowerPC/branch-opt.ll @@ -1,9 +1,14 @@ -; RUN: llc < %s -march=ppc32 | \ -; RUN: grep "b LBB.*" | count 4 +; RUN: llc < %s -march=ppc32 | FileCheck %s target datalayout = "E-p:32:32" target triple = "powerpc-apple-darwin8.7.0" +;CHECK-LABEL: foo: +;CHECK: b LBB0_14 +;CHECK: b LBB0_14 +;CHECK-NOT: b LBB0_14 +;CHECK: LBB0_14: ; %cond_next48 + define void @foo(i32 %W, i32 %X, i32 %Y, i32 %Z) { entry: %tmp1 = and i32 %W, 1 ; [#uses=1] Index: test/CodeGen/PowerPC/tail-dup-analyzable-fallthrough.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/tail-dup-analyzable-fallthrough.ll @@ -0,0 +1,75 @@ +; RUN: llc -O2 < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le--linux-gnu" + +%struct.HashBucket = type { %struct.HashBucket*, i64, i64* } +; This test is checking for a bug where blocks with un-analyzable fallthrough +; were getting tail-duplicated, leading to incorrect code. The loop-entry check +; gets transformed to have a conditional exit, which is currently un-analyzable. +; This block should not be tail-duplicated. This is from an actual hash table +; that was miscompiling. +; CHECK-LABEL: findKey + +; Checking for urem from entry. +; CHECK: divdu +; CHECK: mulld +; CHECK: sub + +; Don't duplicate the entry block +; CHECK: b [[LOOPENTRY:[._0-9A-Za-z]+]] + +; The Loop top does a pointer chase. +; CHECK: [[LOOPTOP:[._0-9A-Za-z]+]]: +; CHECK-NOT: {{{[._0-9A-Za-z]+}}}: +; CHECK: ld [[PTRREG:[0-9]+]], 0([[PTRREG]]) + +; The loop entry compares the pointer to 0 before the chase and early exits if +; it is null. +; CHECK-NEXT: [[LOOPENTRY]]: +; CHECK-NOT: {{{[._0-9A-Za-z]+}}}: +; CHECK: cmpldi [[PTRREG]], 0 +; CHECK-NEXT: beqlr + +; Function Attrs: norecurse nounwind readonly +define i64* @findKey(%struct.HashBucket** nocapture readonly %Buckets, i64 %BucketSize, i64 %Key) #0 { +entry: + %mul.i = mul i64 %Key, 237 + %shr.i = lshr i64 %mul.i, 16 + %xor.i = xor i64 %shr.i, %Key + %rem = urem i64 %xor.i, %BucketSize + %arrayidx = getelementptr inbounds %struct.HashBucket*, %struct.HashBucket** %Buckets, i64 %rem + %current.012 = load %struct.HashBucket*, %struct.HashBucket** %arrayidx, align 8 + %tobool13 = icmp eq %struct.HashBucket* %current.012, null + br i1 %tobool13, label %cond.end, label %land.rhs.preheader + +land.rhs.preheader: ; preds = %entry + br label %land.rhs + +land.rhs: ; preds = %land.rhs.preheader, %for.inc + %current.014 = phi %struct.HashBucket* [ %current.0, %for.inc ], [ %current.012, %land.rhs.preheader ] + %Key1 = getelementptr inbounds %struct.HashBucket, %struct.HashBucket* %current.014, i64 0, i32 1 + %0 = load i64, i64* %Key1, align 8 + %lnot = icmp eq i64 %0, %Key + br i1 %lnot, label %cond.true, label %for.inc + +for.inc: ; preds = %land.rhs + %Next = getelementptr inbounds %struct.HashBucket, %struct.HashBucket* %current.014, i64 0, i32 0 + %current.0 = load %struct.HashBucket*, %struct.HashBucket** %Next, align 8 + %tobool = icmp eq %struct.HashBucket* %current.0, null + br i1 %tobool, label %cond.end.loopexit, label %land.rhs + +cond.true: ; preds = %land.rhs + %current.014.lcssa = phi %struct.HashBucket* [ %current.014, %land.rhs ] + %Value = getelementptr inbounds %struct.HashBucket, %struct.HashBucket* %current.014.lcssa, i64 0, i32 2 + %1 = load i64*, i64** %Value, align 8 + br label %cond.end + +cond.end.loopexit: ; preds = %for.inc + br label %cond.end + +cond.end: ; preds = %cond.end.loopexit, %entry, %cond.true + %cond = phi i64* [ %1, %cond.true ], [ null, %entry ], [ null, %cond.end.loopexit ] + ret i64* %cond +} + +attributes #0 = { norecurse nounwind readonly } Index: test/CodeGen/PowerPC/tail-dup-layout.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/tail-dup-layout.ll @@ -0,0 +1,117 @@ +; RUN: llc -outline-optional-branches -O2 < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-grtev4-linux-gnu" + +%struct.ptrpair = type { i8*, i8* } +; Intended layout: +; The outlining flag produces the layout +; test1 +; test2 +; test3 +; test4 +; exit +; optional1 +; optional2 +; optional3 +; optional4 +; Tail duplication puts test n+1 at the end of optional n +; so optional1 includes a copy of test2 at the end, and branches +; to test3 (at the top) or falls through to optional 2. +; The CHECK statements check for the whole string of tests and exit block, +; and then check that the correct test has been duplicated into the end of +; the optional blocks and that the optional blocks are in the correct order. +;CHECK-LABEL: f: +;CHECK: # %test1 +;CHECK-NEXT: andi. {{[0-9]+}}, 4, 1 +;CHECK-NEXT: bc 12, 1, [[OPT1LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[TEST2LABEL:[._0-9A-Za-z]+]]: # %test2 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, 4, 0, 30, 30 +;CHECK-NEXT: bne 0, [[OPT2LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, 4, 0, 29, 29 +;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %test4 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, 4, 0, 28, 28 +;CHECK-NEXT: bne 0, .[[OPT4LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit +;CHECK-NEXT: std 5, 0(3) +;CHECK-NEXT: std 6, 8(3) +;CHECK-NEXT: blr +;CHECK-NEXT: [[OPT1LABEL]] +;CHECK: rlwinm. {{[0-9]+}}, 4, 0, 30, 30 +;CHECK-NEXT: beq 0, [[TEST3LABEL]] +;CHECK-NEXT: [[OPT2LABEL]] +;CHECK: rlwinm. {{[0-9]+}}, 4, 0, 29, 29 +;CHECK-NEXT: beq 0, [[TEST4LABEL]] +;CHECK-NEXT: [[OPT3LABEL]] +;CHECK: rlwinm. {{[0-9]+}}, 4, 0, 28, 28 +;CHECK-NEXT: beq 0, [[EXITLABEL]] +;CHECK-NEXT: [[OPT4LABEL]] +;CHECK: b [[EXITLABEL]] + +define void @f(%struct.ptrpair* noalias nocapture sret %result, i32 %tag, i8* %source1, i8* %sink1) { +entry: + br label %test1 +test1: + %tagbit1 = and i32 %tag, 1 + %tagbit1eq0 = icmp eq i32 %tagbit1, 0 + br i1 %tagbit1eq0, label %test2, label %optional1 +optional1: + store i8 97, i8* %sink1 + %read1 = load i8, i8* %source1 + %sink_store1 = getelementptr inbounds i8, i8* %sink1, i64 1 + store i8 %read1, i8* %sink_store1 + %sink_bump1 = getelementptr inbounds i8, i8* %sink1, i64 2 + %source_bump1 = getelementptr inbounds i8, i8* %source1, i64 1 + br label %test2 +test2: + %source2 = phi i8*[%source1, %test1], [%source_bump1, %optional1] + %sink2 = phi i8*[%sink1, %test1], [%sink_bump1, %optional1] + %tagbit2 = and i32 %tag, 2 + %tagbit2eq0 = icmp eq i32 %tagbit2, 0 + br i1 %tagbit2eq0, label %test3, label %optional2 +optional2: + store i8 98, i8* %sink2 + %read2 = load i8, i8* %source2 + %sink_store2 = getelementptr inbounds i8, i8* %sink2, i64 1 + store i8 %read2, i8* %sink_store2 + %sink_bump2 = getelementptr inbounds i8, i8* %sink2, i64 2 + %source_bump2 = getelementptr inbounds i8, i8* %source2, i64 1 + br label %test3 +test3: + %source3 = phi i8*[%source2, %test2], [%source_bump2, %optional2] + %sink3 = phi i8*[%sink2, %test2], [%sink_bump2, %optional2] + %tagbit3 = and i32 %tag, 4 + %tagbit3eq0 = icmp eq i32 %tagbit3, 0 + br i1 %tagbit3eq0, label %test4, label %optional3 +optional3: + store i8 99, i8* %sink3 + %read3 = load i8, i8* %source3 + %sink_store3 = getelementptr inbounds i8, i8* %sink3, i64 1 + store i8 %read3, i8* %sink_store3 + %sink_bump3 = getelementptr inbounds i8, i8* %sink3, i64 2 + %source_bump3 = getelementptr inbounds i8, i8* %source3, i64 1 + br label %test4 +test4: + %source4 = phi i8*[%source3, %test3], [%source_bump3, %optional3] + %sink4 = phi i8*[%sink3, %test3], [%sink_bump3, %optional3] + %tagbit4 = and i32 %tag, 8 + %tagbit4eq0 = icmp eq i32 %tagbit4, 0 + br i1 %tagbit4eq0, label %exit, label %optional4 +optional4: + store i8 100, i8* %sink4 + %read4 = load i8, i8* %source4 + %sink_store4 = getelementptr inbounds i8, i8* %sink4, i64 1 + store i8 %read4, i8* %sink_store4 + %sink_bump4 = getelementptr inbounds i8, i8* %sink4, i64 2 + %source_bump4 = getelementptr inbounds i8, i8* %source4, i64 1 + br label %exit +exit: + %sourceout = phi i8*[%source4, %test4], [%source_bump4, %optional4] + %sinkout = phi i8*[%sink4, %test4], [%sink_bump4, %optional4] + %sourceret = getelementptr inbounds %struct.ptrpair, %struct.ptrpair* %result, i64 0, i32 0 + %sinkret = getelementptr inbounds %struct.ptrpair, %struct.ptrpair* %result, i64 0, i32 1 + store i8* %sourceout, i8 ** %sourceret + store i8* %sinkout, i8 ** %sinkret + ret void +} Index: test/CodeGen/WebAssembly/cfg-stackify.ll =================================================================== --- test/CodeGen/WebAssembly/cfg-stackify.ll +++ test/CodeGen/WebAssembly/cfg-stackify.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false | FileCheck %s -; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -verify-machineinstrs -fast-isel=false | FileCheck -check-prefix=OPT %s +; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0 -verify-machineinstrs -fast-isel=false | FileCheck -check-prefix=OPT %s ; Test the CFG stackifier pass.