Index: lib/CodeGen/BranchFolding.h =================================================================== --- lib/CodeGen/BranchFolding.h +++ lib/CodeGen/BranchFolding.h @@ -20,6 +20,7 @@ class MachineBranchProbabilityInfo; class MachineFunction; class MachineModuleInfo; + class MachineLoopInfo; class RegScavenger; class TargetInstrInfo; class TargetRegisterInfo; @@ -47,10 +48,11 @@ MBFIWrapper &MBFI, const MachineBranchProbabilityInfo &MBPI); - bool OptimizeFunction(MachineFunction &MF, - const TargetInstrInfo *tii, - const TargetRegisterInfo *tri, - MachineModuleInfo *mmi); + bool OptimizeFunction(MachineFunction &MF, const TargetInstrInfo *tii, + const TargetRegisterInfo *tri, MachineModuleInfo *mmi, + MachineLoopInfo *mli = nullptr, + bool AfterPlacement = false); + private: class MergePotentialsElt { unsigned Hash; @@ -108,11 +110,13 @@ }; std::vector SameTails; + bool AfterBlockPlacement; bool EnableTailMerge; bool EnableHoistCommonCode; const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; MachineModuleInfo *MMI; + MachineLoopInfo *MLI; RegScavenger *RS; MBFIWrapper &MBBFreqInfo; Index: lib/CodeGen/BranchFolding.cpp =================================================================== --- lib/CodeGen/BranchFolding.cpp +++ lib/CodeGen/BranchFolding.cpp @@ -27,6 +27,7 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" @@ -136,6 +137,8 @@ // Remove the block. MF->erase(MBB); FuncletMembership.erase(MBB); + if (MLI) + MLI->removeBlock(MBB); } /// OptimizeImpDefsBlock - If a basic block is just a bunch of implicit_def @@ -192,18 +195,22 @@ } /// OptimizeFunction - Perhaps branch folding, tail merging and other -/// CFG optimizations on the given function. +/// CFG optimizations on the given function. Block placement changes the layout +/// and may create new tail merging opportunities. bool BranchFolder::OptimizeFunction(MachineFunction &MF, const TargetInstrInfo *tii, const TargetRegisterInfo *tri, - MachineModuleInfo *mmi) { + MachineModuleInfo *mmi, + MachineLoopInfo *mli, bool AfterPlacement) { if (!tii) return false; TriedMerging.clear(); + AfterBlockPlacement = AfterPlacement; TII = tii; TRI = tri; MMI = mmi; + MLI = mli; RS = nullptr; // Use a RegScavenger to help update liveness when required. @@ -229,7 +236,10 @@ bool MadeChangeThisIteration = true; while (MadeChangeThisIteration) { MadeChangeThisIteration = TailMergeBlocks(MF); - MadeChangeThisIteration |= OptimizeBranches(MF); + // No need to clean up if tail merging does not change anything after the + // block placement. + if (!AfterBlockPlacement || MadeChangeThisIteration) + MadeChangeThisIteration |= OptimizeBranches(MF); if (EnableHoistCommonCode) MadeChangeThisIteration |= HoistCommonCode(MF); MadeChange |= MadeChangeThisIteration; @@ -446,6 +456,11 @@ // Splice the code over. NewMBB->splice(NewMBB->end(), &CurMBB, BBI1, CurMBB.end()); + // NewMBB belongs to the same loop as CurMBB. + if (MLI) + if (MachineLoop *ML = MLI->getLoopFor(&CurMBB)) + ML->addBasicBlockToLoop(NewMBB, MLI->getBase()); + // NewMBB inherits CurMBB's block frequency. MBBFreqInfo.setBlockFreq(NewMBB, MBBFreqInfo.getBlockFreq(&CurMBB)); @@ -929,23 +944,27 @@ if (!EnableTailMerge) return MadeChange; // First find blocks with no successors. - MergePotentials.clear(); - for (MachineBasicBlock &MBB : MF) { - if (MergePotentials.size() == TailMergeThreshold) - break; - if (!TriedMerging.count(&MBB) && MBB.succ_empty()) - MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(MBB), &MBB)); - } + // Block placement does not create new tail merging opportunities for these + // blocks. + if (!AfterBlockPlacement) { + MergePotentials.clear(); + for (MachineBasicBlock &MBB : MF) { + if (MergePotentials.size() == TailMergeThreshold) + break; + if (!TriedMerging.count(&MBB) && MBB.succ_empty()) + MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(MBB), &MBB)); + } - // If this is a large problem, avoid visiting the same basic blocks - // multiple times. - if (MergePotentials.size() == TailMergeThreshold) - for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i) - TriedMerging.insert(MergePotentials[i].getBlock()); + // If this is a large problem, avoid visiting the same basic blocks + // multiple times. + if (MergePotentials.size() == TailMergeThreshold) + for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i) + TriedMerging.insert(MergePotentials[i].getBlock()); - // See if we can do any tail merging on those. - if (MergePotentials.size() >= 2) - MadeChange |= TryTailMergeBlocks(nullptr, nullptr); + // See if we can do any tail merging on those. + if (MergePotentials.size() >= 2) + MadeChange |= TryTailMergeBlocks(nullptr, nullptr); + } // Look at blocks (IBB) with multiple predecessors (PBB). // We change each predecessor to a canonical form, by @@ -992,6 +1011,17 @@ if (PBB->hasEHPadSuccessor()) continue; + // Bail out if the loop header (IBB) is not the top of the loop chain + // after the block placement. Otherwise, the common tail of IBB's + // predecessors may become the loop top if block placement is called again + // and the predecessors may branch to this common tail. + // FIXME: Relaxed this check if the algorithm of finding loop top is + // changed in MBP. + if (AfterBlockPlacement && MLI) + if (MachineLoop *ML = MLI->getLoopFor(IBB)) + if (IBB == ML->getHeader() && ML == MLI->getLoopFor(PBB)) + continue; + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector Cond; if (!TII->AnalyzeBranch(*PBB, TBB, FBB, Cond, true)) { Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -26,6 +26,8 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "BranchFolding.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -116,6 +118,12 @@ cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden); +static cl::opt +BranchFoldPlacement("branch-fold-placement", + cl::desc("Perform branch folding during placement. " + "Reduces code size."), + cl::init(true), cl::Hidden); + namespace { class BlockChain; /// \brief Type for our function-wide basic block -> block chain mapping. @@ -230,10 +238,10 @@ const MachineBranchProbabilityInfo *MBPI; /// \brief A handle to the function-wide block frequency pass. - const MachineBlockFrequencyInfo *MBFI; + MBFIWrapper *MBFI; /// \brief A handle to the loop info. - const MachineLoopInfo *MLI; + MachineLoopInfo *MLI; /// \brief A handle to the target's instruction info. const TargetInstrInfo *TII; @@ -304,7 +312,7 @@ const BlockFilterSet &LoopBlockSet); void rotateLoopWithProfile(BlockChain &LoopChain, MachineLoop &L, const BlockFilterSet &LoopBlockSet); - void buildCFGChains(MachineFunction &F); + bool buildCFGChains(MachineFunction &F); void optimizeBranches(MachineFunction &F); void alignBlocks(MachineFunction &F); @@ -321,6 +329,7 @@ AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -1204,7 +1213,7 @@ }); } -void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { +bool MachineBlockPlacement::buildCFGChains(MachineFunction &F) { // Ensure that every BB in the function has an associated chain to simplify // the assumptions of the remaining algorithm. SmallVector Cond; // For AnalyzeBranch. @@ -1296,6 +1305,7 @@ }); // Splice the blocks into place. + bool isBrUpdated = false; MachineFunction::iterator InsertPos = F.begin(); for (MachineBasicBlock *ChainBB : FunctionChain) { DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain " @@ -1327,7 +1337,7 @@ // NULL; for the 2nd scenario, the FBB, which is expected to be NULL, // is mistakenly pointing to "*BI". // - PrevBB->updateTerminator(); + isBrUpdated |= PrevBB->updateTerminator(); } } @@ -1335,7 +1345,9 @@ Cond.clear(); MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch. if (!TII->AnalyzeBranch(F.back(), TBB, FBB, Cond)) - F.back().updateTerminator(); + isBrUpdated |= F.back().updateTerminator(); + + return isBrUpdated; } void MachineBlockPlacement::optimizeBranches(MachineFunction &F) { @@ -1452,14 +1464,37 @@ return false; MBPI = &getAnalysis(); - MBFI = &getAnalysis(); + MBFI = new MBFIWrapper(getAnalysis()); MLI = &getAnalysis(); TII = F.getSubtarget().getInstrInfo(); TLI = F.getSubtarget().getTargetLowering(); MDT = &getAnalysis(); assert(BlockToChain.empty()); - buildCFGChains(F); + bool LayoutChange = buildCFGChains(F); + + // Changing the layout can create new tail merging opportunities. + TargetPassConfig *PassConfig = &getAnalysis(); + // TailMerge can create jump into if branches that make CFG irreducible for + // HW that requires structurized CFG. + bool EnableTailMerge = !F.getTarget().requiresStructuredCFG() && + PassConfig->getEnableTailMerge() && + BranchFoldPlacement; + // No tail merging opportunities if the block number is less than four. + if (LayoutChange && F.size() > 3 && EnableTailMerge) { + BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI, + *MBPI); + + if (BF.OptimizeFunction(F, TII, F.getSubtarget().getRegisterInfo(), + getAnalysisIfAvailable(), MLI, + /*AfterBlockPlacement=*/true)) { + // Redo the layout if tail merging creates/removes/moves blocks. + BlockToChain.clear(); + ChainAllocator.DestroyAll(); + buildCFGChains(F); + } + } + optimizeBranches(F); alignBlocks(F); @@ -1480,6 +1515,7 @@ } } + delete MBFI; // We always return true as we have no way to track whether the final order // differs from the original order. return true; Index: test/CodeGen/AArch64/tailmerging_in_mbp.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/tailmerging_in_mbp.ll @@ -0,0 +1,63 @@ +;RUN: llc <%s -march=aarch64 | FileCheck %s + +; CHECK-LABEL: test: +; CHECK: .LBB0_7 +; CHECK: b.hi .LBB0_2 +; CHECK-NEXT: b .LBB0_9 +; CHECK-NEXT: .LBB0_8 +; CHECK-NEXT: mov x8, x9 +; CHECK-NEXT: .LBB0_9 +define i64 @test(i64 %n, i64* %a, i64* %b, i64* %c, i64* %d, i64* %e, i64* %f) { +entry: + %cmp28 = icmp sgt i64 %n, 1 + br i1 %cmp28, label %for.body, label %for.end + +for.body: ; preds = %for.body.lr.ph, %if.end + %j = phi i64 [ %n, %entry ], [ %div, %if.end ] + %div = lshr i64 %j, 1 + %a.arrayidx = getelementptr inbounds i64, i64* %a, i64 %div + %a.j = load i64, i64* %a.arrayidx + %b.arrayidx = getelementptr inbounds i64, i64* %b, i64 %div + %b.j = load i64, i64* %b.arrayidx + %cmp.i = icmp slt i64 %a.j, %b.j + br i1 %cmp.i, label %for.end.loopexit, label %cond.false.i + +cond.false.i: ; preds = %for.body + %cmp4.i = icmp sgt i64 %a.j, %b.j + br i1 %cmp4.i, label %if.end, label %cond.false6.i + +cond.false6.i: ; preds = %cond.false.i + %c.arrayidx = getelementptr inbounds i64, i64* %c, i64 %div + %c.j = load i64, i64* %c.arrayidx + %d.arrayidx = getelementptr inbounds i64, i64* %d, i64 %div + %d.j = load i64, i64* %d.arrayidx + %cmp9.i = icmp slt i64 %c.j, %d.j + br i1 %cmp9.i, label %for.end.loopexit, label %cond.false11.i + +cond.false11.i: ; preds = %cond.false6.i + %cmp14.i = icmp sgt i64 %c.j, %d.j + br i1 %cmp14.i, label %if.end, label %cond.false12.i + +cond.false12.i: ; preds = %cond.false11.i + %e.arrayidx = getelementptr inbounds i64, i64* %e, i64 %div + %e.j = load i64, i64* %e.arrayidx + %f.arrayidx = getelementptr inbounds i64, i64* %f, i64 %div + %f.j = load i64, i64* %f.arrayidx + %cmp19.i = icmp sgt i64 %e.j, %f.j + br i1 %cmp19.i, label %if.end, label %for.end.loopexit + +if.end: ; preds = %cond.false12.i, %cond.false11.i, %cond.false.i + %cmp = icmp ugt i64 %j, 3 + br i1 %cmp, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %cond.false12.i, %cond.false6.i, %for.body, %if.end + %j.0.lcssa.ph = phi i64 [ %j, %cond.false12.i ], [ %j, %cond.false6.i ], [ %j, %for.body ], [ %div, %if.end ] + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + %j.0.lcssa = phi i64 [ %n, %entry ], [ %j.0.lcssa.ph, %for.end.loopexit ] + %j.2 = add i64 %j.0.lcssa, %n + %j.3 = mul i64 %j.2, %n + %j.4 = add i64 %j.3, 10 + ret i64 %j.4 +} Index: test/CodeGen/ARM/arm-and-tst-peephole.ll =================================================================== --- test/CodeGen/ARM/arm-and-tst-peephole.ll +++ test/CodeGen/ARM/arm-and-tst-peephole.ll @@ -49,7 +49,7 @@ ; V8-NEXT: beq ; V8-NEXT: %tailrecurse.switch ; V8: cmp -; V8-NEXT: bne +; V8-NEXT: beq ; V8-NEXT: b ; The trailing space in the last line checks that the branch is unconditional switch i32 %and, label %sw.epilog [