Index: lib/CodeGen/CodeGenPrepare.cpp =================================================================== --- lib/CodeGen/CodeGenPrepare.cpp +++ lib/CodeGen/CodeGenPrepare.cpp @@ -17,6 +17,8 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -119,6 +121,17 @@ "disable-preheader-prot", cl::Hidden, cl::init(false), cl::desc("Disable protection against removing loop preheaders")); +static cl::opt FreqRatioToSkipMerge( + "cgp-freq-ratio-to-skip-merge", cl::Hidden, cl::init(1000), + cl::desc("Skip merging empty blocks if (frequency of empty block) / " + "(frequency of destination block) is greater than this ratio")); + +static cl::opt + MinNumPhiInDestToSkipMerge("cgp-min-num-phi-skip-merge", cl::Hidden, + cl::init(1), + cl::desc("Min number of PHIs in destination " + "block to skip merging empty blocks")); + namespace { typedef SmallPtrSet SetOfInstrs; typedef PointerIntPair TypeIsSExt; @@ -175,9 +188,12 @@ private: bool eliminateFallThrough(Function &F); - bool eliminateMostlyEmptyBlocks(Function &F); + bool eliminateMostlyEmptyBlocks(Function &F, BlockFrequencyInfo &BFI); bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const; void eliminateMostlyEmptyBlock(BasicBlock *BB); + bool isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB, + bool isPreheader, + BlockFrequencyInfo &BFI); bool optimizeBlock(BasicBlock &BB, bool& ModifiedDT); bool optimizeInst(Instruction *I, bool& ModifiedDT); bool optimizeMemoryInst(Instruction *I, Value *Addr, @@ -231,6 +247,9 @@ LI = &getAnalysis().getLoopInfo(); OptSize = F.optForSize(); + BranchProbabilityInfo BPI(F, *LI); + BlockFrequencyInfo BFI(F, BPI, *LI); + /// This optimization identifies DIV instructions that can be /// profitably bypassed and carried out with a shorter, faster divide. if (!OptSize && TLI && TLI->isSlowDivBypassed()) { @@ -248,7 +267,7 @@ // Eliminate blocks that contain only PHI nodes and an // unconditional branch. - EverMadeChange |= eliminateMostlyEmptyBlocks(F); + EverMadeChange |= eliminateMostlyEmptyBlocks(F, BFI); // llvm.dbg.value is far away from the value then iSel may not be able // handle it properly. iSel will drop llvm.dbg.value if it can not @@ -369,7 +388,8 @@ /// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split /// edges in ways that are non-optimal for isel. Start by eliminating these /// blocks so we can split them the way we want them. -bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) { +bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F, + BlockFrequencyInfo &BFI) { SmallPtrSet Preheaders; SmallVector LoopList(LI->begin(), LI->end()); while (!LoopList.empty()) { @@ -408,23 +428,68 @@ if (DestBB == BB) continue; - if (!canMergeBlocks(BB, DestBB)) + if (!canMergeBlocks(BB, DestBB) || + !isMergingEmptyBlockProfitable(BB, DestBB, Preheaders.count(BB), BFI)) continue; - // Do not delete loop preheaders if doing so would create a critical edge. - // Loop preheaders can be good locations to spill registers. If the - // preheader is deleted and we create a critical edge, registers may be - // spilled in the loop body instead. - if (!DisablePreheaderProtect && Preheaders.count(BB) && - !(BB->getSinglePredecessor() && BB->getSinglePredecessor()->getSingleSuccessor())) - continue; - eliminateMostlyEmptyBlock(BB); MadeChange = true; } return MadeChange; } +bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB, + BasicBlock *DestBB, + bool isPreheader, + BlockFrequencyInfo &BFI) { + // Do not delete loop preheaders if doing so would create a critical edge. + // Loop preheaders can be good locations to spill registers. If the + // preheader is deleted and we create a critical edge, registers may be + // spilled in the loop body instead. + if (!DisablePreheaderProtect && isPreheader && + !(BB->getSinglePredecessor() && + BB->getSinglePredecessor()->getSingleSuccessor())) + return false; + + // Try to skip merging if the unique predecessor of BB is terminated by a + // switch or indirect branch instruction, and BB is used as an incoming block + // of PHIs in DestBB. In such case, merging BB and DestBB would cause ISel to + // add COPY instructions in the header of switch. Note that the critical edge + // created by merging such blocks wont be split in MachineSink because the + // jump table is not analyzable. By keeping such empty block (BB), ISel will + // place COPY instructions in BB, not in the predecessor of BB. + BasicBlock *Pred = BB->getUniquePredecessor(); + if (Pred && BB->getTerminator() == BB->getFirstNonPHI() && + (isa(Pred->getTerminator()) || + isa(Pred->getTerminator()))) { + // Since a branch instruction could be added in the empty case block by + // skipping merging it, we skip merging the empty case only when the + // frequency of empty case block is significantly lower than the + // frequency of header of switch. + const BlockFrequency PredFreq = BFI.getBlockFreq(Pred); + const BlockFrequency BBFreq = BFI.getBlockFreq(BB); + if (!OptSize && BBFreq.getFrequency() > 0 && + PredFreq.getFrequency() / BBFreq.getFrequency() > + FreqRatioToSkipMerge) { + unsigned CntIncomingBB = MinNumPhiInDestToSkipMerge; + BasicBlock::const_iterator BBI = DestBB->begin(); + while (const PHINode *PN = dyn_cast(BBI++)) { + if (PN->getBasicBlockIndex(BB) >= 0) { + CntIncomingBB--; + // FIXME: Since the current default value of FreqRatioToSkipMerge + // (1000) is conservative enough, we do not allow creating critical + // edges even for a single PHINode (i.g., the current default value of + // MinNumPhiInDestToSkipMerge is 1). We can tune these flags based on + // extensive performance experiments. + if (CntIncomingBB == 0) + return false; + } + } + } + } + return true; +} + /// Return true if we can merge BB into DestBB if there is a single /// unconditional branch between them, and BB contains no other non-phi /// instructions. Index: test/Transforms/CodeGenPrepare/skip-merging-case-block.ll =================================================================== --- /dev/null +++ test/Transforms/CodeGenPrepare/skip-merging-case-block.ll @@ -0,0 +1,56 @@ +; RUN: opt -codegenprepare < %s -mtriple=aarch64-none-linux-gnu -S | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +define i32 @f_switch(i32 %c) { +; CHECK-LABEL: @f_switch +; CHECK-LABEL: entry: +; CHECK: i32 10, label %sw.bb +entry: + switch i32 %c, label %sw.default [ + i32 10, label %sw.bb + i32 20, label %sw.bb2 + i32 30, label %sw.bb3 + i32 40, label %sw.bb4 + ], !prof !0 + +sw.bb: ; preds = %entry + br label %sw.epilog + +sw.bb2: ; preds = %entry + call void bitcast (void (...)* @callcase2 to void ()*)() + br label %sw.epilog + +sw.bb3: ; preds = %entry + call void bitcast (void (...)* @callcase3 to void ()*)() + br label %sw.epilog + +sw.bb4: ; preds = %entry + call void bitcast (void (...)* @callcase4 to void ()*)() + br label %sw.epilog + +sw.default: ; preds = %entry + call void bitcast (void (...)* @calldefault to void ()*)() + br label %sw.epilog + +; CHECK-LABEL: sw.epilog: +; CHECK: %fp.0 = phi void (...)* [ @FD, %sw.default ], [ @F4, %sw.bb4 ], [ @F3, %sw.bb3 ], [ @F2, %sw.bb2 ], [ @F1, %sw.bb ] +sw.epilog: ; preds = %sw.default, %sw.bb3, %sw.bb2, %sw.bb + %fp.0 = phi void (...)* [ @FD, %sw.default ], [ @F4, %sw.bb4 ], [ @F3, %sw.bb3 ], [ @F2, %sw.bb2 ], [ @F1, %sw.bb ] + %callee.knr.cast = bitcast void (...)* %fp.0 to void ()* + call void %callee.knr.cast() #3 + ret i32 0 +} + +declare void @F1(...) local_unnamed_addr +declare void @F2(...) local_unnamed_addr +declare void @F3(...) local_unnamed_addr +declare void @F4(...) local_unnamed_addr +declare void @FD(...) local_unnamed_addr +declare void @callcase2(...) local_unnamed_addr +declare void @callcase3(...) local_unnamed_addr +declare void @callcase4(...) local_unnamed_addr +declare void @calldefault(...) local_unnamed_addr + +!0 = !{!"branch_weights", i32 500, i32 1, i32 500,i32 500, i32 500}