Index: lib/CodeGen/CodeGenPrepare.cpp =================================================================== --- lib/CodeGen/CodeGenPrepare.cpp +++ lib/CodeGen/CodeGenPrepare.cpp @@ -17,6 +17,8 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -118,6 +120,11 @@ "disable-preheader-prot", cl::Hidden, cl::init(false), cl::desc("Disable protection against removing loop preheaders")); +static cl::opt SwitchToCaseFreqRatio( + "cgp-switch-to-case-ratio", cl::Hidden, cl::init(1000), + cl::desc("Skip merging empty case blocks if (frequency of switch header) / " + "(frequency of case block) is greater than this ratio")); + namespace { typedef SmallPtrSet SetOfInstrs; typedef PointerIntPair TypeIsSExt; @@ -174,7 +181,7 @@ private: bool eliminateFallThrough(Function &F); - bool eliminateMostlyEmptyBlocks(Function &F); + bool eliminateMostlyEmptyBlocks(Function &F, BlockFrequencyInfo &BFI); bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const; void eliminateMostlyEmptyBlock(BasicBlock *BB); bool optimizeBlock(BasicBlock &BB, bool& ModifiedDT); @@ -230,6 +237,9 @@ LI = &getAnalysis().getLoopInfo(); OptSize = F.optForSize(); + BranchProbabilityInfo BPI(F, *LI); + BlockFrequencyInfo BFI(F, BPI, *LI); + /// This optimization identifies DIV instructions that can be /// profitably bypassed and carried out with a shorter, faster divide. if (!OptSize && TLI && TLI->isSlowDivBypassed()) { @@ -247,7 +257,7 @@ // Eliminate blocks that contain only PHI nodes and an // unconditional branch. - EverMadeChange |= eliminateMostlyEmptyBlocks(F); + EverMadeChange |= eliminateMostlyEmptyBlocks(F, BFI); // llvm.dbg.value is far away from the value then iSel may not be able // handle it properly. iSel will drop llvm.dbg.value if it can not @@ -368,7 +378,8 @@ /// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split /// edges in ways that are non-optimal for isel. Start by eliminating these /// blocks so we can split them the way we want them. -bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) { +bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F, + BlockFrequencyInfo &BFI) { SmallPtrSet Preheaders; SmallVector LoopList(LI->begin(), LI->end()); while (!LoopList.empty()) { @@ -410,6 +421,40 @@ if (!canMergeBlocks(BB, DestBB)) continue; + // Try to skip merging if the unique predecessor of BB is terminated by a + // switch instruction and BB is used as an incoming block of a PHI in + // DestBB. In such case, merging BB and DestBB would cause ISel to add COPY + // instructions in the header of switch. This could potentially increase + // dynamic instructions, especially when the switch is in a loop. By keeping + // the empty block (BB), ISel will place COPY instructions in DestBB, not in + // BB. + BasicBlock *Pred = BB->getUniquePredecessor(); + if (Pred && isa(Pred->getTerminator()) && + BI == BB->getFirstNonPHI()) { + bool IsIncomingBlock = false; + BasicBlock::const_iterator BBI = DestBB->begin(); + + while (const PHINode *PN = dyn_cast(BBI++)) { + if (PN->getBasicBlockIndex(BB) >= 0) { + IsIncomingBlock = true; + break; + } + } + + if (!OptSize && IsIncomingBlock) { + // Since a branch instruction could be added in the empty case block by + // skipping merging it, we skip merging the empty case only when the + // frequency of empty case block is significantly lower than the + // frequency of header of switch. + const BlockFrequency PredFreq = BFI.getBlockFreq(Pred); + const BlockFrequency BBFreq = BFI.getBlockFreq(BB); + if (BBFreq.getFrequency() > 0 && + PredFreq.getFrequency() / BBFreq.getFrequency() > + SwitchToCaseFreqRatio) + continue; + } + } + // Do not delete loop preheaders if doing so would create a critical edge. // Loop preheaders can be good locations to spill registers. If the // preheader is deleted and we create a critical edge, registers may be Index: test/Transforms/CodeGenPrepare/skip-merging-case-block.ll =================================================================== --- /dev/null +++ test/Transforms/CodeGenPrepare/skip-merging-case-block.ll @@ -0,0 +1,48 @@ +; RUN: opt -codegenprepare < %s -mtriple=aarch64-none-linux-gnu -S | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +define i32 @foo(i32 %c) { +; CHECK-LABEL: entry: +; CHECK: i32 10, label %sw.bb +entry: + switch i32 %c, label %sw.default [ + i32 10, label %sw.bb + i32 20, label %sw.bb2 + i32 30, label %sw.bb3 + ], !prof !0 + +sw.bb: ; preds = %entry + br label %sw.epilog + +sw.bb2: ; preds = %entry + call void bitcast (void (...)* @callcase2 to void ()*)() + br label %sw.epilog + +sw.bb3: ; preds = %entry + call void bitcast (void (...)* @callcase3 to void ()*)() + br label %sw.epilog + +sw.default: ; preds = %entry + call void bitcast (void (...)* @calldefault to void ()*)() + br label %sw.epilog + +; CHECK-LABEL: sw.epilog: +; CHECK: %fp.0 = phi void (...)* [ @F4, %sw.default ], [ @F3, %sw.bb3 ], [ @F2, %sw.bb2 ], [ @F1, %sw.bb ] +sw.epilog: ; preds = %sw.default, %sw.bb3, %sw.bb2, %sw.bb + %fp.0 = phi void (...)* [ @F4, %sw.default ], [ @F3, %sw.bb3 ], [ @F2, %sw.bb2 ], [ @F1, %sw.bb ] + %callee.knr.cast = bitcast void (...)* %fp.0 to void ()* + call void %callee.knr.cast() #3 + ret i32 0 +} + +declare void @F1(...) local_unnamed_addr +declare void @F2(...) local_unnamed_addr +declare void @F3(...) local_unnamed_addr +declare void @F4(...) local_unnamed_addr +declare void @callcase2(...) local_unnamed_addr +declare void @callcase3(...) local_unnamed_addr +declare void @calldefault(...) local_unnamed_addr + +!0 = !{!"branch_weights", i32 500, i32 1, i32 500,i32 500}