Index: lib/CodeGen/CodeGenPrepare.cpp =================================================================== --- lib/CodeGen/CodeGenPrepare.cpp +++ lib/CodeGen/CodeGenPrepare.cpp @@ -17,6 +17,8 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -119,6 +121,11 @@ "disable-preheader-prot", cl::Hidden, cl::init(false), cl::desc("Disable protection against removing loop preheaders")); +static cl::opt FreqRatioToSkipMerge( + "cgp-freq-ratio-to-skip-merge", cl::Hidden, cl::init(2), + cl::desc("Skip merging empty blocks if (frequency of empty block) / " + "(frequency of destination block) is greater than this ratio")); + namespace { typedef SmallPtrSet SetOfInstrs; typedef PointerIntPair TypeIsSExt; @@ -175,9 +182,13 @@ private: bool eliminateFallThrough(Function &F); - bool eliminateMostlyEmptyBlocks(Function &F); + bool eliminateMostlyEmptyBlocks(Function &F, BlockFrequencyInfo &BFI); + BasicBlock *findDestBlockIfMergeableEmptyBlock(BasicBlock *BB); bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const; void eliminateMostlyEmptyBlock(BasicBlock *BB); + bool isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB, + bool isPreheader, + BlockFrequencyInfo &BFI); bool optimizeBlock(BasicBlock &BB, bool& ModifiedDT); bool optimizeInst(Instruction *I, bool& ModifiedDT); bool optimizeMemoryInst(Instruction *I, Value *Addr, @@ -231,6 +242,9 @@ LI = &getAnalysis().getLoopInfo(); OptSize = F.optForSize(); + BranchProbabilityInfo BPI(F, *LI); + BlockFrequencyInfo BFI(F, BPI, *LI); + /// This optimization identifies DIV instructions that can be /// profitably bypassed and carried out with a shorter, faster divide. if (!OptSize && TLI && TLI->isSlowDivBypassed()) { @@ -248,7 +262,7 @@ // Eliminate blocks that contain only PHI nodes and an // unconditional branch. - EverMadeChange |= eliminateMostlyEmptyBlocks(F); + EverMadeChange |= eliminateMostlyEmptyBlocks(F, BFI); // llvm.dbg.value is far away from the value then iSel may not be able // handle it properly. iSel will drop llvm.dbg.value if it can not @@ -365,11 +379,44 @@ return Changed; } +/// Find a destination block from BB if BB is mergeable empty block. +BasicBlock *CodeGenPrepare::findDestBlockIfMergeableEmptyBlock(BasicBlock *BB) { + // If this block doesn't end with an uncond branch, ignore it. + BranchInst *BI = dyn_cast(BB->getTerminator()); + if (!BI || !BI->isUnconditional()) + return nullptr; + + // If the instruction before the branch (skipping debug info) isn't a phi + // node, then other stuff is happening here. + BasicBlock::iterator BBI = BI->getIterator(); + if (BBI != BB->begin()) { + --BBI; + while (isa(BBI)) { + if (BBI == BB->begin()) + break; + --BBI; + } + if (!isa(BBI) && !isa(BBI)) + return nullptr; + } + + // Do not break infinite loops. + BasicBlock *DestBB = BI->getSuccessor(0); + if (DestBB == BB) + return nullptr; + + if (!canMergeBlocks(BB, DestBB)) + DestBB = nullptr; + + return DestBB; +} + /// Eliminate blocks that contain only PHI nodes, debug info directives, and an /// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split /// edges in ways that are non-optimal for isel. Start by eliminating these /// blocks so we can split them the way we want them. -bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) { +bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F, + BlockFrequencyInfo &BFI) { SmallPtrSet Preheaders; SmallVector LoopList(LI->begin(), LI->end()); while (!LoopList.empty()) { @@ -383,46 +430,97 @@ // Note that this intentionally skips the entry block. for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) { BasicBlock *BB = &*I++; - - // If this block doesn't end with an uncond branch, ignore it. - BranchInst *BI = dyn_cast(BB->getTerminator()); - if (!BI || !BI->isUnconditional()) + BasicBlock *DestBB = findDestBlockIfMergeableEmptyBlock(BB); + if (!DestBB || + !isMergingEmptyBlockProfitable(BB, DestBB, Preheaders.count(BB), BFI)) continue; - // If the instruction before the branch (skipping debug info) isn't a phi - // node, then other stuff is happening here. - BasicBlock::iterator BBI = BI->getIterator(); - if (BBI != BB->begin()) { - --BBI; - while (isa(BBI)) { - if (BBI == BB->begin()) + eliminateMostlyEmptyBlock(BB); + MadeChange = true; + } + return MadeChange; +} + +bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB, + BasicBlock *DestBB, + bool isPreheader, + BlockFrequencyInfo &BFI) { + // Do not delete loop preheaders if doing so would create a critical edge. + // Loop preheaders can be good locations to spill registers. If the + // preheader is deleted and we create a critical edge, registers may be + // spilled in the loop body instead. + if (!DisablePreheaderProtect && isPreheader && + !(BB->getSinglePredecessor() && + BB->getSinglePredecessor()->getSingleSuccessor())) + return false; + + // Try to skip merging if the unique predecessor of BB is terminated by a + // switch or indirect branch instruction, and BB is used as an incoming block + // of PHIs in DestBB. In such case, merging BB and DestBB would cause ISel to + // add COPY instructions in the predecessor of BB instead of BB (if it is not + // merged). Note that the critical edge created by merging such blocks wont be + // split in MachineSink because the jump table is not analyzable. By keeping + // such empty block (BB), ISel will place COPY instructions in BB, not in the + // predecessor of BB. + BasicBlock *Pred = BB->getUniquePredecessor(); + if (Pred && BB->getTerminator() == BB->getFirstNonPHI() && + (isa(Pred->getTerminator()) || + isa(Pred->getTerminator()))) { + // We use a simple cost heuristic which determine skipping merging is + // profitable if the cost of skipping merging is less than the cost of + // merging : Cost(skipping merging) < Cost(merging BB), where the + // Cost(skipping merging) is Freq(BB) * (Cost(Copy) + Cost(Branch)), and + // the Cost(merging BB) is Freq(Pred) * Cost(Copy). + // Assuming Cost(Copy) == Cost(Branch), we could simplify it to : + // Freq(Pred) / Freq(BB) > 2. + // Note that if there are multiple empty blocks sharing the same incoming + // value for the PHIs in the DestBB, we consider them together. In such + // case, Cost(merging BB) will be the sum of their frequencies. + + if (!isa(DestBB->begin())) + return true; + + BlockFrequency PredFreq = BFI.getBlockFreq(Pred); + BlockFrequency BBFreq = BFI.getBlockFreq(BB); + SmallPtrSet SameIncomingValueBBs; + + // Find other incoming blocks from which all the incoming values are same as + // the one from BB. + for (pred_iterator PI = pred_begin(DestBB), E = pred_end(DestBB); PI != E; + ++PI) { + BasicBlock *DestBBPred = *PI; + if (DestBBPred == BB) + continue; + + bool HasAllSameValue = true; + BasicBlock::const_iterator DestBBI = DestBB->begin(); + while (const PHINode *DestPN = dyn_cast(DestBBI++)) { + if (DestPN->getIncomingValueForBlock(BB) != + DestPN->getIncomingValueForBlock(DestBBPred)) { + HasAllSameValue = false; break; - --BBI; + } } - if (!isa(BBI) && !isa(BBI)) - continue; + if (HasAllSameValue) + SameIncomingValueBBs.insert(DestBBPred); } - // Do not break infinite loops. - BasicBlock *DestBB = BI->getSuccessor(0); - if (DestBB == BB) - continue; - - if (!canMergeBlocks(BB, DestBB)) - continue; + // See if all BB's incoming values are same as the value from Pred. In this + // case, no reason to skip merging because COPYs are expected to be place in + // Pred already. + if (SameIncomingValueBBs.count(Pred)) + return true; - // Do not delete loop preheaders if doing so would create a critical edge. - // Loop preheaders can be good locations to spill registers. If the - // preheader is deleted and we create a critical edge, registers may be - // spilled in the loop body instead. - if (!DisablePreheaderProtect && Preheaders.count(BB) && - !(BB->getSinglePredecessor() && BB->getSinglePredecessor()->getSingleSuccessor())) - continue; + for (auto SameValueBB : SameIncomingValueBBs) + if (SameValueBB->getUniquePredecessor() == Pred && + DestBB == findDestBlockIfMergeableEmptyBlock(SameValueBB)) + BBFreq += BFI.getBlockFreq(SameValueBB); - eliminateMostlyEmptyBlock(BB); - MadeChange = true; + if (PredFreq.getFrequency() > + (BBFreq.getFrequency() * FreqRatioToSkipMerge)) + return false; } - return MadeChange; + return true; } /// Return true if we can merge BB into DestBB if there is a single Index: test/Transforms/CodeGenPrepare/skip-merging-case-block.ll =================================================================== --- /dev/null +++ test/Transforms/CodeGenPrepare/skip-merging-case-block.ll @@ -0,0 +1,144 @@ +; RUN: opt -codegenprepare < %s -mtriple=aarch64-none-linux-gnu -S | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; Expect to skip merging two empty blocks (sw.bb and sw.bb2) into sw.epilog +; as both of them are unlikely executed. +define i32 @f_switch(i32 %c) { +; CHECK-LABEL: @f_switch +; CHECK-LABEL: entry: +; CHECK: i32 10, label %sw.bb +; CHECK: i32 20, label %sw.bb2 +entry: + switch i32 %c, label %sw.default [ + i32 10, label %sw.bb + i32 20, label %sw.bb2 + i32 30, label %sw.bb3 + i32 40, label %sw.bb4 + ], !prof !0 + +sw.bb: ; preds = %entry + br label %sw.epilog + +sw.bb2: ; preds = %entry + br label %sw.epilog + +sw.bb3: ; preds = %entry + call void bitcast (void (...)* @callcase3 to void ()*)() + br label %sw.epilog + +sw.bb4: ; preds = %entry + call void bitcast (void (...)* @callcase4 to void ()*)() + br label %sw.epilog + +sw.default: ; preds = %entry + call void bitcast (void (...)* @calldefault to void ()*)() + br label %sw.epilog + +; CHECK-LABEL: sw.epilog: +; CHECK: %fp.0 = phi void (...)* [ @FD, %sw.default ], [ @F4, %sw.bb4 ], [ @F3, %sw.bb3 ], [ @F2, %sw.bb2 ], [ @F1, %sw.bb ] +sw.epilog: ; preds = %sw.default, %sw.bb3, %sw.bb2, %sw.bb + %fp.0 = phi void (...)* [ @FD, %sw.default ], [ @F4, %sw.bb4 ], [ @F3, %sw.bb3 ], [ @F2, %sw.bb2 ], [ @F1, %sw.bb ] + %callee.knr.cast = bitcast void (...)* %fp.0 to void ()* + call void %callee.knr.cast() + ret i32 0 +} + +; Expect not to merge sw.bb2 because of the conflict in the incoming value from +; sw.bb which is already merged. +define i32 @f_switch2(i32 %c) { +; CHECK-LABEL: @f_switch2 +; CHECK-LABEL: entry: +; CHECK: i32 10, label %sw.epilog +; CHECK: i32 20, label %sw.bb2 +entry: + switch i32 %c, label %sw.default [ + i32 10, label %sw.bb + i32 20, label %sw.bb2 + i32 30, label %sw.bb3 + i32 40, label %sw.bb4 + ], !prof !1 + +sw.bb: ; preds = %entry + br label %sw.epilog + +sw.bb2: ; preds = %entry + br label %sw.epilog + +sw.bb3: ; preds = %entry + call void bitcast (void (...)* @callcase3 to void ()*)() + br label %sw.epilog + +sw.bb4: ; preds = %entry + call void bitcast (void (...)* @callcase4 to void ()*)() + br label %sw.epilog + +sw.default: ; preds = %entry + call void bitcast (void (...)* @calldefault to void ()*)() + br label %sw.epilog + +; CHECK-LABEL: sw.epilog: +; CHECK: %fp.0 = phi void (...)* [ @FD, %sw.default ], [ @F4, %sw.bb4 ], [ @F3, %sw.bb3 ], [ @F2, %sw.bb2 ], [ @F1, %entry ] +sw.epilog: ; preds = %sw.default, %sw.bb3, %sw.bb2, %sw.bb + %fp.0 = phi void (...)* [ @FD, %sw.default ], [ @F4, %sw.bb4 ], [ @F3, %sw.bb3 ], [ @F2, %sw.bb2 ], [ @F1, %sw.bb ] + %callee.knr.cast = bitcast void (...)* %fp.0 to void ()* + call void %callee.knr.cast() + ret i32 0 +} + +; Multiple empty blocks should be considered together if all incoming values +; from them are same. We expect to merge both empty blocks (sw.bb and sw.bb2) +; because the sum of frequencies are higer than the threshold. +define i32 @f_switch3(i32 %c) { +; CHECK-LABEL: @f_switch3 +; CHECK-LABEL: entry: +; CHECK: i32 10, label %sw.epilog +; CHECK: i32 20, label %sw.epilog +entry: + switch i32 %c, label %sw.default [ + i32 10, label %sw.bb + i32 20, label %sw.bb2 + i32 30, label %sw.bb3 + i32 40, label %sw.bb4 + ], !prof !2 + +sw.bb: ; preds = %entry + br label %sw.epilog + +sw.bb2: ; preds = %entry + br label %sw.epilog + +sw.bb3: ; preds = %entry + call void bitcast (void (...)* @callcase3 to void ()*)() + br label %sw.epilog + +sw.bb4: ; preds = %entry + call void bitcast (void (...)* @callcase4 to void ()*)() + br label %sw.epilog + +sw.default: ; preds = %entry + call void bitcast (void (...)* @calldefault to void ()*)() + br label %sw.epilog + +; CHECK-LABEL: sw.epilog: +; CHECK: %fp.0 = phi void (...)* [ @FD, %sw.default ], [ @F4, %sw.bb4 ], [ @F3, %sw.bb3 ], [ @F1, %entry ], [ @F1, %entry ] +sw.epilog: ; preds = %sw.default, %sw.bb3, %sw.bb2, %sw.bb + %fp.0 = phi void (...)* [ @FD, %sw.default ], [ @F4, %sw.bb4 ], [ @F3, %sw.bb3 ], [ @F1, %sw.bb2 ], [ @F1, %sw.bb ] + %callee.knr.cast = bitcast void (...)* %fp.0 to void ()* + call void %callee.knr.cast() + ret i32 0 +} + +declare void @F1(...) local_unnamed_addr +declare void @F2(...) local_unnamed_addr +declare void @F3(...) local_unnamed_addr +declare void @F4(...) local_unnamed_addr +declare void @FD(...) local_unnamed_addr +declare void @callcase3(...) local_unnamed_addr +declare void @callcase4(...) local_unnamed_addr +declare void @calldefault(...) local_unnamed_addr + +!0 = !{!"branch_weights", i32 5, i32 1, i32 1,i32 5, i32 5} +!1 = !{!"branch_weights", i32 1 , i32 5, i32 1,i32 1, i32 1} +!2 = !{!"branch_weights", i32 1 , i32 4, i32 1,i32 1, i32 1}