Index: llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp =================================================================== --- llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp +++ llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp @@ -17,6 +17,8 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ProfileSummaryInfo.h" @@ -124,6 +126,11 @@ "profile-guided-section-prefix", cl::Hidden, cl::init(true), cl::desc("Use profile info to add section prefix for hot/cold functions")); +static cl::opt FreqRatioToSkipMerge( + "cgp-freq-ratio-to-skip-merge", cl::Hidden, cl::init(2), + cl::desc("Skip merging empty blocks if (frequency of empty block) / " + "(frequency of destination block) is greater than this ratio")); + namespace { typedef SmallPtrSet SetOfInstrs; typedef PointerIntPair TypeIsSExt; @@ -136,6 +143,8 @@ const TargetTransformInfo *TTI; const TargetLibraryInfo *TLInfo; const LoopInfo *LI; + std::unique_ptr BFI; + std::unique_ptr BPI; /// As we scan instructions optimizing them, this is the next instruction /// to optimize. Transforms that can invalidate this should update it. @@ -182,8 +191,11 @@ private: bool eliminateFallThrough(Function &F); bool eliminateMostlyEmptyBlocks(Function &F); + BasicBlock *findDestBlockOfMergeableEmptyBlock(BasicBlock *BB); bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const; void eliminateMostlyEmptyBlock(BasicBlock *BB); + bool isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB, + bool isPreheader); bool optimizeBlock(BasicBlock &BB, bool& ModifiedDT); bool optimizeInst(Instruction *I, bool& ModifiedDT); bool optimizeMemoryInst(Instruction *I, Value *Addr, @@ -231,6 +243,8 @@ // Clear per function information. InsertedInsts.clear(); PromotedInsts.clear(); + BFI.reset(); + BPI.reset(); ModifiedDT = false; if (TM) @@ -383,6 +397,38 @@ return Changed; } +/// Find a destination block from BB if BB is mergeable empty block. +BasicBlock *CodeGenPrepare::findDestBlockOfMergeableEmptyBlock(BasicBlock *BB) { + // If this block doesn't end with an uncond branch, ignore it. + BranchInst *BI = dyn_cast(BB->getTerminator()); + if (!BI || !BI->isUnconditional()) + return nullptr; + + // If the instruction before the branch (skipping debug info) isn't a phi + // node, then other stuff is happening here. + BasicBlock::iterator BBI = BI->getIterator(); + if (BBI != BB->begin()) { + --BBI; + while (isa(BBI)) { + if (BBI == BB->begin()) + break; + --BBI; + } + if (!isa(BBI) && !isa(BBI)) + return nullptr; + } + + // Do not break infinite loops. + BasicBlock *DestBB = BI->getSuccessor(0); + if (DestBB == BB) + return nullptr; + + if (!canMergeBlocks(BB, DestBB)) + DestBB = nullptr; + + return DestBB; +} + /// Eliminate blocks that contain only PHI nodes, debug info directives, and an /// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split /// edges in ways that are non-optimal for isel. Start by eliminating these @@ -401,46 +447,106 @@ // Note that this intentionally skips the entry block. for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) { BasicBlock *BB = &*I++; - - // If this block doesn't end with an uncond branch, ignore it. - BranchInst *BI = dyn_cast(BB->getTerminator()); - if (!BI || !BI->isUnconditional()) + BasicBlock *DestBB = findDestBlockOfMergeableEmptyBlock(BB); + if (!DestBB || + !isMergingEmptyBlockProfitable(BB, DestBB, Preheaders.count(BB))) continue; - // If the instruction before the branch (skipping debug info) isn't a phi - // node, then other stuff is happening here. - BasicBlock::iterator BBI = BI->getIterator(); - if (BBI != BB->begin()) { - --BBI; - while (isa(BBI)) { - if (BBI == BB->begin()) - break; - --BBI; - } - if (!isa(BBI) && !isa(BBI)) - continue; - } + eliminateMostlyEmptyBlock(BB); + MadeChange = true; + } + return MadeChange; +} - // Do not break infinite loops. - BasicBlock *DestBB = BI->getSuccessor(0); - if (DestBB == BB) - continue; +bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB, + BasicBlock *DestBB, + bool isPreheader) { + // Do not delete loop preheaders if doing so would create a critical edge. + // Loop preheaders can be good locations to spill registers. If the + // preheader is deleted and we create a critical edge, registers may be + // spilled in the loop body instead. + if (!DisablePreheaderProtect && isPreheader && + !(BB->getSinglePredecessor() && + BB->getSinglePredecessor()->getSingleSuccessor())) + return false; + + // Try to skip merging if the unique predecessor of BB is terminated by a + // switch or indirect branch instruction, and BB is used as an incoming block + // of PHIs in DestBB. In such case, merging BB and DestBB would cause ISel to + // add COPY instructions in the predecessor of BB instead of BB (if it is not + // merged). Note that the critical edge created by merging such blocks wont be + // split in MachineSink because the jump table is not analyzable. By keeping + // such empty block (BB), ISel will place COPY instructions in BB, not in the + // predecessor of BB. + BasicBlock *Pred = BB->getUniquePredecessor(); + if (!Pred || + !(isa(Pred->getTerminator()) || + isa(Pred->getTerminator()))) + return true; - if (!canMergeBlocks(BB, DestBB)) - continue; + if (BB->getTerminator() != BB->getFirstNonPHI()) + return true; - // Do not delete loop preheaders if doing so would create a critical edge. - // Loop preheaders can be good locations to spill registers. If the - // preheader is deleted and we create a critical edge, registers may be - // spilled in the loop body instead. - if (!DisablePreheaderProtect && Preheaders.count(BB) && - !(BB->getSinglePredecessor() && BB->getSinglePredecessor()->getSingleSuccessor())) - continue; + // We use a simple cost heuristic which determine skipping merging is + // profitable if the cost of skipping merging is less than the cost of + // merging : Cost(skipping merging) < Cost(merging BB), where the + // Cost(skipping merging) is Freq(BB) * (Cost(Copy) + Cost(Branch)), and + // the Cost(merging BB) is Freq(Pred) * Cost(Copy). + // Assuming Cost(Copy) == Cost(Branch), we could simplify it to : + // Freq(Pred) / Freq(BB) > 2. + // Note that if there are multiple empty blocks sharing the same incoming + // value for the PHIs in the DestBB, we consider them together. In such + // case, Cost(merging BB) will be the sum of their frequencies. - eliminateMostlyEmptyBlock(BB); - MadeChange = true; + if (!isa(DestBB->begin())) + return true; + + SmallPtrSet SameIncomingValueBBs; + + // Find all other incoming blocks from which incoming values of all PHIs in + // DestBB are the same as the ones from BB. + for (pred_iterator PI = pred_begin(DestBB), E = pred_end(DestBB); PI != E; + ++PI) { + BasicBlock *DestBBPred = *PI; + if (DestBBPred == BB) + continue; + + bool HasAllSameValue = true; + BasicBlock::const_iterator DestBBI = DestBB->begin(); + while (const PHINode *DestPN = dyn_cast(DestBBI++)) { + if (DestPN->getIncomingValueForBlock(BB) != + DestPN->getIncomingValueForBlock(DestBBPred)) { + HasAllSameValue = false; + break; + } + } + if (HasAllSameValue) + SameIncomingValueBBs.insert(DestBBPred); } - return MadeChange; + + // See if all BB's incoming values are same as the value from Pred. In this + // case, no reason to skip merging because COPYs are expected to be place in + // Pred already. + if (SameIncomingValueBBs.count(Pred)) + return true; + + if (!BFI) { + Function &F = *BB->getParent(); + LoopInfo LI{DominatorTree(F)}; + BPI.reset(new BranchProbabilityInfo(F, LI)); + BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); + } + + BlockFrequency PredFreq = BFI->getBlockFreq(Pred); + BlockFrequency BBFreq = BFI->getBlockFreq(BB); + + for (auto SameValueBB : SameIncomingValueBBs) + if (SameValueBB->getUniquePredecessor() == Pred && + DestBB == findDestBlockOfMergeableEmptyBlock(SameValueBB)) + BBFreq += BFI->getBlockFreq(SameValueBB); + + return PredFreq.getFrequency() <= + BBFreq.getFrequency() * FreqRatioToSkipMerge; } /// Return true if we can merge BB into DestBB if there is a single Index: llvm/trunk/test/CodeGen/X86/phi-immediate-factoring.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/phi-immediate-factoring.ll +++ llvm/trunk/test/CodeGen/X86/phi-immediate-factoring.ll @@ -1,5 +1,6 @@ ; REQUIRES: asserts -; RUN: llc < %s -disable-preheader-prot=true -march=x86 -stats 2>&1 | grep "Number of blocks eliminated" | grep 6 +; RUN: llc < %s -disable-preheader-prot=true -march=x86 -stats 2>&1 | grep "Number of blocks eliminated" | grep 3 +; RUN: llc < %s -disable-preheader-prot=true -march=x86 -stats -cgp-freq-ratio-to-skip-merge=10 2>&1 | grep "Number of blocks eliminated" | grep 6 ; RUN: llc < %s -disable-preheader-prot=false -march=x86 -stats 2>&1 | grep "Number of blocks eliminated" | grep 3 ; PR1296 Index: llvm/trunk/test/CodeGen/X86/ragreedy-hoist-spill.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/ragreedy-hoist-spill.ll +++ llvm/trunk/test/CodeGen/X86/ragreedy-hoist-spill.ll @@ -177,14 +177,14 @@ br label %for.cond357 sw.bb474: + ; CHECK: sw.bb474 + ; spill is hoisted here. Although loop depth1 is even hotter than loop depth2, sw.bb474 is still cold. + ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp) + ; CHECK: land.rhs485 %cmp476 = icmp eq i8 undef, 0 br i1 %cmp476, label %if.end517, label %do.body479.preheader do.body479.preheader: - ; CHECK: do.body479.preheader - ; spill is hoisted here. Although loop depth1 is even hotter than loop depth2, do.body479.preheader is cold. - ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp) - ; CHECK: land.rhs485 %cmp4833314 = icmp eq i8 undef, 0 br i1 %cmp4833314, label %if.end517, label %land.rhs485 Index: llvm/trunk/test/Transforms/CodeGenPrepare/AArch64/widen_switch.ll =================================================================== --- llvm/trunk/test/Transforms/CodeGenPrepare/AArch64/widen_switch.ll +++ llvm/trunk/test/Transforms/CodeGenPrepare/AArch64/widen_switch.ll @@ -28,7 +28,7 @@ ; ARM64-LABEL: @widen_switch_i16( ; ARM64: %0 = zext i16 %trunc to i32 ; ARM64-NEXT: switch i32 %0, label %sw.default [ -; ARM64-NEXT: i32 1, label %return +; ARM64-NEXT: i32 1, label %sw.bb0 ; ARM64-NEXT: i32 65535, label %sw.bb1 } @@ -58,7 +58,7 @@ ; ARM64-LABEL: @widen_switch_i17( ; ARM64: %0 = zext i17 %trunc to i32 ; ARM64-NEXT: switch i32 %0, label %sw.default [ -; ARM64-NEXT: i32 10, label %return +; ARM64-NEXT: i32 10, label %sw.bb0 ; ARM64-NEXT: i32 131071, label %sw.bb1 } @@ -89,7 +89,7 @@ ; ARM64-LABEL: @widen_switch_i16_sext( ; ARM64: %0 = sext i2 %a to i32 ; ARM64-NEXT: switch i32 %0, label %sw.default [ -; ARM64-NEXT: i32 1, label %return +; ARM64-NEXT: i32 1, label %sw.bb0 ; ARM64-NEXT: i32 -1, label %sw.bb1 } Index: llvm/trunk/test/Transforms/CodeGenPrepare/X86/widen_switch.ll =================================================================== --- llvm/trunk/test/Transforms/CodeGenPrepare/X86/widen_switch.ll +++ llvm/trunk/test/Transforms/CodeGenPrepare/X86/widen_switch.ll @@ -28,7 +28,7 @@ ; X86-LABEL: @widen_switch_i16( ; X86: %trunc = trunc i32 %a to i16 ; X86-NEXT: switch i16 %trunc, label %sw.default [ -; X86-NEXT: i16 1, label %return +; X86-NEXT: i16 1, label %sw.bb0 ; X86-NEXT: i16 -1, label %sw.bb1 } @@ -58,7 +58,7 @@ ; X86-LABEL: @widen_switch_i17( ; X86: %0 = zext i17 %trunc to i32 ; X86-NEXT: switch i32 %0, label %sw.default [ -; X86-NEXT: i32 10, label %return +; X86-NEXT: i32 10, label %sw.bb0 ; X86-NEXT: i32 131071, label %sw.bb1 } @@ -89,7 +89,7 @@ ; X86-LABEL: @widen_switch_i16_sext( ; X86: %0 = sext i2 %a to i8 ; X86-NEXT: switch i8 %0, label %sw.default [ -; X86-NEXT: i8 1, label %return +; X86-NEXT: i8 1, label %sw.bb0 ; X86-NEXT: i8 -1, label %sw.bb1 } Index: llvm/trunk/test/Transforms/CodeGenPrepare/skip-merging-case-block.ll =================================================================== --- llvm/trunk/test/Transforms/CodeGenPrepare/skip-merging-case-block.ll +++ llvm/trunk/test/Transforms/CodeGenPrepare/skip-merging-case-block.ll @@ -0,0 +1,200 @@ +; RUN: opt -codegenprepare < %s -mtriple=aarch64-none-linux-gnu -S | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; Expect to skip merging two empty blocks (sw.bb and sw.bb2) into sw.epilog +; as both of them are unlikely executed. +define i32 @f_switch(i32 %c) { +; CHECK-LABEL: @f_switch +; CHECK-LABEL: entry: +; CHECK: i32 10, label %sw.bb +; CHECK: i32 20, label %sw.bb2 +entry: + switch i32 %c, label %sw.default [ + i32 10, label %sw.bb + i32 20, label %sw.bb2 + i32 30, label %sw.bb3 + i32 40, label %sw.bb4 + ], !prof !0 + +sw.bb: ; preds = %entry + br label %sw.epilog + +sw.bb2: ; preds = %entry + br label %sw.epilog + +sw.bb3: ; preds = %entry + call void bitcast (void (...)* @callcase3 to void ()*)() + br label %sw.epilog + +sw.bb4: ; preds = %entry + call void bitcast (void (...)* @callcase4 to void ()*)() + br label %sw.epilog + +sw.default: ; preds = %entry + call void bitcast (void (...)* @calldefault to void ()*)() + br label %sw.epilog + +; CHECK-LABEL: sw.epilog: +; CHECK: %fp.0 = phi void (...)* [ @FD, %sw.default ], [ @F4, %sw.bb4 ], [ @F3, %sw.bb3 ], [ @F2, %sw.bb2 ], [ @F1, %sw.bb ] +sw.epilog: ; preds = %sw.default, %sw.bb3, %sw.bb2, %sw.bb + %fp.0 = phi void (...)* [ @FD, %sw.default ], [ @F4, %sw.bb4 ], [ @F3, %sw.bb3 ], [ @F2, %sw.bb2 ], [ @F1, %sw.bb ] + %callee.knr.cast = bitcast void (...)* %fp.0 to void ()* + call void %callee.knr.cast() + ret i32 0 +} + +; Expect not to merge sw.bb2 because of the conflict in the incoming value from +; sw.bb which is already merged. +define i32 @f_switch2(i32 %c) { +; CHECK-LABEL: @f_switch2 +; CHECK-LABEL: entry: +; CHECK: i32 10, label %sw.epilog +; CHECK: i32 20, label %sw.bb2 +entry: + switch i32 %c, label %sw.default [ + i32 10, label %sw.bb + i32 20, label %sw.bb2 + i32 30, label %sw.bb3 + i32 40, label %sw.bb4 + ], !prof !1 + +sw.bb: ; preds = %entry + br label %sw.epilog + +sw.bb2: ; preds = %entry + br label %sw.epilog + +sw.bb3: ; preds = %entry + call void bitcast (void (...)* @callcase3 to void ()*)() + br label %sw.epilog + +sw.bb4: ; preds = %entry + call void bitcast (void (...)* @callcase4 to void ()*)() + br label %sw.epilog + +sw.default: ; preds = %entry + call void bitcast (void (...)* @calldefault to void ()*)() + br label %sw.epilog + +; CHECK-LABEL: sw.epilog: +; CHECK: %fp.0 = phi void (...)* [ @FD, %sw.default ], [ @F4, %sw.bb4 ], [ @F3, %sw.bb3 ], [ @F2, %sw.bb2 ], [ @F1, %entry ] +sw.epilog: ; preds = %sw.default, %sw.bb3, %sw.bb2, %sw.bb + %fp.0 = phi void (...)* [ @FD, %sw.default ], [ @F4, %sw.bb4 ], [ @F3, %sw.bb3 ], [ @F2, %sw.bb2 ], [ @F1, %sw.bb ] + %callee.knr.cast = bitcast void (...)* %fp.0 to void ()* + call void %callee.knr.cast() + ret i32 0 +} + +; Multiple empty blocks should be considered together if all incoming values +; from them are same. We expect to merge both empty blocks (sw.bb and sw.bb2) +; because the sum of frequencies are higer than the threshold. +define i32 @f_switch3(i32 %c) { +; CHECK-LABEL: @f_switch3 +; CHECK-LABEL: entry: +; CHECK: i32 10, label %sw.epilog +; CHECK: i32 20, label %sw.epilog +entry: + switch i32 %c, label %sw.default [ + i32 10, label %sw.bb + i32 20, label %sw.bb2 + i32 30, label %sw.bb3 + i32 40, label %sw.bb4 + ], !prof !2 + +sw.bb: ; preds = %entry + br label %sw.epilog + +sw.bb2: ; preds = %entry + br label %sw.epilog + +sw.bb3: ; preds = %entry + call void bitcast (void (...)* @callcase3 to void ()*)() + br label %sw.epilog + +sw.bb4: ; preds = %entry + call void bitcast (void (...)* @callcase4 to void ()*)() + br label %sw.epilog + +sw.default: ; preds = %entry + call void bitcast (void (...)* @calldefault to void ()*)() + br label %sw.epilog + +; CHECK-LABEL: sw.epilog: +; CHECK: %fp.0 = phi void (...)* [ @FD, %sw.default ], [ @F4, %sw.bb4 ], [ @F3, %sw.bb3 ], [ @F1, %entry ], [ @F1, %entry ] +sw.epilog: ; preds = %sw.default, %sw.bb3, %sw.bb2, %sw.bb + %fp.0 = phi void (...)* [ @FD, %sw.default ], [ @F4, %sw.bb4 ], [ @F3, %sw.bb3 ], [ @F1, %sw.bb2 ], [ @F1, %sw.bb ] + %callee.knr.cast = bitcast void (...)* %fp.0 to void ()* + call void %callee.knr.cast() + ret i32 0 +} + +declare void @F1(...) local_unnamed_addr +declare void @F2(...) local_unnamed_addr +declare void @F3(...) local_unnamed_addr +declare void @F4(...) local_unnamed_addr +declare void @FD(...) local_unnamed_addr +declare void @callcase3(...) local_unnamed_addr +declare void @callcase4(...) local_unnamed_addr +declare void @calldefault(...) local_unnamed_addr + +!0 = !{!"branch_weights", i32 5, i32 1, i32 1,i32 5, i32 5} +!1 = !{!"branch_weights", i32 1 , i32 5, i32 1,i32 1, i32 1} +!2 = !{!"branch_weights", i32 1 , i32 4, i32 1,i32 1, i32 1} + + +; This test that BFI/BPI is created without any assertion in isMergingEmptyBlockProfitable() +; in the case where empty blocks are removed before creating BFI/BPI. +@b = common global i32 0, align 4 +@a = common global i32* null, align 8 +define i32 @should_not_assert(i32 %i) local_unnamed_addr { +entry: + %0 = load i32, i32* @b, align 4 + %cond = icmp eq i32 %0, 6 + br i1 %cond, label %while.cond.preheader, label %sw.epilog + +while.cond.preheader: ; preds = %entry + %1 = load i32*, i32** @a, align 8 + %magicptr = ptrtoint i32* %1 to i64 + %arrayidx = getelementptr inbounds i32, i32* %1, i64 1 + br label %while.cond + +while.cond: ; preds = %while.cond.preheader, %land.rhs + switch i64 %magicptr, label %land.rhs [ + i64 32, label %while.cond2.loopexit + i64 0, label %while.cond2.loopexit + ] + +land.rhs: ; preds = %while.cond + %2 = load i32, i32* %arrayidx, align 4 + %tobool1 = icmp eq i32 %2, 0 + br i1 %tobool1, label %while.cond2thread-pre-split.loopexit, label %while.cond + +while.cond2thread-pre-split.loopexit: ; preds = %land.rhs + br label %while.cond2thread-pre-split + +while.cond2thread-pre-split: ; preds = %while.cond2thread-pre-split.loopexit, %while.body4 + %.pr = phi i32* [ %.pr.pre, %while.body4 ], [ %1, %while.cond2thread-pre-split.loopexit ] + br label %while.cond2 + +while.cond2.loopexit: ; preds = %while.cond, %while.cond + br label %while.cond2 + +while.cond2: ; preds = %while.cond2.loopexit, %while.cond2thread-pre-split + %3 = phi i32* [ %.pr, %while.cond2thread-pre-split ], [ %1, %while.cond2.loopexit ] + %tobool3 = icmp eq i32* %3, null + br i1 %tobool3, label %sw.epilog, label %while.body4 + +while.body4: ; preds = %while.cond2 + tail call void bitcast (void (...)* @fn2 to void ()*)() + %.pr.pre = load i32*, i32** @a, align 8 + br label %while.cond2thread-pre-split + +sw.epilog: ; preds = %while.cond2, %entry + ret i32 undef +} + + +declare void @fn2(...) local_unnamed_addr +