diff --git a/bolt/include/bolt/Passes/TailDuplication.h b/bolt/include/bolt/Passes/TailDuplication.h
--- a/bolt/include/bolt/Passes/TailDuplication.h
+++ b/bolt/include/bolt/Passes/TailDuplication.h
@@ -46,16 +46,19 @@
 /// Pass for duplicating blocks that would require a jump.
 class TailDuplication : public BinaryFunctionPass {
-  /// Record how many possible tail duplications there can be.
-  uint64_t PossibleDuplications = 0;
+  /// The number of functions modified by tail duplication.
+  uint64_t ModifiedFunctions = 0;

-  /// Record how many times these duplications would get used.
-  uint64_t PossibleDuplicationsDynamicCount = 0;
+  /// The number of duplicated basic blocks.
+  uint64_t DuplicatedBlockCount = 0;
+
+  /// The size (in bytes) of duplicated basic blocks.
+  uint64_t DuplicatedByteCount = 0;

-  /// Record the execution count of all unconditional branches.
-  uint64_t UnconditionalBranchDynamicCount = 0;
+  /// Record how many times these duplications would get used.
+  uint64_t DuplicationsDynamicCount = 0;

   /// Record the execution count of all blocks.
-  uint64_t AllBlocksDynamicCount = 0;
+  uint64_t AllDynamicCount = 0;

   /// Record the number of instructions deleted because of propagation
   uint64_t StaticInstructionDeletionCount = 0;
@@ -87,25 +90,57 @@
   constantAndCopyPropagate(BinaryBasicBlock &OriginalBB,
                            std::vector<BinaryBasicBlock *> &BlocksToPropagate);

-  /// True if Succ is in the same cache line as BB (approximately)
+  /// True if Tail is in the same cache line as BB (approximately)
   bool isInCacheLine(const BinaryBasicBlock &BB,
-                     const BinaryBasicBlock &Succ) const;
+                     const BinaryBasicBlock &Tail) const;

   /// Duplicates BlocksToDuplicate and places them after BB.
-  std::vector<BinaryBasicBlock *>
-  tailDuplicate(BinaryBasicBlock &BB,
-                const std::vector<BinaryBasicBlock *> &BlocksToDuplicate) const;
-
+  std::vector<BinaryBasicBlock *> duplicateBlocks(
+      BinaryBasicBlock &BB,
+      const std::vector<BinaryBasicBlock *> &BlocksToDuplicate) const;
+
+  /// Decide whether the tail basic block should be duplicated after BB.
+  bool shouldDuplicate(BinaryBasicBlock *BB, BinaryBasicBlock *Tail) const;
+
+  /// Compute the cache score for a jump (Src, Dst) taken Count times.
+  /// The score is Count scaled by a factor in the range [0..1] that
+  /// quantifies how "cache-friendly" the jump is: the factor is close to 1
+  /// for "short" forward jumps, it is 0 for "long" jumps exceeding a
+  /// specified threshold, and it decreases linearly in between. For backward
+  /// jumps, the factor is additionally scaled by a specified weight.
+  double cacheScore(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr,
+                    uint64_t DstSize, uint64_t Count) const;
+
+  /// Decide whether the cache score has been improved after duplication.
+  bool cacheScoreImproved(const MCCodeEmitter *Emitter, BinaryFunction &BF,
+                          BinaryBasicBlock *Pred, BinaryBasicBlock *Tail) const;
+
+  /// A moderate strategy for tail duplication.
   /// Returns a vector of BinaryBasicBlock to copy after BB. If it's empty,
-  /// nothing should be duplicated
+  /// nothing should be duplicated.
   std::vector<BinaryBasicBlock *>
-  moderateCodeToDuplicate(BinaryBasicBlock &BB) const;
+  moderateDuplicate(BinaryBasicBlock &BB, BinaryBasicBlock &Tail) const;
+
+  /// An aggressive strategy for tail duplication.
   std::vector<BinaryBasicBlock *>
-  aggressiveCodeToDuplicate(BinaryBasicBlock &BB) const;
+  aggressiveDuplicate(BinaryBasicBlock &BB, BinaryBasicBlock &Tail) const;
+
+  /// A cache-aware strategy for tail duplication.
+  std::vector<BinaryBasicBlock *> cacheDuplicate(const MCCodeEmitter *Emitter,
+                                                 BinaryFunction &BF,
+                                                 BinaryBasicBlock *BB,
+                                                 BinaryBasicBlock *Tail) const;

   void runOnFunction(BinaryFunction &Function);

 public:
+  enum DuplicationMode : char {
+    TD_NONE = 0,
+    TD_AGGRESSIVE,
+    TD_MODERATE,
+    TD_CACHE
+  };
+
   explicit TailDuplication() : BinaryFunctionPass(false) {}

   const char *getName() const override { return "tail duplication"; }
diff --git a/bolt/lib/Passes/TailDuplication.cpp b/bolt/lib/Passes/TailDuplication.cpp
--- a/bolt/lib/Passes/TailDuplication.cpp
+++ b/bolt/lib/Passes/TailDuplication.cpp
@@ -11,7 +11,9 @@
 //===----------------------------------------------------------------------===//

 #include "bolt/Passes/TailDuplication.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/MC/MCRegisterInfo.h"
+
 #include <numeric>

 #define DEBUG_TYPE "taildup"
@@ -21,13 +23,21 @@
 namespace opts {

 extern cl::OptionCategory BoltOptCategory;
-
-static cl::opt<bool> TailDuplicationAggressive(
-    "tail-duplication-aggressive",
-    cl::desc("tail duplication should act aggressively in duplicating multiple "
-             "blocks per tail"),
-    cl::ZeroOrMore, cl::ReallyHidden, cl::init(false),
-    cl::cat(BoltOptCategory));
+extern cl::opt<bool> NoThreads;
+
+cl::opt<bolt::TailDuplication::DuplicationMode> TailDuplicationMode(
+    "tail-duplication",
+    cl::desc("duplicate unconditional branches that cross a cache line"),
+    cl::init(bolt::TailDuplication::TD_NONE),
+    cl::values(clEnumValN(bolt::TailDuplication::TD_NONE, "none",
+                          "do not apply"),
+               clEnumValN(bolt::TailDuplication::TD_AGGRESSIVE, "aggressive",
+                          "aggressive strategy"),
+               clEnumValN(bolt::TailDuplication::TD_MODERATE, "moderate",
+                          "moderate strategy"),
+               clEnumValN(bolt::TailDuplication::TD_CACHE, "cache",
+                          "cache-aware duplication strategy")),
+    cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

 static cl::opt<unsigned>
     TailDuplicationMinimumOffset("tail-duplication-minimum-offset",
@@ -38,18 +48,37 @@
 static cl::opt<unsigned> TailDuplicationMaximumDuplication(
     "tail-duplication-maximum-duplication",
-    cl::desc("maximum size of duplicated blocks (in bytes)"), cl::ZeroOrMore,
-    cl::ReallyHidden, cl::init(64), cl::cat(BoltOptCategory));
+    cl::desc("tail blocks whose size (in bytes) exceeds the value are never "
+             "duplicated"),
+    cl::ZeroOrMore, cl::ReallyHidden, cl::init(24), cl::cat(BoltOptCategory));
+
+static cl::opt<unsigned> TailDuplicationMinimumDuplication(
+    "tail-duplication-minimum-duplication",
+    cl::desc("tail blocks with size (in bytes) not exceeding the value are "
+             "always duplicated"),
+    cl::ZeroOrMore, cl::ReallyHidden, cl::init(2), cl::cat(BoltOptCategory));

 static cl::opt<bool> TailDuplicationConstCopyPropagation(
     "tail-duplication-const-copy-propagation",
     cl::desc("enable const and copy propagation after tail duplication"),
     cl::ReallyHidden, cl::init(false), cl::cat(BoltOptCategory));

+static cl::opt<unsigned> TailDuplicationMaxCacheDistance(
+    "tail-duplication-max-cache-distance",
+    cl::desc("The maximum distance (in bytes) of jumps for ExtTSP value"),
+    cl::init(256), cl::ReallyHidden, cl::ZeroOrMore, cl::cat(BoltOptCategory));
+
+static cl::opt<double> TailDuplicationCacheBackwardWeight(
+    "tail-duplication-cache-backward-weight",
+    cl::desc("The weight of backward jumps for ExtTSP value"),
+    cl::init(0.5), cl::ReallyHidden, cl::ZeroOrMore, cl::cat(BoltOptCategory));
+
 } // namespace opts

 namespace llvm {
 namespace bolt {
+
 void TailDuplication::getCallerSavedRegs(const MCInst &Inst, BitVector &Regs,
                                          BinaryContext &BC) const {
   if (!BC.MIB->isCall(Inst))
@@ -242,24 +271,39 @@
 }

 std::vector<BinaryBasicBlock *>
-TailDuplication::moderateCodeToDuplicate(BinaryBasicBlock &BB) const {
+TailDuplication::moderateDuplicate(BinaryBasicBlock &BB,
+                                   BinaryBasicBlock &Tail) const {
   std::vector<BinaryBasicBlock *> BlocksToDuplicate;
-  if (BB.hasJumpTable())
+  // The block must be hot
+  if (BB.getKnownExecutionCount() == 0)
+    return BlocksToDuplicate;
+  // and its successor is not already in the same cache line
+  if (isInCacheLine(BB, Tail))
     return BlocksToDuplicate;
-  if (BB.getOriginalSize() > opts::TailDuplicationMaximumDuplication)
+  // and its size does not exceed the maximum allowed size
+  if (Tail.getOriginalSize() > opts::TailDuplicationMaximumDuplication)
     return BlocksToDuplicate;

-  for (auto Itr = BB.succ_begin(); Itr != BB.succ_end(); ++Itr) {
-    if ((*Itr)->getLayoutIndex() == BB.getLayoutIndex() + 1)
-      // If duplicating would introduce a new branch, don't duplicate
+  // If duplicating would introduce a new branch, don't duplicate
+  for (auto Itr = Tail.succ_begin(); Itr != Tail.succ_end(); ++Itr) {
+    if ((*Itr)->getLayoutIndex() == Tail.getLayoutIndex() + 1)
       return BlocksToDuplicate;
   }
-  BlocksToDuplicate.push_back(&BB);
+
+  BlocksToDuplicate.push_back(&Tail);
   return BlocksToDuplicate;
 }

 std::vector<BinaryBasicBlock *>
-TailDuplication::aggressiveCodeToDuplicate(BinaryBasicBlock &BB) const {
+TailDuplication::aggressiveDuplicate(BinaryBasicBlock &BB,
+                                     BinaryBasicBlock &Tail) const {
   std::vector<BinaryBasicBlock *> BlocksToDuplicate;
+  // The block must be hot
+  if (BB.getKnownExecutionCount() == 0)
+    return BlocksToDuplicate;
+  // and its successor is not already in the same cache line
+  if (isInCacheLine(BB, Tail))
+    return BlocksToDuplicate;
+
   BinaryBasicBlock *CurrBB = &BB;
   while (CurrBB) {
     LLVM_DEBUG(dbgs() << "Aggressive tail duplication: adding "
@@ -297,10 +341,10 @@

     // With one successor, if its a jump, we should duplicate all blocks in
     // BlocksToDuplicate. Otherwise, we should keep going
-    BinaryBasicBlock *Succ = CurrBB->getSuccessor();
-    if (Succ->getLayoutIndex() != CurrBB->getLayoutIndex() + 1)
+    BinaryBasicBlock *SuccBB = CurrBB->getSuccessor();
+    if (SuccBB->getLayoutIndex() != CurrBB->getLayoutIndex() + 1)
       break;
-    CurrBB = Succ;
+    CurrBB = SuccBB;
   }
   // Don't duplicate if its too much code
   unsigned DuplicationByteCount = std::accumulate(
@@ -319,7 +363,146 @@
   return BlocksToDuplicate;
 }

-std::vector<BinaryBasicBlock *> TailDuplication::tailDuplicate(
+bool TailDuplication::shouldDuplicate(BinaryBasicBlock *Pred,
+                                      BinaryBasicBlock *Tail) const {
+  if (Pred == Tail)
+    return false;
+  // Cannot duplicate non-tail blocks
+  if (Tail->succ_size() != 0)
+    return false;
+  // The blocks are already in order
+  if (Pred->getLayoutIndex() + 1 == Tail->getLayoutIndex())
+    return false;
+  // No tail duplication for blocks with jump tables
+  if (Pred->hasJumpTable())
+    return false;
+  if (Tail->hasJumpTable())
+    return false;
+
+  return true;
+}
+
+double TailDuplication::cacheScore(uint64_t SrcAddr, uint64_t SrcSize,
+                                   uint64_t DstAddr, uint64_t DstSize,
+                                   uint64_t Count) const {
+  assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE);
+
+  bool IsForwardJump = SrcAddr <= DstAddr;
+  uint64_t JumpDistance = 0;
+  // Computing the length of the jump so that it takes the sizes of the two
+  // blocks into consideration
+  if (IsForwardJump) {
+    JumpDistance = (DstAddr + DstSize) - (SrcAddr);
+  } else {
+    JumpDistance = (SrcAddr + SrcSize) - (DstAddr);
+  }
+
+  if (JumpDistance >= opts::TailDuplicationMaxCacheDistance)
+    return 0;
+  double Prob = 1.0 - static_cast<double>(JumpDistance) /
+                          opts::TailDuplicationMaxCacheDistance;
+  return (IsForwardJump ? 1.0 : opts::TailDuplicationCacheBackwardWeight) *
+         Prob * Count;
+}
+
+bool TailDuplication::cacheScoreImproved(const MCCodeEmitter *Emitter,
+                                         BinaryFunction &BF,
+                                         BinaryBasicBlock *Pred,
+                                         BinaryBasicBlock *Tail) const {
+  // Collect (estimated) basic block sizes
+  DenseMap<BinaryBasicBlock *, uint64_t> BBSize;
+  for (BinaryBasicBlock *BB : BF.layout()) {
+    BBSize[BB] = std::max<uint64_t>(BB->estimateSize(Emitter), 1);
+  }
+
+  // Build current addresses of basic blocks starting at the entry block
+  DenseMap<BinaryBasicBlock *, uint64_t> CurAddr;
+  uint64_t Addr = 0;
+  for (BinaryBasicBlock *SrcBB : BF.layout()) {
+    CurAddr[SrcBB] = Addr;
+    Addr += BBSize[SrcBB];
+  }
+
+  // Build new addresses (after duplication) starting at the entry block
+  DenseMap<BinaryBasicBlock *, uint64_t> NewAddr;
+  Addr = 0;
+  for (BinaryBasicBlock *SrcBB : BF.layout()) {
+    NewAddr[SrcBB] = Addr;
+    Addr += BBSize[SrcBB];
+    if (SrcBB == Pred)
+      Addr += BBSize[Tail];
+  }
+
+  // Compute the cache score for the existing layout of basic blocks
+  double CurScore = 0;
+  for (BinaryBasicBlock *SrcBB : BF.layout()) {
+    auto BI = SrcBB->branch_info_begin();
+    for (BinaryBasicBlock *DstBB : SrcBB->successors()) {
+      if (SrcBB != DstBB) {
+        CurScore += cacheScore(CurAddr[SrcBB], BBSize[SrcBB], CurAddr[DstBB],
+                               BBSize[DstBB], BI->Count);
+      }
+      ++BI;
+    }
+  }
+
+  // Compute the cache score for the layout of blocks after tail duplication
+  double NewScore = 0;
+  for (BinaryBasicBlock *SrcBB : BF.layout()) {
+    auto BI = SrcBB->branch_info_begin();
+    for (BinaryBasicBlock *DstBB : SrcBB->successors()) {
+      if (SrcBB != DstBB) {
+        if (SrcBB == Pred && DstBB == Tail) {
+          NewScore += cacheScore(NewAddr[SrcBB], BBSize[SrcBB],
+                                 NewAddr[SrcBB] + BBSize[SrcBB],
+                                 BBSize[DstBB], BI->Count);
+        } else {
+          NewScore += cacheScore(NewAddr[SrcBB], BBSize[SrcBB], NewAddr[DstBB],
+                                 BBSize[DstBB], BI->Count);
+        }
+      }
+      ++BI;
+    }
+  }
+
+  return NewScore > CurScore;
+}
+
+std::vector<BinaryBasicBlock *>
+TailDuplication::cacheDuplicate(const MCCodeEmitter *Emitter,
+                                BinaryFunction &BF, BinaryBasicBlock *Pred,
+                                BinaryBasicBlock *Tail) const {
+  std::vector<BinaryBasicBlock *> BlocksToDuplicate;
+
+  // No need to duplicate cold basic blocks
+  if (Pred->isCold() || Tail->isCold()) {
+    return BlocksToDuplicate;
+  }
+  // Always duplicate "small" tail basic blocks, which might be beneficial for
+  // code size, since a jump instruction is eliminated
+  if (Tail->estimateSize(Emitter) <= opts::TailDuplicationMinimumDuplication) {
+    BlocksToDuplicate.push_back(Tail);
+    return BlocksToDuplicate;
+  }
+  // Never duplicate "large" tail basic blocks
+  if (Tail->estimateSize(Emitter) > opts::TailDuplicationMaximumDuplication) {
+    return BlocksToDuplicate;
+  }
+  // Do not append basic blocks after the last hot block in the current layout
+  auto NextBlock = BF.getBasicBlockAfter(Pred);
+  if (NextBlock == nullptr || (!Pred->isCold() && NextBlock->isCold())) {
+    return BlocksToDuplicate;
+  }
+
+  // Duplicate the tail only if it improves the cache score
+  if (cacheScoreImproved(Emitter, BF, Pred, Tail)) {
+    BlocksToDuplicate.push_back(Tail);
+  }
+
+  return BlocksToDuplicate;
+}
+
+std::vector<BinaryBasicBlock *> TailDuplication::duplicateBlocks(
     BinaryBasicBlock &BB,
     const std::vector<BinaryBasicBlock *> &BlocksToDuplicate) const {
   BinaryFunction *BF = BB.getFunction();
@@ -329,7 +512,7 @@
   // successor's execution count. Used to set this new branches execution count
   // and lower the old successor's execution count
   double ExecutionCountRatio =
-      BB.getExecutionCount() > BB.getSuccessor()->getExecutionCount()
+      BB.getExecutionCount() >= BB.getSuccessor()->getExecutionCount()
          ? 1.0
          : (double)BB.getExecutionCount() /
               BB.getSuccessor()->getExecutionCount();
@@ -347,15 +530,15 @@

   std::vector<std::unique_ptr<BinaryBasicBlock>> DuplicatedBlocks;
   std::vector<BinaryBasicBlock *> DuplicatedBlocksToReturn;

-  for (BinaryBasicBlock *CurrBB : BlocksToDuplicate) {
+  for (BinaryBasicBlock *CurBB : BlocksToDuplicate) {
     DuplicatedBlocks.emplace_back(
         BF->createBasicBlock(0, (BC.Ctx)->createNamedTempSymbol("tail-dup")));
     BinaryBasicBlock *NewBB = DuplicatedBlocks.back().get();

-    NewBB->addInstructions(CurrBB->begin(), CurrBB->end());
+    NewBB->addInstructions(CurBB->begin(), CurBB->end());
     // Set execution count as if it was just a copy of the original
-    NewBB->setExecutionCount(
-        std::max((uint64_t)1, CurrBB->getExecutionCount()));
+    NewBB->setExecutionCount(CurBB->getExecutionCount());
+    NewBB->setIsCold(CurBB->isCold());
     LastDuplicatedBB->addSuccessor(NewBB, LastBI);
     DuplicatedBlocksToReturn.push_back(NewBB);
@@ -367,10 +550,10 @@
       LastDuplicatedBB->adjustExecutionCount(ExecutionCountRatio);
     }

-    if (CurrBB->succ_size() == 1)
-      LastBI = CurrBB->getBranchInfo(*(CurrBB->getSuccessor()));
+    if (CurBB->succ_size() == 1)
+      LastBI = CurBB->getBranchInfo(*(CurBB->getSuccessor()));

-    LastOriginalBB = CurrBB;
+    LastOriginalBB = CurBB;
     LastDuplicatedBB = NewBB;
   }
@@ -387,64 +570,73 @@
 }

 void TailDuplication::runOnFunction(BinaryFunction &Function) {
+  // Create a separate MCCodeEmitter to allow lock-free execution
+  BinaryContext::IndependentCodeEmitter Emitter;
+  if (!opts::NoThreads) {
+    Emitter = Function.getBinaryContext().createIndependentMCCodeEmitter();
+  }
+
+  Function.updateLayoutIndices();
+
   // New blocks will be added and layout will change,
   // so make a copy here to iterate over the original layout
   BinaryFunction::BasicBlockOrderType BlockLayout = Function.getLayout();
+  bool ModifiedFunction = false;
   for (BinaryBasicBlock *BB : BlockLayout) {
-    if (BB->succ_size() == 1 &&
-        BB->getSuccessor()->getLayoutIndex() != BB->getLayoutIndex() + 1)
-      UnconditionalBranchDynamicCount += BB->getExecutionCount();
-    if (BB->succ_size() == 2 &&
-        BB->getFallthrough()->getLayoutIndex() != BB->getLayoutIndex() + 1)
-      UnconditionalBranchDynamicCount += BB->getFallthroughBranchInfo().Count;
-    AllBlocksDynamicCount += BB->getExecutionCount();
-
-    // The block must be hot
-    if (BB->getExecutionCount() == 0)
-      continue;
-    // with one successor
-    if (BB->succ_size() != 1)
-      continue;
+    AllDynamicCount += BB->getKnownExecutionCount();

-    // no jump table
-    if (BB->hasJumpTable())
+    // The block must have a single successor
+    if (BB->succ_size() != 1)
       continue;
-
-    // Skip not-in-layout, i.e. unreachable, blocks.
-    if (BB->getLayoutIndex() >= BlockLayout.size())
+    BinaryBasicBlock *Tail = BB->getSuccessor();
+    // Verify that the tail should be duplicated
+    if (!shouldDuplicate(BB, Tail))
       continue;
-    // and we are estimating that this sucessor is not already in the same cache
-    // line
-    BinaryBasicBlock *Succ = BB->getSuccessor();
-    if (isInCacheLine(*BB, *Succ))
-      continue;
     std::vector<BinaryBasicBlock *> BlocksToDuplicate;
-    if (opts::TailDuplicationAggressive)
-      BlocksToDuplicate = aggressiveCodeToDuplicate(*Succ);
-    else
-      BlocksToDuplicate = moderateCodeToDuplicate(*Succ);
+    if (opts::TailDuplicationMode == TailDuplication::TD_AGGRESSIVE) {
+      BlocksToDuplicate = aggressiveDuplicate(*BB, *Tail);
+    } else if (opts::TailDuplicationMode == TailDuplication::TD_MODERATE) {
+      BlocksToDuplicate = moderateDuplicate(*BB, *Tail);
+    } else if (opts::TailDuplicationMode == TailDuplication::TD_CACHE) {
+      BlocksToDuplicate = cacheDuplicate(Emitter.MCE.get(), Function, BB, Tail);
+    } else {
+      llvm_unreachable("unknown tail duplication mode");
+    }

-    if (BlocksToDuplicate.size() == 0)
-      continue;
-    PossibleDuplications++;
-    PossibleDuplicationsDynamicCount += BB->getExecutionCount();
-    std::vector<BinaryBasicBlock *> DuplicatedBlocks =
-        tailDuplicate(*BB, BlocksToDuplicate);
-    if (!opts::TailDuplicationConstCopyPropagation)
+    if (BlocksToDuplicate.empty())
       continue;
-    constantAndCopyPropagate(*BB, DuplicatedBlocks);
-    BinaryBasicBlock *FirstBB = BlocksToDuplicate[0];
-    if (FirstBB->pred_size() == 1) {
-      BinaryBasicBlock *PredBB = *FirstBB->pred_begin();
-      if (PredBB->succ_size() == 1)
-        constantAndCopyPropagate(*PredBB, BlocksToDuplicate);
+
+    // Apply the duplication
+    ModifiedFunction = true;
+    DuplicationsDynamicCount += BB->getExecutionCount();
+    auto DuplicatedBlocks = duplicateBlocks(*BB, BlocksToDuplicate);
+    for (BinaryBasicBlock *BB : DuplicatedBlocks) {
+      DuplicatedBlockCount++;
+      DuplicatedByteCount += BB->estimateSize(Emitter.MCE.get());
+    }
+
+    if (opts::TailDuplicationConstCopyPropagation) {
+      constantAndCopyPropagate(*BB, DuplicatedBlocks);
+      BinaryBasicBlock *FirstBB = BlocksToDuplicate[0];
+      if (FirstBB->pred_size() == 1) {
+        BinaryBasicBlock *PredBB = *FirstBB->pred_begin();
+        if (PredBB->succ_size() == 1)
+          constantAndCopyPropagate(*PredBB, BlocksToDuplicate);
+      }
     }
+
+    // Layout indices might be stale after duplication
+    Function.updateLayoutIndices();
   }
+  if (ModifiedFunction)
+    ModifiedFunctions++;
 }

 void TailDuplication::runOnFunctions(BinaryContext &BC) {
+  if (opts::TailDuplicationMode == TailDuplication::TD_NONE)
+    return;
+
   for (auto &It : BC.getBinaryFunctions()) {
     BinaryFunction &Function = It.second;
     if (!shouldOptimize(Function))
@@ -452,24 +644,23 @@
     runOnFunction(Function);
   }

-  outs() << "BOLT-INFO: tail duplication possible duplications: "
-         << PossibleDuplications << "\n";
-  outs() << "BOLT-INFO: tail duplication possible dynamic reductions: "
-         << PossibleDuplicationsDynamicCount << "\n";
-  outs() << "BOLT-INFO: tail duplication possible dynamic reductions to "
-            "unconditional branch execution : "
-         << format("%.1f", ((float)PossibleDuplicationsDynamicCount * 100.0f) /
-                               UnconditionalBranchDynamicCount)
-         << "%\n";
-  outs() << "BOLT-INFO: tail duplication possible dynamic reductions to all "
-            "blocks execution : "
-         << format("%.1f", ((float)PossibleDuplicationsDynamicCount * 100.0f) /
-                               AllBlocksDynamicCount)
-         << "%\n";
-  outs() << "BOLT-INFO: tail duplication static propagation deletions: "
-         << StaticInstructionDeletionCount << "\n";
-  outs() << "BOLT-INFO: tail duplication dynamic propagation deletions: "
-         << DynamicInstructionDeletionCount << "\n";
+  outs() << "BOLT-INFO: tail duplication"
+         << format(" modified %zu (%.2f%%) functions;", ModifiedFunctions,
+                   100.0 * ModifiedFunctions / BC.getBinaryFunctions().size())
+         << format(" duplicated %zu blocks (%zu bytes) responsible for",
+                   DuplicatedBlockCount, DuplicatedByteCount)
+         << format(" %zu dynamic executions (%.2f%% of all block executions)",
+                   DuplicationsDynamicCount,
+                   100.0 * DuplicationsDynamicCount / AllDynamicCount)
+         << "\n";
+
+  if (opts::TailDuplicationConstCopyPropagation) {
+    outs() << "BOLT-INFO: tail duplication "
+           << format("applied %zu static and %zu dynamic propagation deletions",
+                     StaticInstructionDeletionCount,
+                     DynamicInstructionDeletionCount)
+           << "\n";
+  }
 }

 } // end namespace bolt
diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -238,11 +238,6 @@
     cl::desc("verify the CFG after every pass"), cl::init(false), cl::Hidden,
     cl::ZeroOrMore, cl::cat(BoltOptCategory));

-static cl::opt<bool>
-TailDuplicationFlag("tail-duplication",
-  cl::desc("duplicate unconditional branches that cross a cache line"),
-  cl::ZeroOrMore, cl::ReallyHidden, cl::cat(BoltOptCategory));
-
 static cl::opt<bool>
 ThreeWayBranchFlag("three-way-branch",
   cl::desc("reorder three way branches"),
@@ -396,8 +391,7 @@

   Manager.registerPass(std::make_unique());

-  Manager.registerPass(std::make_unique<TailDuplication>(),
-                       opts::TailDuplicationFlag);
+  Manager.registerPass(std::make_unique<TailDuplication>());

   Manager.registerPass(std::make_unique<CMOVConversion>(),
                        opts::CMOVConversionFlag);
diff --git a/bolt/test/X86/tail-duplication-cache.s b/bolt/test/X86/tail-duplication-cache.s
new file mode 100644
--- /dev/null
+++ b/bolt/test/X86/tail-duplication-cache.s
@@ -0,0 +1,59 @@
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
+# RUN:   %s -o %t.o
+# RUN: link_fdata %s %t.o %t.fdata
+# RUN: link_fdata %s %t.o %t.fdata2 "FDATA2"
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe -data %t.fdata -reorder-blocks=none -print-finalized \
+# RUN:   -tail-duplication=cache -o %t.out | FileCheck %s
+# RUN: llvm-bolt %t.exe -data %t.fdata2 -reorder-blocks=none -print-finalized \
+# RUN:   -tail-duplication=cache -o %t.out2 | FileCheck --check-prefix="CHECK2" %s
+
+# A test where the tail is duplicated to eliminate an unconditional jump
+# FDATA: 1 main #.BB0_br# 1 main #.BB4# 0 100
+# FDATA: 1 main #.BB0_br# 1 main #.BB1# 0 100
+# FDATA: 1 main #.BB1_br# 1 main #.BB3# 0 50
+# FDATA: 1 main #.BB1_br# 1 main #.BB2# 0 50
+# FDATA: 1 main #.BB3_br# 1 main #.BB2# 0 50
+# CHECK: BOLT-INFO: tail duplication modified 1 ({{.*}}%) functions; duplicated 1 blocks (13 bytes) responsible for 50 dynamic executions ({{.*}}% of all block executions)
+# CHECK: BB Layout : .LBB00, .Ltmp0, .Ltmp1, .Ltmp2, .Ltmp3, .Ltmp4, .Ltmp5, .Ltail-dup0, .Ltmp6
+
+# A test where the tail is not duplicated due to the cache score
+# FDATA2: 1 main #.BB0_br# 1 main #.BB4# 0 100
+# FDATA2: 1 main #.BB0_br# 1 main #.BB1# 0 2
+# FDATA2: 1 main #.BB1_br# 1 main #.BB3# 0 1
+# FDATA2: 1 main #.BB1_br# 1 main #.BB2# 0 1
+# FDATA2: 1 main #.BB3_br# 1 main #.BB2# 0 1
+# CHECK2: BOLT-INFO: tail duplication modified 0 (0.00%) functions; duplicated 0 blocks (0 bytes) responsible for 0 dynamic executions (0.00% of all block executions)
+# CHECK2: BB Layout : .LBB00, .Ltmp0, .Ltmp1, .Ltmp2, .Ltmp3, .Ltmp4, .Ltmp5, .Ltmp6
+
+  .text
+  .globl main
+  .type main, %function
+  .size main, .Lend-main
+main:
+.BB0:
+  xor %eax, %eax
+  cmpl %eax, %ebx
+.BB0_br:
+  je .BB4
+.BB1:
+  inc %rax
+.BB1_br:
+  je .BB3
+.BB2:
+  inc %rax
+  inc %rax
+  inc %rax
+  inc %rax
+  retq
+.BB3:
+  inc %rax
+.BB3_br:
+  jmp .BB2
+.BB4:
+  retq
+# For relocations against .text
+  call exit
+.Lend:
diff --git a/bolt/test/X86/tail-duplication-cacheline.s b/bolt/test/X86/tail-duplication-cacheline.s
--- a/bolt/test/X86/tail-duplication-cacheline.s
+++ b/bolt/test/X86/tail-duplication-cacheline.s
@@ -9,7 +9,7 @@
 # RUN: llvm-strip --strip-unneeded %t.o
 # RUN: %clang %cflags -no-pie %t.o -o %t.exe -Wl,-q -nostdlib
 # RUN: llvm-bolt %t.exe -o %t.out -data %t.fdata -relocs \
-# RUN:   -tail-duplication=1 -tail-duplication-aggressive=1
+# RUN:   -tail-duplication=aggressive
 .globl _start
 _start:
     jmp d
diff --git a/bolt/test/X86/tail-duplication-complex.s b/bolt/test/X86/tail-duplication-complex.s
--- a/bolt/test/X86/tail-duplication-complex.s
+++ b/bolt/test/X86/tail-duplication-complex.s
@@ -6,14 +6,14 @@
 # RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
 # RUN: llvm-bolt %t.exe -data %t.fdata -print-finalized \
-# RUN:   -tail-duplication -tail-duplication-minimum-offset 1 -o %t.out | FileCheck %s
+# RUN:   -tail-duplication=moderate -tail-duplication-minimum-offset 1 -o %t.out | FileCheck %s

 # FDATA: 1 main f 1 main 19 0 10
 # FDATA: 1 main f 1 main 11 0 13
 # FDATA: 1 main 17 1 main 3c 0 10
 # FDATA: 1 main 39 1 main 3c 0 10
-# CHECK: tail duplication possible duplications: 1
+# CHECK: tail duplication modified 1 ({{.*}}%) functions; duplicated 1 blocks ({{.*}} bytes) responsible for {{.*}} dynamic executions ({{.*}} of all block executions)
 # CHECK: BB Layout : .LBB00, .Ltmp0, .Ltail-dup0, .Ltmp1, .Ltmp2

 # This is the C++ code fed to Clang
diff --git a/bolt/test/X86/tail-duplication-jt.s b/bolt/test/X86/tail-duplication-jt.s
--- a/bolt/test/X86/tail-duplication-jt.s
+++ b/bolt/test/X86/tail-duplication-jt.s
@@ -7,9 +7,9 @@
 # RUN:   %s -o %t.o
 # RUN: link_fdata %s %t.o %t.fdata
 # RUN: llvm-strip --strip-unneeded %t.o
-# RUN: %clangxx %cflags -no-pie %t.o -o %t.exe -Wl,-q
+# RUN: %clangxx %cflags -no-pie %t.o -o %t.exe -Wl,-q
 # RUN: llvm-bolt %t.exe -o %t.out -data %t.fdata -relocs \
-# RUN:   -tail-duplication=1 -tail-duplication-aggressive=1 \
+# RUN:   -tail-duplication=aggressive \
 # RUN:   -print-cfg | FileCheck %s
 # CHECK: Jump table {{.*}} for function a at {{.*}} with a total count of 3
 .globl main
diff --git a/bolt/test/X86/tail-duplication-pass.s b/bolt/test/X86/tail-duplication-pass.s
--- a/bolt/test/X86/tail-duplication-pass.s
+++ b/bolt/test/X86/tail-duplication-pass.s
@@ -5,11 +5,11 @@
 # RUN: link_fdata %s %t.o %t.fdata
 # RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
 # RUN: llvm-bolt %t.exe -data %t.fdata -reorder-blocks=ext-tsp -print-finalized \
-# RUN:   -tail-duplication -tail-duplication-minimum-offset 1 -o %t.out | FileCheck %s
+# RUN:   -tail-duplication=moderate -tail-duplication-minimum-offset=1 -o %t.out | FileCheck %s

 # FDATA: 1 main 2 1 main #.BB2# 0 10
 # FDATA: 1 main 4 1 main #.BB2# 0 20
-# CHECK: tail duplication possible duplications: 1
+# CHECK: BOLT-INFO: tail duplication modified 1 ({{.*}}%) functions; duplicated 1 blocks (1 bytes) responsible for {{.*}} dynamic executions ({{.*}}% of all block executions)
 # CHECK: BB Layout : .LBB00, .Ltail-dup0, .Ltmp0, .Ltmp1

 .text
diff --git a/bolt/test/X86/tail-duplication-prop-bug.s b/bolt/test/X86/tail-duplication-prop-bug.s
--- a/bolt/test/X86/tail-duplication-prop-bug.s
+++ b/bolt/test/X86/tail-duplication-prop-bug.s
@@ -6,7 +6,7 @@
 # RUN: llvm-strip --strip-unneeded %t.o
 # RUN: ld.lld %t.o -o %t.exe -q -nostdlib
 # RUN: llvm-bolt %t.exe -o %t.out -data %t.fdata -relocs \
-# RUN:   -tail-duplication=1 -tail-duplication-aggressive=1 \
+# RUN:   -tail-duplication=aggressive \
 # RUN:   -tail-duplication-const-copy-propagation=1

 .text
diff --git a/bolt/test/runtime/X86/tail-duplication-constant-prop.s b/bolt/test/runtime/X86/tail-duplication-constant-prop.s
--- a/bolt/test/runtime/X86/tail-duplication-constant-prop.s
+++ b/bolt/test/runtime/X86/tail-duplication-constant-prop.s
@@ -5,13 +5,13 @@
 # RUN: link_fdata %s %t.o %t.fdata
 # RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
 # RUN: llvm-bolt %t.exe -data %t.fdata -reorder-blocks=ext-tsp -print-finalized \
-# RUN:   -tail-duplication -tail-duplication-minimum-offset 1 -o %t.out | FileCheck %s
+# RUN:   -tail-duplication=moderate -tail-duplication-minimum-offset=1 -tail-duplication-const-copy-propagation=1 -o %t.out | FileCheck %s
 # RUN: %t.exe; echo $?
 # RUN: %t.out; echo $?

 # FDATA: 1 main 14 1 main #.BB2# 0 10
 # FDATA: 1 main 16 1 main #.BB2# 0 20
-# CHECK: tail duplication possible duplications: 1
+# CHECK: BOLT-INFO: tail duplication modified 1 ({{.*}}%) functions; duplicated 1 blocks ({{.*}} bytes) responsible for {{.*}} dynamic executions ({{.*}}% of all block executions)
 # CHECK: BB Layout : .LBB00, .Ltail-dup0, .Ltmp0, .Ltmp1
 # CHECK-NOT: mov $0x2, %rbx

@@ -31,7 +31,7 @@
   mov %rbx, %rax
   mov $0x5, %rbx
   add %rsi, %rax
-  jmp .BB4
+  retq
 .BB3:
   mov $0x9, %rbx
 .BB4:
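
Reviewer note (not part of the patch): below is a minimal standalone C++ sketch of the cache-score heuristic added by this change, assuming the defaults of the new options (-tail-duplication-max-cache-distance=256, -tail-duplication-cache-backward-weight=0.5). It only illustrates how the score decays with jump distance; it is not BOLT code and the constant names are hypothetical stand-ins for the options.

#include <cstdint>
#include <cstdio>

// Hypothetical stand-ins for the new command-line options.
static constexpr uint64_t MaxCacheDistance = 256;
static constexpr double BackwardWeight = 0.5;

// Score of a jump from block [SrcAddr, SrcAddr + SrcSize) to block
// [DstAddr, DstAddr + DstSize) taken Count times: Count scaled by a factor
// that is close to 1 for short forward jumps, decays linearly with distance,
// and is 0 beyond MaxCacheDistance; backward jumps are additionally weighted.
double cacheScore(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr,
                  uint64_t DstSize, uint64_t Count) {
  const bool IsForward = SrcAddr <= DstAddr;
  const uint64_t Distance = IsForward ? (DstAddr + DstSize) - SrcAddr
                                      : (SrcAddr + SrcSize) - DstAddr;
  if (Distance >= MaxCacheDistance)
    return 0.0;
  const double Prob = 1.0 - static_cast<double>(Distance) / MaxCacheDistance;
  return (IsForward ? 1.0 : BackwardWeight) * Prob * Count;
}

int main() {
  // A short forward jump keeps most of its weight.
  std::printf("%.2f\n", cacheScore(0, 16, 16, 16, 100));  // 87.50
  // A forward jump past the distance threshold contributes nothing.
  std::printf("%.2f\n", cacheScore(0, 16, 512, 16, 100)); // 0.00
  // A backward jump of moderate length is further scaled by the weight.
  std::printf("%.2f\n", cacheScore(64, 16, 0, 16, 100));  // 34.38
  return 0;
}

The "cache" strategy duplicates a tail only when the total score over all branches, recomputed with the duplicated block placed right after its predecessor, exceeds the score of the current layout (see cacheScoreImproved above).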