Index: llvm/lib/Transforms/IPO/HotColdSplitting.cpp =================================================================== --- llvm/lib/Transforms/IPO/HotColdSplitting.cpp +++ llvm/lib/Transforms/IPO/HotColdSplitting.cpp @@ -66,10 +66,10 @@ static cl::opt EnableStaticAnalyis("hot-cold-static-analysis", cl::init(true), cl::Hidden); -static cl::opt MinOutliningInstCount( - "min-outlining-inst-count", cl::init(3), cl::Hidden, - cl::desc("Minimum number of instructions needed for a single-block region " - "to be an outlining candidate")); +static cl::opt + MinOutliningThreshold("min-outlining-thresh", cl::init(3), cl::Hidden, + cl::desc("Code size threshold for outlining within a " + "single BB (as a multiple of TCC_Basic)")); namespace { @@ -135,14 +135,18 @@ return !BB.hasAddressTaken(); } -/// Check whether \p BB has at least \p Min non-debug, non-terminator -/// instructions. -static bool hasMinimumInstCount(const BasicBlock &BB, unsigned Min) { - unsigned Count = 0; +/// Check whether \p BB is profitable to outline (i.e. its code size cost meets +/// the threshold set in \p MinOutliningThreshold). +static bool isProfitableToOutline(const BasicBlock &BB, + TargetTransformInfo &TTI) { + int Cost = 0; for (const Instruction &I : BB) { if (isa(&I) || &I == BB.getTerminator()) continue; - if (++Count >= Min) + + Cost += TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize); + + if (Cost >= (MinOutliningThreshold * TargetTransformInfo::TCC_Basic)) return true; } return false; @@ -156,8 +160,10 @@ /// /// Return an empty sequence if the cold region is too small to outline, or if /// the cold region has no warm predecessors. -static BlockSequence -findMaximalColdRegion(BasicBlock &SinkBB, DominatorTree &DT, PostDomTree &PDT) { +static BlockSequence findMaximalColdRegion(BasicBlock &SinkBB, + TargetTransformInfo &TTI, + DominatorTree &DT, + PostDomTree &PDT) { // The maximal cold region. BlockSequence ColdRegion = {}; @@ -241,8 +247,7 @@ ++SuccIt; } - if (ColdRegion.size() == 1 && - !hasMinimumInstCount(*ColdRegion[0], MinOutliningInstCount)) + if (ColdRegion.size() == 1 && !isProfitableToOutline(*ColdRegion[0], TTI)) return {}; return ColdRegion; @@ -251,6 +256,7 @@ /// Get the largest cold region in \p F. static BlockSequence getLargestColdRegion(Function &F, ProfileSummaryInfo &PSI, BlockFrequencyInfo *BFI, + TargetTransformInfo &TTI, DominatorTree &DT, PostDomTree &PDT) { // Keep track of the largest cold region. BlockSequence LargestColdRegion = {}; @@ -270,7 +276,7 @@ }); // Find a maximal cold region we can outline. - BlockSequence ColdRegion = findMaximalColdRegion(BB, DT, PDT); + BlockSequence ColdRegion = findMaximalColdRegion(BB, TTI, DT, PDT); if (ColdRegion.empty()) { LLVM_DEBUG(dbgs() << " Skipping (block not profitable to extract)\n"); continue; @@ -305,7 +311,7 @@ private: bool shouldOutlineFrom(const Function &F) const; Function *extractColdRegion(const BlockSequence &Region, DominatorTree &DT, - BlockFrequencyInfo *BFI, + BlockFrequencyInfo *BFI, TargetTransformInfo &TTI, OptimizationRemarkEmitter &ORE, unsigned Count); SmallPtrSet OutlinedFunctions; ProfileSummaryInfo *PSI; @@ -365,6 +371,7 @@ Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region, DominatorTree &DT, BlockFrequencyInfo *BFI, + TargetTransformInfo &TTI, OptimizationRemarkEmitter &ORE, unsigned Count) { assert(!Region.empty()); @@ -393,7 +400,7 @@ CallInst *CI = cast(U); CallSite CS(CI); NumColdRegionsOutlined++; - if (GetTTI(*OutF).useColdCCForColdCall(*OutF)) { + if (TTI.useColdCCForColdCall(*OutF)) { OutF->setCallingConv(CallingConv::Cold); CS.setCallingConv(CallingConv::Cold); } @@ -437,14 +444,15 @@ PostDomTree PDT(F); PDT.recalculate(F); BlockFrequencyInfo *BFI = GetBFI(F); + TargetTransformInfo &TTI = GetTTI(F); - BlockSequence ColdRegion = getLargestColdRegion(F, *PSI, BFI, DT, PDT); + BlockSequence ColdRegion = getLargestColdRegion(F, *PSI, BFI, TTI, DT, PDT); if (ColdRegion.empty()) continue; OptimizationRemarkEmitter &ORE = (*GetORE)(F); Function *Outlined = - extractColdRegion(ColdRegion, DT, BFI, ORE, /*Count=*/1); + extractColdRegion(ColdRegion, DT, BFI, TTI, ORE, /*Count=*/1); if (Outlined) { OutlinedFunctions.insert(Outlined); Changed = true; Index: llvm/test/Transforms/HotColdSplit/outline-expensive.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/HotColdSplit/outline-expensive.ll @@ -0,0 +1,25 @@ +; The magic number 6 comes from (1 * TCC_Expensive) + (1 * CostOfCallX86). +; RUN: opt -hotcoldsplit -min-outlining-thresh=6 -S < %s | FileCheck %s + +; Test that we outline even though there are only two cold instructions. TTI +; should determine that they are expensive in terms of code size. + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.14.0" + +; CHECK-LABEL: @fun +; CHECK: call void @fun.cold.1 +define void @fun(i32 %x) { +entry: + br i1 undef, label %if.then, label %if.else + +if.then: + ret void + +if.else: + %y = sdiv i32 %x, 111 + call void @sink(i32 %y) + ret void +} + +declare void @sink(i32 %x) cold