diff --git a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp --- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp +++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp @@ -67,6 +67,7 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include +#include <limits> #include #include @@ -96,6 +97,10 @@ cl::desc("Name for the section containing cold functions " "extracted by hot-cold splitting.")); +static cl::opt<int> MaxParametersForSplit( + "hotcoldsplit-max-params", cl::init(4), cl::Hidden, + cl::desc("Maximum number of parameters for a split function")); + namespace { // Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify // this function unless you modify the MBB version as well. @@ -257,18 +262,6 @@ if (SplittingThreshold <= 0) return Penalty; - // The typical code size cost for materializing an argument for the outlined - // call. - LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumInputs << " inputs\n"); - const int CostForArgMaterialization = TargetTransformInfo::TCC_Basic; - Penalty += CostForArgMaterialization * NumInputs; - - // The typical code size cost for an output alloca, its associated store, and - // its associated reload. - LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumOutputs << " outputs\n"); - const int CostForRegionOutput = 3 * TargetTransformInfo::TCC_Basic; - Penalty += CostForRegionOutput * NumOutputs; - // Find the number of distinct exit blocks for the region. Use a conservative // check to determine whether control returns from the region. bool NoBlocksReturn = true; @@ -289,6 +282,48 @@ } } + // Count the number of phis in exit blocks with >= 2 incoming values from the + // outlining region. These phis are split (\ref severSplitPHINodesOfExits), + // and new outputs are created to supply the split phis.
CodeExtractor can't + // report these new outputs until extraction begins, but it's important to + // factor the cost of the outputs into the cost calculation. + unsigned NumSplitExitPhis = 0; + for (BasicBlock *ExitBB : SuccsOutsideRegion) { + for (PHINode &PN : ExitBB->phis()) { + // Find all incoming values from the outlining region. + int NumIncomingVals = 0; + for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i) + if (find(Region, PN.getIncomingBlock(i)) != Region.end()) { + ++NumIncomingVals; + if (NumIncomingVals > 1) { + ++NumSplitExitPhis; + break; + } + } + } + } + + // Apply a penalty for calling the split function. Factor in the cost of + // materializing all of the parameters. + int NumOutputsAndSplitPhis = NumOutputs + NumSplitExitPhis; + int NumParams = NumInputs + NumOutputsAndSplitPhis; + if (NumParams > MaxParametersForSplit) { + LLVM_DEBUG(dbgs() << NumInputs << " inputs and " << NumOutputsAndSplitPhis + << " outputs exceeds parameter limit (" + << MaxParametersForSplit << ")\n"); + return std::numeric_limits<int>::max(); + } + const int CostForArgMaterialization = 2 * TargetTransformInfo::TCC_Basic; + LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumParams << " params\n"); + Penalty += CostForArgMaterialization * NumParams; + + // Apply the typical code size cost for an output alloca and its associated + // reload in the caller. Also penalize the associated store in the callee. + LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumOutputsAndSplitPhis + << " outputs/split phis\n"); + const int CostForRegionOutput = 3 * TargetTransformInfo::TCC_Basic; + Penalty += CostForRegionOutput * NumOutputsAndSplitPhis; + // Apply a `noreturn` bonus. if (NoBlocksReturn) { LLVM_DEBUG(dbgs() << "Applying bonus for: " << Region.size() @@ -298,7 +333,7 @@ // Apply a penalty for having more than one successor outside of the region. // This penalty accounts for the switch needed in the caller.
- if (!SuccsOutsideRegion.empty()) { + if (SuccsOutsideRegion.size() > 1) { LLVM_DEBUG(dbgs() << "Applying penalty for: " << SuccsOutsideRegion.size() << " non-region successors\n"); Penalty += (SuccsOutsideRegion.size() - 1) * TargetTransformInfo::TCC_Basic; diff --git a/llvm/test/Transforms/CodeExtractor/extract-assume.ll b/llvm/test/Transforms/CodeExtractor/extract-assume.ll --- a/llvm/test/Transforms/CodeExtractor/extract-assume.ll +++ b/llvm/test/Transforms/CodeExtractor/extract-assume.ll @@ -1,4 +1,4 @@ -; RUN: opt -passes="function(slp-vectorizer),module(hotcoldsplit),function(slp-vectorizer,print)" -disable-output %s 2>&1 | FileCheck %s +; RUN: opt -passes="function(slp-vectorizer),module(hotcoldsplit),function(slp-vectorizer,print)" -hotcoldsplit-threshold=-1 -disable-output %s 2>&1 | FileCheck %s ; ; Make sure this compiles. Check that function assumption cache is refreshed ; after extracting blocks with assume calls from the function. diff --git a/llvm/test/Transforms/HotColdSplit/apply-penalty-for-inputs.ll b/llvm/test/Transforms/HotColdSplit/apply-penalty-for-inputs.ll --- a/llvm/test/Transforms/HotColdSplit/apply-penalty-for-inputs.ll +++ b/llvm/test/Transforms/HotColdSplit/apply-penalty-for-inputs.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s +; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -hotcoldsplit-threshold=2 -hotcoldsplit-max-params=2 -S < %s -o /dev/null 2>&1 | FileCheck %s declare void @sink(i32*, i32, i32) cold @@ -10,10 +10,27 @@ br i1 undef, label %cold, label %exit cold: - ; CHECK: Applying penalty for: 2 inputs + ; CHECK: Applying penalty for splitting: 2 + ; CHECK-NEXT: Applying penalty for: 2 params + ; CHECK-NEXT: Applying penalty for: 0 outputs/split phis + ; CHECK-NEXT: penalty = 6 call void @sink(i32* @g, i32 %arg, i32 %local) ret void exit: ret void } + +define void @bar(i32* %p1, i32 %p2, i32 %p3) { + br i1 undef, label %cold, label %exit + 
+cold: + ; CHECK: Applying penalty for splitting: 2 + ; CHECK-NEXT: 3 inputs and 0 outputs exceeds parameter limit (2) + ; CHECK-NEXT: penalty = 2147483647 + call void @sink(i32* %p1, i32 %p2, i32 %p3) + ret void + +exit: + ret void +} diff --git a/llvm/test/Transforms/HotColdSplit/apply-penalty-for-outputs.ll b/llvm/test/Transforms/HotColdSplit/apply-penalty-for-outputs.ll --- a/llvm/test/Transforms/HotColdSplit/apply-penalty-for-outputs.ll +++ b/llvm/test/Transforms/HotColdSplit/apply-penalty-for-outputs.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s +; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -hotcoldsplit-threshold=2 -S < %s -o /dev/null 2>&1 | FileCheck %s declare void @sink() cold @@ -10,8 +10,10 @@ br i1 undef, label %cold, label %exit cold: - ; CHECK: Applying penalty for: 1 output - ; CHECK: Applying penalty for: 1 non-region successors + ; CHECK: Applying penalty for splitting: 2 + ; CHECK-NEXT: Applying penalty for: 1 params + ; CHECK-NEXT: Applying penalty for: 1 outputs/split phis + ; CHECK-NEXT: penalty = 7 %local = load i32, i32* @g call void @sink() br label %exit diff --git a/llvm/test/Transforms/HotColdSplit/apply-successor-penalty.ll b/llvm/test/Transforms/HotColdSplit/apply-successor-penalty.ll --- a/llvm/test/Transforms/HotColdSplit/apply-successor-penalty.ll +++ b/llvm/test/Transforms/HotColdSplit/apply-successor-penalty.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s +; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -hotcoldsplit-threshold=2 -S < %s -o /dev/null 2>&1 | FileCheck %s declare void @sink() cold @@ -9,7 +9,10 @@ br i1 undef, label %cold1, label %exit cold1: - ; CHECK: Applying penalty for: 1 non-region successor + ; CHECK: Applying penalty for splitting: 2 + ; CHECK-NEXT: Applying penalty for: 0 params + ; CHECK-NEXT: Applying penalty for: 0 outputs/split 
phis + ; CHECK-NEXT: penalty = 2 call void @sink() br i1 undef, label %cold2, label %cold3 @@ -32,7 +35,11 @@ br i1 undef, label %cold1, label %exit1 cold1: - ; CHECK: Applying penalty for: 2 non-region successors + ; CHECK: Applying penalty for splitting: 2 + ; CHECK-NEXT: Applying penalty for: 0 params + ; CHECK-NEXT: Applying penalty for: 0 outputs/split phis + ; CHECK-NEXT: Applying penalty for: 2 non-region successors + ; CHECK-NEXT: penalty = 3 call void @sink() br i1 undef, label %cold2, label %cold3 diff --git a/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll b/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll --- a/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll +++ b/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: opt -S -instsimplify -hotcoldsplit -debug < %s 2>&1 | FileCheck %s +; RUN: opt -S -instsimplify -hotcoldsplit -hotcoldsplit-threshold=-1 -debug < %s 2>&1 | FileCheck %s ; RUN: opt -instcombine -hotcoldsplit -instsimplify %s -o /dev/null target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" @@ -13,7 +13,10 @@ ; CHECK-NOT: @llvm.assume ; CHECK: } ; CHECK: declare {{.*}}@llvm.assume -; CHECK: define {{.*}}@f.cold.1(i64 %0) +; CHECK: define {{.*}}@f.cold.1() +; CHECK-LABEL: newFuncRoot: +; CHECK: } +; CHECK: define {{.*}}@f.cold.2(i64 %0) ; CHECK-LABEL: newFuncRoot: ; CHECK: %1 = icmp eq i64 %0, 0 ; CHECK-NOT: call void @llvm.assume