diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -39,6 +39,7 @@
 typedef unsigned ID;
 }
 
+class AllocaInst;
 class AssumptionCache;
 class BlockFrequencyInfo;
 class DominatorTree;
@@ -344,6 +345,10 @@
   /// \returns A value to be added to the inlining threshold.
   unsigned adjustInliningThreshold(const CallBase *CB) const;
 
+  /// \returns The cost of having an Alloca in the caller if not inlined, to be
+  /// added to the threshold.
+  unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const;
+
   /// \returns Vector bonus in percent.
   ///
   /// Vector bonuses: We want to more aggressively inline vector-dense kernels
@@ -1671,9 +1676,11 @@
   getPointersChainCost(ArrayRef<const Value *> Ptrs, const Value *Base,
                        const TTI::PointersChainInfo &Info,
                        TTI::TargetCostKind CostKind) = 0;
-  virtual unsigned getInliningThresholdMultiplier() = 0;
+  virtual unsigned getInliningThresholdMultiplier() const = 0;
   virtual unsigned adjustInliningThreshold(const CallBase *CB) = 0;
-  virtual int getInlinerVectorBonusPercent() = 0;
+  virtual int getInlinerVectorBonusPercent() const = 0;
+  virtual unsigned getCallerAllocaCost(const CallBase *CB,
+                                       const AllocaInst *AI) const = 0;
   virtual InstructionCost getMemcpyCost(const Instruction *I) = 0;
   virtual unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
                                                     unsigned &JTSize,
@@ -2033,15 +2040,19 @@
                        TargetCostKind CostKind) override {
     return Impl.getPointersChainCost(Ptrs, Base, Info, CostKind);
   }
-  unsigned getInliningThresholdMultiplier() override {
+  unsigned getInliningThresholdMultiplier() const override {
     return Impl.getInliningThresholdMultiplier();
   }
   unsigned adjustInliningThreshold(const CallBase *CB) override {
     return Impl.adjustInliningThreshold(CB);
-  }
-  int getInlinerVectorBonusPercent() override {
+  }
+  int getInlinerVectorBonusPercent() const override {
     return Impl.getInlinerVectorBonusPercent();
   }
+  unsigned getCallerAllocaCost(const CallBase *CB,
+                               const AllocaInst *AI) const override {
+    return Impl.getCallerAllocaCost(CB, AI);
+  }
   InstructionCost getMemcpyCost(const Instruction *I) override {
     return Impl.getMemcpyCost(I);
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -70,6 +70,9 @@
   unsigned getInliningThresholdMultiplier() const { return 1; }
   unsigned adjustInliningThreshold(const CallBase *CB) const { return 0; }
+  unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const {
+    return 0;
+  }
 
   int getInlinerVectorBonusPercent() const { return 150; }
 
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -532,10 +532,13 @@
     return TargetTransformInfo::TCC_Expensive;
   }
 
-  unsigned getInliningThresholdMultiplier() { return 1; }
+  unsigned getInliningThresholdMultiplier() const { return 1; }
   unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
+  unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const {
+    return 0;
+  }
 
-  int getInlinerVectorBonusPercent() { return 150; }
+  int getInlinerVectorBonusPercent() const { return 150; }
 
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP,
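Note: the new hook follows the usual three-layer TTI pattern: a public wrapper
on TargetTransformInfo, a pure-virtual Concept/Model pair, and zero-returning
defaults in TargetTransformInfoImpl.h and BasicTTIImpl.h, so targets that do
not override it are unaffected. A minimal sketch of how a target could opt in
(MyTTIImpl is a hypothetical target implementation, not part of this patch):

    // Hypothetical override: credit each caller alloca passed to the callee
    // with a cost proportional to its size, so the inliner's SROA savings
    // grow with the alloca. DL is the usual cached DataLayout member.
    unsigned MyTTIImpl::getCallerAllocaCost(const CallBase *CB,
                                            const AllocaInst *AI) const {
      return DL.getTypeAllocSize(AI->getAllocatedType());
    }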
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -717,7 +717,8 @@
   void onInitializeSROAArg(AllocaInst *Arg) override {
     assert(Arg != nullptr &&
            "Should not initialize SROA costs for null value.");
-    SROAArgCosts[Arg] = 0;
+    SROACostSavings += SROAArgCosts[Arg] =
+        TTI.getCallerAllocaCost(&CandidateCall, Arg);
   }
 
   void onAggregateSROAUse(AllocaInst *SROAArg) override {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -217,6 +217,11 @@
   return TTIImpl->adjustInliningThreshold(CB);
 }
 
+unsigned TargetTransformInfo::getCallerAllocaCost(const CallBase *CB,
+                                                  const AllocaInst *AI) const {
+  return TTIImpl->getCallerAllocaCost(CB, AI);
+}
+
 int TargetTransformInfo::getInlinerVectorBonusPercent() const {
   return TTIImpl->getInlinerVectorBonusPercent();
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -230,10 +230,11 @@
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const;
 
-  unsigned getInliningThresholdMultiplier() { return 11; }
+  unsigned getInliningThresholdMultiplier() const { return 11; }
   unsigned adjustInliningThreshold(const CallBase *CB) const;
+  unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const;
 
-  int getInlinerVectorBonusPercent() { return 0; }
+  int getInlinerVectorBonusPercent() const { return 0; }
 
   InstructionCost getArithmeticReductionCost(
       unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1206,11 +1206,11 @@
   return adjustThreshold;
 }
 
-unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
+static size_t getTotalAllocaSize(const CallBase *CB, const DataLayout &DL) {
   // If we have a pointer to private array passed into a function
   // it will not be optimized out, leaving scratch usage.
   // Increase the inline threshold to allow inlining in this case.
-  unsigned adjustThreshold = 0;
+  // The bigger the private array, the bigger the bonus.
   uint64_t AllocaSize = 0;
   SmallPtrSet<const AllocaInst *, 8> AIVisited;
   for (Value *PtrArg : CB->args()) {
@@ -1224,18 +1224,40 @@
       if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
         continue;
       AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
-      // If the amount of stack memory is excessive we will not be able
-      // to get rid of the scratch anyway, bail out.
-      if (AllocaSize > ArgAllocaCutoff) {
-        AllocaSize = 0;
-        break;
-      }
     }
   }
-  adjustThreshold +=
-      adjustInliningThresholdUsingCallee(CB, TLI, this);
-  adjustThreshold += AllocaSize ? ArgAllocaCost : AllocaSize;
-  return adjustThreshold;
+  return AllocaSize;
+}
+
+unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
+  unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);
+  size_t AllocaSize = getTotalAllocaSize(CB, DL);
+  if (AllocaSize > 0)
+    Threshold += ArgAllocaCost;
+  return Threshold;
+}
+
+unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
+                                         const AllocaInst *AI) const {
+  auto AllocaSize = getTotalAllocaSize(CB, DL);
+  if (AllocaSize <= ArgAllocaCutoff) {
+    // The inline threshold bonus has already been given by
+    // adjustInliningThreshold.
+    return 0;
+  }
+
+  // Give an inline threshold bonus depending on the size of the alloca that
+  // is being optimized by SROA. The bigger the array, the better the chance
+  // we have of avoiding scratch memory.
+  auto ArgAllocaSize = DL.getTypeAllocSize(AI->getAllocatedType());
+  unsigned ThresholdBonus =
+      (ArgAllocaCost * getInliningThresholdMultiplier() * ArgAllocaSize) /
+      AllocaSize;
+
+  // Awkwardly, this bonus gets multiplied by the single-BB bonus and the
+  // vector bonus.
+  return ThresholdBonus;
 }
 
 void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
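Note: in GCNTTIImpl::getCallerAllocaCost above, each alloca earns its
proportional share of ArgAllocaCost, scaled by the inlining threshold
multiplier. A self-contained worked example, assuming the options' current
defaults (-amdgpu-inline-arg-alloca-cost=4000, GCN multiplier 11); the helper
name is illustrative, not part of the patch:

    #include <cstdint>

    // Bonus for one alloca of ArgAllocaSize bytes when the call passes
    // TotalAllocaSize bytes of static allocas in total (above the cutoff).
    uint64_t callerAllocaBonus(uint64_t ArgAllocaSize,
                               uint64_t TotalAllocaSize) {
      return (4000 * 11 * ArgAllocaSize) / TotalAllocaSize;
    }
    // callerAllocaBonus(132, 260) == 22338 and
    // callerAllocaBonus(128, 260) == 21661, matching the SROACostSavings and
    // SROACostSavingsLost values in the new test below.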
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -92,7 +92,7 @@
 
   // Increase the inlining cost threshold by a factor of 11, reflecting that
   // calls are particularly expensive in NVPTX.
-  unsigned getInliningThresholdMultiplier() { return 11; }
+  unsigned getInliningThresholdMultiplier() const { return 11; }
 
   InstructionCost getArithmeticInstrCost(
       unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -36,7 +36,7 @@
   /// \name Scalar TTI Implementations
   /// @{
 
-  unsigned getInliningThresholdMultiplier() { return 3; }
+  unsigned getInliningThresholdMultiplier() const { return 3; }
   unsigned adjustInliningThreshold(const CallBase *CB) const;
 
   InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
@@ -30,6 +30,8 @@
 
 define coldcc void @foo_private_ptr2(ptr addrspace(5) nocapture %p1, ptr addrspace(5) nocapture %p2) {
 entry:
+  call void @forbid_sroa(ptr addrspace(5) %p1)
+  call void @forbid_sroa(ptr addrspace(5) %p2)
   %tmp1 = load float, ptr addrspace(5) %p1, align 4
   %cmp = fcmp ogt float %tmp1, 1.000000e+00
   br i1 %cmp, label %if.then, label %if.end
@@ -171,6 +173,7 @@
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare float @_Z3sinf(float) #1
+declare void @forbid_sroa(ptr addrspace(5) nocapture %p)
 
 attributes #0 = { noinline }
 attributes #1 = { nounwind readnone }
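Note: the @forbid_sroa calls added to amdgpu-inline.ll preserve that test's
original costs: passing the private pointers to a declared-but-undefined
function makes them escape, so the inline-cost SROA analysis stops treating
the corresponding allocas as promotable and the new per-alloca bonus does not
apply. A simplified sketch of the onDisableSROA bookkeeping this relies on
(member names abbreviated from the actual InlineCost.cpp handler):

    // Once a tracked pointer escapes, its accumulated savings move from
    // SROACostSavings to SROACostSavingsLost and the alloca is dropped.
    void disableSROAForArg(AllocaInst *AI) {
      SROACostSavings -= SROAArgCosts[AI];
      SROACostSavingsLost += SROAArgCosts[AI];
      SROAArgCosts.erase(AI);
    }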
diff --git a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument-cost.ll b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument-cost.ll
--- a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument-cost.ll
+++ b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument-cost.ll
@@ -1,21 +1,94 @@
-; RUN: opt -mtriple=amdgcn--amdhsa -S -passes=inline -inline-threshold=0 -debug-only=inline-cost < %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple=amdgcn--amdhsa -S -passes=inline -inline-threshold=0 -debug-only=inline-cost %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 target datalayout = "A5"
 
 ; Verify we are properly adding cost of the -amdgpu-inline-arg-alloca-cost to the threshold.
+; Awkwardly, the adjusted threshold is multiplied by the single-basic-block bonus.
+define void @local_access_only(ptr addrspace(5) %p, i32 %idx) {
+  %arrayidx = getelementptr inbounds [64 x float], ptr addrspace(5) %p, i32 0, i32 %idx
+  %value = load float, ptr addrspace(5) %arrayidx
+  store float %value, ptr addrspace(5) %arrayidx, align 4
+  ret void
+}
+
+; Below the cutoff, the alloca cost is 0 and only the cost of the instructions saved by SROA is counted.
+; CHECK: Analyzing call of local_access_only... (caller:test_inliner_sroa_single_below_cutoff)
+; CHECK: NumAllocaArgs: 1
+; CHECK: SROACostSavings: 10
+; CHECK: SROACostSavingsLost: 0
+; CHECK: Threshold: 66000
+define amdgpu_kernel void @test_inliner_sroa_single_below_cutoff(ptr addrspace(1) %a, i32 %n) {
+entry:
+  %pvt_arr = alloca [64 x float], align 4, addrspace(5)
+  call void @local_access_only(ptr addrspace(5) %pvt_arr, i32 4)
+  ret void
+}
+
+; Above the cutoff, a cost is attributed to the alloca.
+; CHECK: Analyzing call of local_access_only... (caller:test_inliner_sroa_single_above_cutoff)
 ; CHECK: NumAllocaArgs: 1
+; CHECK: SROACostSavings: 44010
+; CHECK: SROACostSavingsLost: 0
+; CHECK: Threshold: 66000
+define amdgpu_kernel void @test_inliner_sroa_single_above_cutoff(ptr addrspace(1) %a, i32 %n) {
+entry:
+  %pvt_arr = alloca [65 x float], align 4, addrspace(5)
+  call void @local_access_only(ptr addrspace(5) %pvt_arr, i32 4)
+  ret void
+}
+
+define void @use_first_externally(ptr addrspace(5) %p1, ptr addrspace(5) %p2) {
+  call void @external(ptr addrspace(5) %p1)
+  %arrayidx = getelementptr inbounds [64 x float], ptr addrspace(5) %p2, i32 0, i32 7
+  %value = load float, ptr addrspace(5) %arrayidx
+  store float %value, ptr addrspace(5) %arrayidx, align 4
+  ret void
+}
+
+define void @use_both_externally(ptr addrspace(5) %p1, ptr addrspace(5) %p2) {
+  call void @external(ptr addrspace(5) %p1)
+  call void @external(ptr addrspace(5) %p2)
+  ret void
+}
+
+; CHECK: Analyzing call of use_first_externally... (caller:test_inliner_sroa_double)
+; CHECK: NumAllocaArgs: 2
+; CHECK: SROACostSavings: 21671
+; CHECK: SROACostSavingsLost: 22338
 ; CHECK: Threshold: 66000
+define amdgpu_kernel void @test_inliner_sroa_double() {
+entry:
+  %pvt_arr1 = alloca [33 x float], align 4, addrspace(5)
+  %pvt_arr2 = alloca [32 x float], align 4, addrspace(5)
+  call void @use_first_externally(ptr addrspace(5) %pvt_arr1, ptr addrspace(5) %pvt_arr2)
+  ret void
+}
 
-define void @use_private_ptr_arg(ptr addrspace(5) nocapture %p) {
+; CHECK: Analyzing call of use_both_externally... (caller:test_inliner_no_sroa)
+; CHECK: NumAllocaArgs: 2
+; CHECK: SROACostSavings: 0
+; CHECK: SROACostSavingsLost: 43999
+; CHECK: Threshold: 66000
+define amdgpu_kernel void @test_inliner_no_sroa() {
+entry:
+  %pvt_arr1 = alloca [33 x float], align 4, addrspace(5)
+  %pvt_arr2 = alloca [32 x float], align 4, addrspace(5)
+  call void @use_both_externally(ptr addrspace(5) %pvt_arr1, ptr addrspace(5) %pvt_arr2)
   ret void
 }
 
-define amdgpu_kernel void @test_inliner_pvt_ptr(ptr addrspace(1) nocapture %a, i32 %n) {
+; CHECK: Analyzing call of use_both_externally... (caller:test_inliner_no_alloc)
+; CHECK: NumAllocaArgs: 0
+; CHECK: SROACostSavings: 0
+; CHECK: SROACostSavingsLost: 0
+; CHECK: Threshold: 0
+define amdgpu_kernel void @test_inliner_no_alloc(ptr addrspace(5) %a, ptr addrspace(5) %b) {
 entry:
-  %pvt_arr = alloca [64 x float], align 4, addrspace(5)
-  call void @use_private_ptr_arg(ptr addrspace(5) %pvt_arr)
+  call void @use_both_externally(ptr addrspace(5) %a, ptr addrspace(5) %b)
   ret void
 }
+
+declare void @external(ptr addrspace(5) %p)
\ No newline at end of file
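Note: the CHECK values in this test can be reproduced by hand, assuming the
defaults -amdgpu-inline-arg-alloca-cost=4000 and
-amdgpu-inline-arg-alloca-cutoff=256, the GCN threshold multiplier of 11, the
inliner's 50% single-basic-block bonus, and the default instruction cost of 5
(the constants below restate those assumptions; they are not computed by the
patch in this form):

    // Threshold: (-inline-threshold=0 + ArgAllocaCost) * 11, plus the 50%
    // single-basic-block bonus, as the test comment notes.
    unsigned Threshold = (0 + 4000) * 11 * 3 / 2;              // 66000
    // Above the cutoff, a lone 260-byte alloca ([65 x float]) takes the whole
    // bonus, plus two simplified instructions (load and store) at cost 5 each.
    unsigned SavingsSingle = (4000 * 11 * 260) / 260 + 2 * 5;  // 44010
    // When both arrays (132 and 128 bytes) escape, all savings are lost:
    unsigned LostBoth = (4000 * 11 * 132) / 260 +              // 22338
                        (4000 * 11 * 128) / 260;               // + 21661 = 43999

The no-alloca kernel gets Threshold: 0 because adjustInliningThreshold adds
nothing, and a 50% bonus on zero is still zero.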