diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -71,6 +71,7 @@
   bool IsGraphics;
   bool HasFP32Denormals;
   bool HasFP64FP16Denormals;
+  static constexpr bool InlinerVectorBonusPercent = 0;
 
   static const FeatureBitset InlineFeatureIgnoreList;
 
@@ -240,8 +241,9 @@
   unsigned getInliningThresholdMultiplier() const { return 11; }
   unsigned adjustInliningThreshold(const CallBase *CB) const;
+  unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const;
 
-  int getInlinerVectorBonusPercent() const { return 0; }
+  int getInlinerVectorBonusPercent() const { return InlinerVectorBonusPercent; }
 
   InstructionCost getArithmeticReductionCost(
       unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1219,36 +1219,83 @@
   return adjustThreshold;
 }
 
-unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
-  // If we have a pointer to private array passed into a function
+static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
+                                           const DataLayout &DL) {
+  // If we have a pointer to a private array passed into a function
   // it will not be optimized out, leaving scratch usage.
-  // Increase the inline threshold to allow inlining in this case.
-  unsigned adjustThreshold = 0;
-  uint64_t AllocaSize = 0;
+  // This function calculates the total size in bytes of the memory that
+  // would end up in scratch if the call were not inlined.
+  unsigned AllocaSize = 0;
   SmallPtrSet<const AllocaInst *, 8> AIVisited;
   for (Value *PtrArg : CB->args()) {
     PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
-    if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
-                Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
+    if (!Ty)
       continue;
 
-    PtrArg = getUnderlyingObject(PtrArg);
-    if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
-      if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
-        continue;
-      AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
-      // If the amount of stack memory is excessive we will not be able
-      // to get rid of the scratch anyway, bail out.
-      if (AllocaSize > ArgAllocaCutoff) {
-        AllocaSize = 0;
-        break;
-      }
-    }
+    unsigned AddrSpace = Ty->getAddressSpace();
+    if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
+        AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
+      continue;
+
+    const AllocaInst *AI = dyn_cast<AllocaInst>(getUnderlyingObject(PtrArg));
+    if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)
+      continue;
+
+    AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
   }
-  adjustThreshold +=
-      adjustInliningThresholdUsingCallee(CB, TLI, this);
-  adjustThreshold += AllocaSize ? ArgAllocaCost : AllocaSize;
-  return adjustThreshold;
+  return AllocaSize;
+}
+
+unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
+  unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);
+
+  // Private objects passed as arguments may end up in scratch usage if the
+  // call is not inlined. Increase the inline threshold to promote inlining.
+  unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
+  if (AllocaSize > 0)
+    Threshold += ArgAllocaCost;
+  return Threshold;
+}
+
+unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
+                                         const AllocaInst *AI) const {
+
+  // Below the cutoff, assume that the private memory objects would be
+  // optimized away.
+  auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
+  if (AllocaSize <= ArgAllocaCutoff)
+    return 0;
+
+  // Above the cutoff, we give a cost to each private memory object depending
+  // on its size. If the array can be optimized by SROA this cost is not added
+  // to the total cost in the inliner cost analysis.
+  //
+  // We choose the total cost of the allocas such that their sum cancels the
+  // bonus given in the threshold (ArgAllocaCost).
+  //
+  //   Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
+  //
+  // Awkwardly, the ArgAllocaCost bonus is multiplied by the threshold
+  // multiplier, the single-BB bonus and the vector bonus.
+  //
+  // We compensate for the first two multipliers by repeating the relevant
+  // logic from the inliner cost model here. The vector bonus is 0 on AMDGPU.
+  static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
+  unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();
+
+  bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {
+    return BB.getTerminator()->getNumSuccessors() > 1;
+  });
+  if (SingleBB) {
+    Threshold += Threshold / 2;
+  }
+
+  auto ArgAllocaSize = DL.getTypeAllocSize(AI->getAllocatedType());
+
+  // Attribute the bonus proportionally to the alloca size.
+  unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize;
+
+  return AllocaThresholdBonus;
 }
 
 void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
@@ -30,6 +30,8 @@
 define coldcc void @foo_private_ptr2(ptr addrspace(5) nocapture %p1, ptr addrspace(5) nocapture %p2) {
 entry:
+  call void @forbid_sroa(ptr addrspace(5) %p1)
+  call void @forbid_sroa(ptr addrspace(5) %p2)
   %tmp1 = load float, ptr addrspace(5) %p1, align 4
   %cmp = fcmp ogt float %tmp1, 1.000000e+00
   br i1 %cmp, label %if.then, label %if.end
@@ -171,6 +173,7 @@
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare float @_Z3sinf(float) #1
+declare void @forbid_sroa(ptr addrspace(5) nocapture %p)
 
 attributes #0 = { noinline }
 attributes #1 = { nounwind readnone }
diff --git a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument-cost.ll b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument-cost.ll
--- a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument-cost.ll
+++ b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument-cost.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=amdgcn--amdhsa -S -passes=inline -inline-threshold=0 -debug-only=inline-cost < %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple=amdgcn--amdhsa -S -passes=inline -inline-threshold=0 -debug-only=inline-cost %s 2>&1 | FileCheck %s
 
 ; REQUIRES: asserts
 
@@ -6,16 +6,91 @@
 ; Verify we are properly adding cost of the -amdgpu-inline-arg-alloca-cost to the threshold.
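+
+; Note: assuming the default -amdgpu-inline-arg-alloca-cost of 4000, the
+; threshold multiplier of 11 and the 50% single-BB bonus give
+; 4000 * 11 * 1.5 = 66000, which is both the Threshold checked below and the
+; total cost attributed to the allocas above the cutoff. With two allocas that
+; total is split proportionally to size, e.g. 66000 * 132 / 260 = 33507 and
+; 66000 * 128 / 260 = 32492 for the 33- and 32-element float arrays below.
+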
+define void @local_access_only(ptr addrspace(5) %p, i32 %idx) {
+  %arrayidx = getelementptr inbounds [64 x float], ptr addrspace(5) %p, i32 0, i32 %idx
+  %value = load float, ptr addrspace(5) %arrayidx
+  store float %value, ptr addrspace(5) %arrayidx, align 4
+  ret void
+}
+
+; Below the cutoff, the alloca cost is 0, and only the cost of the instructions saved by SROA is counted.
+; CHECK: Analyzing call of local_access_only... (caller:test_inliner_sroa_single_below_cutoff)
+; CHECK: NumAllocaArgs: 1
+; CHECK: SROACostSavings: 10
+; CHECK: SROACostSavingsLost: 0
+; CHECK: Threshold: 66000
+define amdgpu_kernel void @test_inliner_sroa_single_below_cutoff(ptr addrspace(1) %a, i32 %n) {
+entry:
+  %pvt_arr = alloca [64 x float], align 4, addrspace(5)
+  call void @local_access_only(ptr addrspace(5) %pvt_arr, i32 4)
+  ret void
+}
+
+; Above the cutoff, attribute a cost to the alloca.
+; CHECK: Analyzing call of local_access_only... (caller:test_inliner_sroa_single_above_cutoff)
 ; CHECK: NumAllocaArgs: 1
+; CHECK: SROACostSavings: 66010
+; CHECK: SROACostSavingsLost: 0
+; CHECK: Threshold: 66000
+define amdgpu_kernel void @test_inliner_sroa_single_above_cutoff(ptr addrspace(1) %a, i32 %n) {
+entry:
+  %pvt_arr = alloca [65 x float], align 4, addrspace(5)
+  call void @local_access_only(ptr addrspace(5) %pvt_arr, i32 4)
+  ret void
+}
+
+define void @use_first_externally(ptr addrspace(5) %p1, ptr addrspace(5) %p2) {
+  call void @external(ptr addrspace(5) %p1)
+  %arrayidx = getelementptr inbounds [64 x float], ptr addrspace(5) %p2, i32 0, i32 7
+  %value = load float, ptr addrspace(5) %arrayidx
+  store float %value, ptr addrspace(5) %arrayidx, align 4
+  ret void
+}
+
+define void @use_both_externally(ptr addrspace(5) %p1, ptr addrspace(5) %p2) {
+  call void @external(ptr addrspace(5) %p1)
+  call void @external(ptr addrspace(5) %p2)
+  ret void
+}
+
+; One array cannot be handled by SROA.
+; CHECK: Analyzing call of use_first_externally... (caller:test_inliner_sroa_double)
+; CHECK: NumAllocaArgs: 2
+; CHECK: SROACostSavings: 32502
+; CHECK: SROACostSavingsLost: 33507
 ; CHECK: Threshold: 66000
+define amdgpu_kernel void @test_inliner_sroa_double() {
+entry:
+  %pvt_arr1 = alloca [33 x float], align 4, addrspace(5)
+  %pvt_arr2 = alloca [32 x float], align 4, addrspace(5)
+  call void @use_first_externally(ptr addrspace(5) %pvt_arr1, ptr addrspace(5) %pvt_arr2)
+  ret void
+}
 
-define void @use_private_ptr_arg(ptr addrspace(5) nocapture %p) {
+; The two arrays cannot be handled by SROA.
+; CHECK: Analyzing call of use_both_externally... (caller:test_inliner_no_sroa)
+; CHECK: NumAllocaArgs: 2
+; CHECK: SROACostSavings: 0
+; CHECK: SROACostSavingsLost: 65999
+; CHECK: Threshold: 66000
+define amdgpu_kernel void @test_inliner_no_sroa() {
+entry:
+  %pvt_arr1 = alloca [33 x float], align 4, addrspace(5)
+  %pvt_arr2 = alloca [32 x float], align 4, addrspace(5)
+  call void @use_both_externally(ptr addrspace(5) %pvt_arr1, ptr addrspace(5) %pvt_arr2)
   ret void
 }
 
-define amdgpu_kernel void @test_inliner_pvt_ptr(ptr addrspace(1) nocapture %a, i32 %n) {
+; No private arrays.
+; CHECK: Analyzing call of use_both_externally... (caller:test_inliner_no_alloc)
+; CHECK: NumAllocaArgs: 0
+; CHECK: SROACostSavings: 0
+; CHECK: SROACostSavingsLost: 0
+; CHECK: Threshold: 0
+define amdgpu_kernel void @test_inliner_no_alloc(ptr addrspace(5) %a, ptr addrspace(5) %b) {
 entry:
-  %pvt_arr = alloca [64 x float], align 4, addrspace(5)
-  call void @use_private_ptr_arg(ptr addrspace(5) %pvt_arr)
+  call void @use_both_externally(ptr addrspace(5) %a, ptr addrspace(5) %b)
   ret void
 }
+
+declare void @external(ptr addrspace(5) %p)