diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -232,6 +232,7 @@
   unsigned getInliningThresholdMultiplier() const { return 11; }
   unsigned adjustInliningThreshold(const CallBase *CB) const;
+  unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const;
 
   int getInlinerVectorBonusPercent() const { return 0; }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1206,11 +1206,11 @@
   return adjustThreshold;
 }
 
-unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
+static size_t getTotalAllocaSize(const CallBase *CB, const DataLayout &DL) {
   // If we have a pointer to private array passed into a function
   // it will not be optimized out, leaving scratch usage.
   // Increase the inline threshold to allow inlining in this case.
-  unsigned adjustThreshold = 0;
+  // The bigger the private array, the bigger the bonus.
   uint64_t AllocaSize = 0;
   SmallPtrSet<const AllocaInst *, 8> AIVisited;
   for (Value *PtrArg : CB->args()) {
@@ -1224,18 +1224,38 @@
       if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
         continue;
       AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
-      // If the amount of stack memory is excessive we will not be able
-      // to get rid of the scratch anyway, bail out.
-      if (AllocaSize > ArgAllocaCutoff) {
-        AllocaSize = 0;
-        break;
-      }
     }
   }
-  adjustThreshold +=
-      adjustInliningThresholdUsingCallee(CB, TLI, this);
-  adjustThreshold += AllocaSize ? ArgAllocaCost : AllocaSize;
-  return adjustThreshold;
+  return AllocaSize;
+}
+
+unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
+  unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);
+  size_t AllocaSize = getTotalAllocaSize(CB, DL);
+  if (AllocaSize > 0) {
+    Threshold += ArgAllocaCost;
+  }
+  return Threshold;
+}
+
+unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
+                                         const AllocaInst *AI) const {
+  auto AllocaSize = getTotalAllocaSize(CB, DL);
+  if (AllocaSize <= ArgAllocaCutoff) {
+    // The inline threshold bonus has already been granted by
+    // adjustInliningThreshold.
+    return 0;
+  }
+
+  // Give an inline threshold bonus depending on the size of the alloca that is
+  // being optimized by SROA: the bigger the array, the better the chances of
+  // avoiding scratch memory.
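+  // For illustration (values inferred from the tests below, which run with
+  // the default ArgAllocaCost of 4000 and the threshold multiplier of 11):
+  // a single 65-float (260-byte) alloca receives the full 4000 * 11 = 44000,
+  // while two allocas of 132 and 128 bytes split that total proportionally
+  // into 44000 * 132 / 260 = 22338 and 44000 * 128 / 260 = 21661.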
+  auto ArgAllocaSize = DL.getTypeAllocSize(AI->getAllocatedType());
+  unsigned ThresholdBonus =
+      (ArgAllocaCost * getInliningThresholdMultiplier() * ArgAllocaSize) /
+      AllocaSize;
+  // Awkwardly, this bonus then gets multiplied by the vector and single-BB
+  // bonuses.
+  return ThresholdBonus;
 }
 
 void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
@@ -30,6 +30,8 @@
 define coldcc void @foo_private_ptr2(ptr addrspace(5) nocapture %p1, ptr addrspace(5) nocapture %p2) {
 entry:
+  call void @forbid_sroa(ptr addrspace(5) %p1)
+  call void @forbid_sroa(ptr addrspace(5) %p2)
   %tmp1 = load float, ptr addrspace(5) %p1, align 4
   %cmp = fcmp ogt float %tmp1, 1.000000e+00
   br i1 %cmp, label %if.then, label %if.end
@@ -171,6 +173,7 @@
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare float @_Z3sinf(float) #1
+declare void @forbid_sroa(ptr addrspace(5) nocapture %p)
 
 attributes #0 = { noinline }
 attributes #1 = { nounwind readnone }
diff --git a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument-cost.ll b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument-cost.ll
--- a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument-cost.ll
+++ b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument-cost.ll
@@ -1,21 +1,94 @@
-; RUN: opt -mtriple=amdgcn--amdhsa -S -passes=inline -inline-threshold=0 -debug-only=inline-cost < %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple=amdgcn--amdhsa -S -passes=inline -inline-threshold=0 -debug-only=inline-cost %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 target datalayout = "A5"
 
 ; Verify we are properly adding cost of the -amdgpu-inline-arg-alloca-cost to the threshold.
+; Awkwardly, the adjusted threshold is multiplied by the single-basic-block bonus.
 
+define void @local_access_only(ptr addrspace(5) %p, i32 %idx) {
+  %arrayidx = getelementptr inbounds [64 x float], ptr addrspace(5) %p, i32 0, i32 %idx
+  %value = load float, ptr addrspace(5) %arrayidx
+  store float %value, ptr addrspace(5) %arrayidx, align 4
+  ret void
+}
+
+; Below the cutoff, the alloca cost is 0 and only the cost of the instructions
+; saved by SROA is counted.
+; CHECK: Analyzing call of local_access_only... (caller:test_inliner_sroa_single_below_cutoff)
+; CHECK: NumAllocaArgs: 1
+; CHECK: SROACostSavings: 10
+; CHECK: SROACostSavingsLost: 0
+; CHECK: Threshold: 66000
+define amdgpu_kernel void @test_inliner_sroa_single_below_cutoff(ptr addrspace(1) %a, i32 %n) {
+entry:
+  %pvt_arr = alloca [64 x float], align 4, addrspace(5)
+  call void @local_access_only(ptr addrspace(5) %pvt_arr, i32 4)
+  ret void
+}
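+
+; Note: the Threshold of 66000 checked here and below is consistent with the
+; ArgAllocaCost bonus of 4000 scaled by the threshold multiplier of 11 and a
+; 50% single-basic-block bonus: 4000 * 11 * 3 / 2 = 66000 (an inference from
+; the CHECK values under default options, not a documented formula).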
+
+; Above the cutoff, a cost is attributed to the alloca.
+; CHECK: Analyzing call of local_access_only... (caller:test_inliner_sroa_single_above_cutoff)
 ; CHECK: NumAllocaArgs: 1
+; CHECK: SROACostSavings: 44010
+; CHECK: SROACostSavingsLost: 0
+; CHECK: Threshold: 66000
+define amdgpu_kernel void @test_inliner_sroa_single_above_cutoff(ptr addrspace(1) %a, i32 %n) {
+entry:
+  %pvt_arr = alloca [65 x float], align 4, addrspace(5)
+  call void @local_access_only(ptr addrspace(5) %pvt_arr, i32 4)
+  ret void
+}
+
+define void @use_first_externally(ptr addrspace(5) %p1, ptr addrspace(5) %p2) {
+  call void @external(ptr addrspace(5) %p1)
+  %arrayidx = getelementptr inbounds [64 x float], ptr addrspace(5) %p2, i32 0, i32 7
+  %value = load float, ptr addrspace(5) %arrayidx
+  store float %value, ptr addrspace(5) %arrayidx, align 4
+  ret void
+}
+
+define void @use_both_externally(ptr addrspace(5) %p1, ptr addrspace(5) %p2) {
+  call void @external(ptr addrspace(5) %p1)
+  call void @external(ptr addrspace(5) %p2)
+  ret void
+}
+
+; CHECK: Analyzing call of use_first_externally... (caller:test_inliner_sroa_double)
+; CHECK: NumAllocaArgs: 2
+; CHECK: SROACostSavings: 21671
+; CHECK: SROACostSavingsLost: 22338
 ; CHECK: Threshold: 66000
+define amdgpu_kernel void @test_inliner_sroa_double() {
+entry:
+  %pvt_arr1 = alloca [33 x float], align 4, addrspace(5)
+  %pvt_arr2 = alloca [32 x float], align 4, addrspace(5)
+  call void @use_first_externally(ptr addrspace(5) %pvt_arr1, ptr addrspace(5) %pvt_arr2)
+  ret void
+}
 
-define void @use_private_ptr_arg(ptr addrspace(5) nocapture %p) {
+; CHECK: Analyzing call of use_both_externally... (caller:test_inliner_no_sroa)
+; CHECK: NumAllocaArgs: 2
+; CHECK: SROACostSavings: 0
+; CHECK: SROACostSavingsLost: 43999
+; CHECK: Threshold: 66000
+define amdgpu_kernel void @test_inliner_no_sroa() {
+entry:
+  %pvt_arr1 = alloca [33 x float], align 4, addrspace(5)
+  %pvt_arr2 = alloca [32 x float], align 4, addrspace(5)
+  call void @use_both_externally(ptr addrspace(5) %pvt_arr1, ptr addrspace(5) %pvt_arr2)
   ret void
 }
 
-define amdgpu_kernel void @test_inliner_pvt_ptr(ptr addrspace(1) nocapture %a, i32 %n) {
+; CHECK: Analyzing call of use_both_externally... (caller:test_inliner_no_alloc)
+; CHECK: NumAllocaArgs: 0
+; CHECK: SROACostSavings: 0
+; CHECK: SROACostSavingsLost: 0
+; CHECK: Threshold: 0
+define amdgpu_kernel void @test_inliner_no_alloc(ptr addrspace(5) %a, ptr addrspace(5) %b) {
 entry:
-  %pvt_arr = alloca [64 x float], align 4, addrspace(5)
-  call void @use_private_ptr_arg(ptr addrspace(5) %pvt_arr)
+  call void @use_both_externally(ptr addrspace(5) %a, ptr addrspace(5) %b)
   ret void
 }
+
+declare void @external(ptr addrspace(5) %p)
\ No newline at end of file
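
As a cross-check of the CHECK values above, here is a minimal standalone sketch of the proportional bonus that getCallerAllocaCost computes. The constants are assumptions inferred from the tests (ArgAllocaCost = 4000, threshold multiplier = 11, ArgAllocaCutoff = 256 bytes), and the helper name callerAllocaBonus is hypothetical; this models the bonus arithmetic only, not the full inline-cost pipeline.

#include <cstdint>
#include <cstdio>

// Assumed constants, inferred from the CHECK lines above rather than taken
// from authoritative LLVM defaults.
static const uint64_t ArgAllocaCost = 4000;
static const uint64_t ThresholdMultiplier = 11; // getInliningThresholdMultiplier()
static const uint64_t ArgAllocaCutoff = 256;    // bytes

// Hypothetical model of the per-alloca bonus: zero at or below the cutoff
// (adjustInliningThreshold already granted the flat bonus), otherwise a share
// of ArgAllocaCost * multiplier proportional to this alloca's size.
static uint64_t callerAllocaBonus(uint64_t ArgAllocaSize,
                                  uint64_t TotalAllocaSize) {
  if (TotalAllocaSize <= ArgAllocaCutoff)
    return 0;
  return ArgAllocaCost * ThresholdMultiplier * ArgAllocaSize / TotalAllocaSize;
}

int main() {
  // test_inliner_sroa_double: [33 x float] = 132 bytes, [32 x float] = 128 bytes.
  const uint64_t Total = 132 + 128; // 260 bytes, above the cutoff
  std::printf("arr1 bonus: %llu\n",
              (unsigned long long)callerAllocaBonus(132, Total)); // 22338
  std::printf("arr2 bonus: %llu\n",
              (unsigned long long)callerAllocaBonus(128, Total)); // 21661
  return 0;
}

This prints 22338 and 21661. Adding the 10 points of instruction savings for the SROA-able second argument gives the SROACostSavings of 21671 and SROACostSavingsLost of 22338 checked in test_inliner_sroa_double, and the sum 21661 + 22338 = 43999 matches the SROACostSavingsLost checked in test_inliner_no_sroa, where both pointers escape.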