diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -3884,11 +3884,12 @@
      63:32   4 bytes PRIVATE_SEGMENT_FIXED_SIZE The amount of fixed
                                                  private address space
                                                  memory required for a
-                                                 work-item in bytes.
-                                                 Additional space may need to
-                                                 be added to this value if
-                                                 the call stack has
-                                                 non-inlined function calls.
+                                                 work-item in bytes. When
+                                                 this cannot be predicted,
+                                                 code object v4 and older
+                                                 sets this value to be
+                                                 higher than the minimum
+                                                 requirement.
      95:64   4 bytes KERNARG_SIZE               The size of the kernarg
                                                  memory pointed to by the
                                                  AQL dispatch packet. The
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -43,9 +43,9 @@
 char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
 char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;
 
-// We need to tell the runtime some amount ahead of time if we don't know the
-// true stack size. Assume a smaller number if this is only due to dynamic /
-// non-entry block allocas.
+// In code object v4 and older, we need to tell the runtime some amount ahead of
+// time if we don't know the true stack size. Assume a smaller number if this is
+// only due to dynamic / non-entry block allocas.
 static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
     "amdgpu-assume-external-call-stack-size",
     cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
@@ -109,6 +109,15 @@
   CallGraph CG = CallGraph(M);
   auto End = po_end(&CG);
 
+  // By default, for code object v5 and later, track only the minimum scratch
+  // size
+  if (AMDGPU::getAmdhsaCodeObjectVersion() >= 5) {
+    if (!AssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
+      AssumedStackSizeForDynamicSizeObjects = 0;
+    if (!AssumedStackSizeForExternalCall.getNumOccurrences())
+      AssumedStackSizeForExternalCall = 0;
+  }
+
   for (auto IT = po_begin(&CG); IT != End; ++IT) {
     Function *F = IT->getFunction();
     if (!F || F->isDeclaration())
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=5 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN-V5 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=2 -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=2 -mcpu=iceland -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-BUG %s
 
@@ -182,6 +183,9 @@
 ; NumSgprs: 48
 ; NumVgprs: 24
 ; GCN: ScratchSize: 16384
+;
+; GCN-V5-LABEL: {{^}}usage_external:
+; GCN-V5: ScratchSize: 0
 define amdgpu_kernel void @usage_external() #0 {
   call void @external()
   ret void
@@ -194,6 +198,9 @@
 ; NumSgprs: 48
 ; NumVgprs: 24
 ; GCN: ScratchSize: 16384
+;
+; GCN-V5-LABEL: {{^}}usage_external_recurse:
+; GCN-V5: ScratchSize: 0
 define amdgpu_kernel void @usage_external_recurse() #0 {
   call void @external_recurse()
   ret void
@@ -201,6 +208,9 @@
 
 ; GCN-LABEL: {{^}}direct_recursion_use_stack:
 ; GCN: ScratchSize: 18448{{$}}
+;
+; GCN-V5-LABEL: {{^}}direct_recursion_use_stack:
+; GCN-V5: ScratchSize: 2064{{$}}
 define void @direct_recursion_use_stack(i32 %val) #2 {
   %alloca = alloca [512 x i32], align 4, addrspace(5)
   call void asm sideeffect "; use $0", "v"([512 x i32] addrspace(5)* %alloca) #0
@@ -220,6 +230,9 @@
 ; GCN: is_ptr64 = 1
 ; GCN: is_dynamic_callstack = 1
 ; GCN: workitem_private_segment_byte_size = 18448{{$}}
+;
+; GCN-V5-LABEL: {{^}}usage_direct_recursion:
+; GCN-V5: .amdhsa_private_segment_fixed_size 2064{{$}}
 define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
   call void @direct_recursion_use_stack(i32 %n)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
--- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=DEFAULTSIZE,MUBUF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdhsa-code-object-version=5 < %s | FileCheck -check-prefixes=DEFAULTSIZE-V5,MUBUF %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=ASSUME1024,MUBUF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 -amdhsa-code-object-version=5 < %s | FileCheck -check-prefixes=ASSUME1024,MUBUF %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=DEFAULTSIZE,FLATSCR %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=ASSUME1024,FLATSCR %s
 
@@ -110,6 +112,9 @@
 }
 
 ; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112
 ; DEFAULTSIZE: ; ScratchSize: 4112
+; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size 16
+; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack 1
+; DEFAULTSIZE-V5: ; ScratchSize: 16
 ; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
 ; ASSUME1024: ; ScratchSize: 1040
@@ -203,6 +208,9 @@
 ; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160
 ; DEFAULTSIZE: ; ScratchSize: 4160
+; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size 64
+; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack 1
+; DEFAULTSIZE-V5: ; ScratchSize: 64
 
 ; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
 ; ASSUME1024: ; ScratchSize: 1088
diff --git a/llvm/test/CodeGen/AMDGPU/recursion.ll b/llvm/test/CodeGen/AMDGPU/recursion.ll
--- a/llvm/test/CodeGen/AMDGPU/recursion.ll
+++ b/llvm/test/CodeGen/AMDGPU/recursion.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --amdhsa-code-object-version=5 < %s | FileCheck -check-prefixes=V5 %s
 
 ; CHECK-LABEL: {{^}}recursive:
 ; CHECK: ScratchSize: 16
@@ -28,9 +29,13 @@
   ret void
 }
 
-; For an arbitrary recursive call, report a large number for unknown stack usage.
+; For an arbitrary recursive call, report a large number for unknown stack
+; usage for code object v4 and older
 ; CHECK-LABEL: {{^}}calls_recursive:
 ; CHECK: .amdhsa_private_segment_fixed_size 16400{{$}}
+;
+; V5-LABEL: {{^}}calls_recursive:
+; V5: .amdhsa_private_segment_fixed_size 0{{$}}
 define amdgpu_kernel void @calls_recursive() {
   call void @recursive()
   ret void
@@ -51,6 +56,9 @@
 
 ; CHECK-LABEL: {{^}}kernel_calls_tail_recursive:
 ; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}}
+;
+; V5-LABEL: {{^}}kernel_calls_tail_recursive:
+; V5: .amdhsa_private_segment_fixed_size 0{{$}}
 define amdgpu_kernel void @kernel_calls_tail_recursive() {
   call void @tail_recursive()
   ret void
@@ -58,6 +66,9 @@
 
 ; CHECK-LABEL: {{^}}kernel_calls_tail_recursive_with_stack:
 ; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}}
+;
+; V5-LABEL: {{^}}kernel_calls_tail_recursive_with_stack:
+; V5: .amdhsa_private_segment_fixed_size 8{{$}}
 define amdgpu_kernel void @kernel_calls_tail_recursive_with_stack() {
   call void @tail_recursive_with_stack()
   ret void