diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -43,20 +43,6 @@ char llvm::AMDGPUResourceUsageAnalysis::ID = 0; char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID; -// We need to tell the runtime some amount ahead of time if we don't know the -// true stack size. Assume a smaller number if this is only due to dynamic / -// non-entry block allocas. -static cl::opt AssumedStackSizeForExternalCall( - "amdgpu-assume-external-call-stack-size", - cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden, - cl::init(16384)); - -static cl::opt AssumedStackSizeForDynamicSizeObjects( - "amdgpu-assume-dynamic-stack-object-size", - cl::desc("Assumed extra stack use if there are any " - "variable sized objects (in bytes)"), - cl::Hidden, cl::init(4096)); - INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE, "Function register usage analysis", true, true) @@ -165,8 +151,6 @@ // Assume a big number if there are any unknown sized objects. Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects(); - if (Info.HasDynamicallySizedStack) - Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects; if (MFI->isStackRealigned()) Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value(); @@ -459,27 +443,9 @@ // FIXME: Call site could have norecurse on it if (!Callee || !Callee->doesNotRecurse()) { Info.HasRecursion = true; - - // TODO: If we happen to know there is no stack usage in the - // callgraph, we don't need to assume an infinitely growing stack. - if (!MI.isReturn()) { - // We don't need to assume an unknown stack size for tail calls. - - // FIXME: This only benefits in the case where the kernel does not - // directly call the tail called function. 
If a kernel directly - // calls a tail recursive function, we'll assume maximum stack size - // based on the regular call instruction. - CalleeFrameSize = - std::max(CalleeFrameSize, - static_cast(AssumedStackSizeForExternalCall)); - } } if (IsIndirect || I == CallGraphResourceInfo.end()) { - CalleeFrameSize = - std::max(CalleeFrameSize, - static_cast(AssumedStackSizeForExternalCall)); - // Register usage of indirect calls gets handled later Info.UsesVCC = true; Info.UsesFlatScratch = ST.hasFlatAddressSpace(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DEFAULTSIZE %s -; RUN: llc -global-isel -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=GCN,ASSUME1024 %s ; FIXME: Generated test checks do not check metadata at the end of the ; function, so this also includes manually added checks. 
@@ -76,11 +75,8 @@ store volatile i32 0, i32 addrspace(1)* undef ret void } -; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112 -; DEFAULTSIZE: ; ScratchSize: 4112 - -; ASSUME1024: .amdhsa_private_segment_fixed_size 1040 -; ASSUME1024: ; ScratchSize: 1040 +; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 16 +; DEFAULTSIZE: ; ScratchSize: 16 define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) { ; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: @@ -139,11 +135,8 @@ ret void } -; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160 -; DEFAULTSIZE: ; ScratchSize: 4160 - -; ASSUME1024: .amdhsa_private_segment_fixed_size 1088 -; ASSUME1024: ; ScratchSize: 1088 +; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 64 +; DEFAULTSIZE: ; ScratchSize: 64 define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) { diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll --- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -181,7 +181,7 @@ ; GCN: is_dynamic_callstack = 1 ; NumSgprs: 48 ; NumVgprs: 24 -; GCN: ScratchSize: 16384 +; GCN: ScratchSize: 0 define amdgpu_kernel void @usage_external() #0 { call void @external() ret void @@ -193,14 +193,14 @@ ; GCN: is_dynamic_callstack = 1 ; NumSgprs: 48 ; NumVgprs: 24 -; GCN: ScratchSize: 16384 +; GCN: ScratchSize: 0 define amdgpu_kernel void @usage_external_recurse() #0 { call void @external_recurse() ret void } ; GCN-LABEL: {{^}}direct_recursion_use_stack: -; GCN: ScratchSize: 18448{{$}} +; GCN: ScratchSize: 2064{{$}} define void @direct_recursion_use_stack(i32 %val) #2 { %alloca = alloca [512 x i32], align 4, addrspace(5) call void asm sideeffect "; use $0", "v"([512 x i32] addrspace(5)* %alloca) #0 @@ -219,7 +219,7 @@ ; 
GCN-LABEL: {{^}}usage_direct_recursion: ; GCN: is_ptr64 = 1 ; GCN: is_dynamic_callstack = 1 -; GCN: workitem_private_segment_byte_size = 18448{{$}} +; GCN: workitem_private_segment_byte_size = 2064{{$}} define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 { call void @direct_recursion_use_stack(i32 %n) ret void diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll @@ -1,10 +1,10 @@ ; Note: uses a randomly selected assumed external call stack size so that the ; test assertions are unlikely to succeed by accident. -; RUN: llc -amdgpu-assume-external-call-stack-size=5310 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=4 -enable-misched=0 -filetype=asm -o - < %s | FileCheck --check-prefixes CHECK,GFX7 %s -; RUN: llc -amdgpu-assume-external-call-stack-size=5310 -mattr=-xnack -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=4 -mcpu=gfx803 -enable-misched=0 -filetype=asm -o - < %s | FileCheck --check-prefixes CHECK,GFX8 %s -; RUN: llc -amdgpu-assume-external-call-stack-size=5310 -mattr=-xnack -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=4 -mcpu=gfx900 -enable-misched=0 -filetype=asm -o - < %s | FileCheck --check-prefixes CHECK,GFX9 %s -; RUN: llc -amdgpu-assume-external-call-stack-size=5310 -mattr=-xnack -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=4 -mcpu=gfx1010 -enable-misched=0 -filetype=asm -o - < %s | FileCheck --check-prefixes CHECK,GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=4 -enable-misched=0 -filetype=asm -o - < %s | FileCheck --check-prefixes CHECK,GFX7 %s +; RUN: llc -mattr=-xnack -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=4 -mcpu=gfx803 -enable-misched=0 -filetype=asm -o - < %s | 
FileCheck --check-prefixes CHECK,GFX8 %s +; RUN: llc -mattr=-xnack -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=4 -mcpu=gfx900 -enable-misched=0 -filetype=asm -o - < %s | FileCheck --check-prefixes CHECK,GFX9 %s +; RUN: llc -mattr=-xnack -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=4 -mcpu=gfx1010 -enable-misched=0 -filetype=asm -o - < %s | FileCheck --check-prefixes CHECK,GFX10 %s ; CHECK-LABEL: amdhsa.kernels @@ -78,7 +78,7 @@ ; test a kernel with an external call that occurs before its callee in the module ; CHECK-LABEL: test3 -; CHECK: .private_segment_fixed_size: 5310 +; CHECK: .private_segment_fixed_size: 0 ; GFX7: .sgpr_count: 37 ; GFX7: .sgpr_spill_count: 0 @@ -108,7 +108,7 @@ ; test a kernel without an external call that occurs after its callee in the module ; CHECK-LABEL: test4 -; CHECK: .private_segment_fixed_size: 5310 +; CHECK: .private_segment_fixed_size: 0 ; GFX7: .sgpr_count: 37 ; GFX7: .sgpr_spill_count: 0 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll @@ -66,7 +66,7 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; CHECK-NEXT: .amdhsa_uses_dynamic_stack 1 -; CHECK-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 +; CHECK-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 ; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 ; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 ; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -27,7 +27,7 @@ ; GCN-NEXT: enable_wgp_mode = 0 ; GCN-NEXT: enable_mem_ordered = 0 ; GCN-NEXT: enable_fwd_progress = 0 -; GCN-NEXT: 
enable_sgpr_private_segment_wave_byte_offset = 1 +; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 ; GCN-NEXT: user_sgpr_count = 14 ; GCN-NEXT: enable_trap_handler = 0 ; GCN-NEXT: enable_sgpr_workgroup_id_x = 1 @@ -55,7 +55,7 @@ ; GCN-NEXT: is_dynamic_callstack = 1 ; GCN-NEXT: is_debug_enabled = 0 ; GCN-NEXT: is_xnack_enabled = 0 -; GCN-NEXT: workitem_private_segment_byte_size = 16384 +; GCN-NEXT: workitem_private_segment_byte_size = 0 ; GCN-NEXT: workgroup_group_segment_byte_size = 0 ; GCN-NEXT: gds_segment_byte_size = 0 ; GCN-NEXT: kernarg_segment_byte_size = 64 @@ -120,7 +120,7 @@ ; GISEL-NEXT: enable_wgp_mode = 0 ; GISEL-NEXT: enable_mem_ordered = 0 ; GISEL-NEXT: enable_fwd_progress = 0 -; GISEL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 1 +; GISEL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 ; GISEL-NEXT: user_sgpr_count = 14 ; GISEL-NEXT: enable_trap_handler = 0 ; GISEL-NEXT: enable_sgpr_workgroup_id_x = 1 @@ -148,7 +148,7 @@ ; GISEL-NEXT: is_dynamic_callstack = 1 ; GISEL-NEXT: is_debug_enabled = 0 ; GISEL-NEXT: is_xnack_enabled = 0 -; GISEL-NEXT: workitem_private_segment_byte_size = 16384 +; GISEL-NEXT: workitem_private_segment_byte_size = 0 ; GISEL-NEXT: workgroup_group_segment_byte_size = 0 ; GISEL-NEXT: gds_segment_byte_size = 0 ; GISEL-NEXT: kernarg_segment_byte_size = 64 @@ -218,7 +218,7 @@ ; GCN-NEXT: enable_wgp_mode = 0 ; GCN-NEXT: enable_mem_ordered = 0 ; GCN-NEXT: enable_fwd_progress = 0 -; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 1 +; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 ; GCN-NEXT: user_sgpr_count = 14 ; GCN-NEXT: enable_trap_handler = 0 ; GCN-NEXT: enable_sgpr_workgroup_id_x = 1 @@ -246,7 +246,7 @@ ; GCN-NEXT: is_dynamic_callstack = 1 ; GCN-NEXT: is_debug_enabled = 0 ; GCN-NEXT: is_xnack_enabled = 0 -; GCN-NEXT: workitem_private_segment_byte_size = 16384 +; GCN-NEXT: workitem_private_segment_byte_size = 0 ; GCN-NEXT: workgroup_group_segment_byte_size = 0 ; GCN-NEXT: 
gds_segment_byte_size = 0 ; GCN-NEXT: kernarg_segment_byte_size = 64 @@ -312,7 +312,7 @@ ; GISEL-NEXT: enable_wgp_mode = 0 ; GISEL-NEXT: enable_mem_ordered = 0 ; GISEL-NEXT: enable_fwd_progress = 0 -; GISEL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 1 +; GISEL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 ; GISEL-NEXT: user_sgpr_count = 14 ; GISEL-NEXT: enable_trap_handler = 0 ; GISEL-NEXT: enable_sgpr_workgroup_id_x = 1 @@ -340,7 +340,7 @@ ; GISEL-NEXT: is_dynamic_callstack = 1 ; GISEL-NEXT: is_debug_enabled = 0 ; GISEL-NEXT: is_xnack_enabled = 0 -; GISEL-NEXT: workitem_private_segment_byte_size = 16384 +; GISEL-NEXT: workitem_private_segment_byte_size = 0 ; GISEL-NEXT: workgroup_group_segment_byte_size = 0 ; GISEL-NEXT: gds_segment_byte_size = 0 ; GISEL-NEXT: kernarg_segment_byte_size = 64 diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=DEFAULTSIZE,MUBUF %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=ASSUME1024,MUBUF %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=DEFAULTSIZE,FLATSCR %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=ASSUME1024,FLATSCR %s ; FIXME: Generated test checks do not check metadata at the end of the ; function, so this also includes manually added checks. 
@@ -108,11 +106,8 @@ store volatile i32 0, i32 addrspace(1)* undef ret void } -; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112 -; DEFAULTSIZE: ; ScratchSize: 4112 - -; ASSUME1024: .amdhsa_private_segment_fixed_size 1040 -; ASSUME1024: ; ScratchSize: 1040 +; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 16 +; DEFAULTSIZE: ; ScratchSize: 16 define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) { ; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: @@ -201,11 +196,8 @@ ret void } -; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160 -; DEFAULTSIZE: ; ScratchSize: 4160 - -; ASSUME1024: .amdhsa_private_segment_fixed_size 1088 -; ASSUME1024: ; ScratchSize: 1088 +; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 64 +; DEFAULTSIZE: ; ScratchSize: 64 define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) { diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll @@ -80,7 +80,7 @@ ; ASM: buffer_store_dword ; ASM: buffer_store_dword ; ASM: s_swappc_b64 -; ASM: ScratchSize: 16400 +; ASM: ScratchSize: 16 define amdgpu_kernel void @call_private(i32 addrspace(1)* %out, i32 %in) #0 { entry: %tmp = alloca [2 x i32], addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/recursion.ll b/llvm/test/CodeGen/AMDGPU/recursion.ll --- a/llvm/test/CodeGen/AMDGPU/recursion.ll +++ b/llvm/test/CodeGen/AMDGPU/recursion.ll @@ -28,9 +28,9 @@ ret void } -; For an arbitrary recursive call, report a large number for unknown stack usage. 
+; For an arbitrary recursive call, report only the minimum stack requirement. ; CHECK-LABEL: {{^}}calls_recursive: -; CHECK: .amdhsa_private_segment_fixed_size 16400{{$}} +; CHECK: .amdhsa_private_segment_fixed_size 16{{$}} define amdgpu_kernel void @calls_recursive() { call void @recursive() ret void @@ -50,14 +50,14 @@ ; in the kernel. ; CHECK-LABEL: {{^}}kernel_calls_tail_recursive: -; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}} +; CHECK: .amdhsa_private_segment_fixed_size 0{{$}} define amdgpu_kernel void @kernel_calls_tail_recursive() { call void @tail_recursive() ret void } ; CHECK-LABEL: {{^}}kernel_calls_tail_recursive_with_stack: -; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}} +; CHECK: .amdhsa_private_segment_fixed_size 8{{$}} define amdgpu_kernel void @kernel_calls_tail_recursive_with_stack() { call void @tail_recursive_with_stack() ret void