diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -693,10 +693,12 @@
   ProgInfo.AccumOffset = alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1;
   ProgInfo.TgSplit = STM.isTgSplitEnabled();
   ProgInfo.NumSGPR = Info.NumExplicitSGPR;
-  ProgInfo.ScratchSize = Info.PrivateSegmentSize;
+  ProgInfo.DynamicCallStack =
+      Info.HasDynamicallySizedStack || Info.HasRecursion;
+  ProgInfo.ScratchSize =
+      ProgInfo.DynamicCallStack ? 0 : Info.PrivateSegmentSize;
   ProgInfo.VCCUsed = Info.UsesVCC;
   ProgInfo.FlatUsed = Info.UsesFlatScratch;
-  ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
 
   const uint64_t MaxScratchPerWorkitem =
       STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
@@ -76,11 +76,11 @@
   store volatile i32 0, i32 addrspace(1)* undef
   ret void
 }
-; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112
-; DEFAULTSIZE: ; ScratchSize: 4112
+; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 0
+; DEFAULTSIZE: ; ScratchSize: 0
 
-; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
-; ASSUME1024: ; ScratchSize: 1040
+; ASSUME1024: .amdhsa_private_segment_fixed_size 0
+; ASSUME1024: ; ScratchSize: 0
 
 define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) {
 ; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
@@ -139,11 +139,11 @@
   store volatile i32 0, i32 addrspace(1)* undef
   ret void
 }
 
-; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160
-; DEFAULTSIZE: ; ScratchSize: 4160
+; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 0
+; DEFAULTSIZE: ; ScratchSize: 0
 
-; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
-; ASSUME1024: ; ScratchSize: 1088
+; ASSUME1024: .amdhsa_private_segment_fixed_size 0
+; ASSUME1024: ; ScratchSize: 0
 
 define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) {
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -181,7 +181,7 @@
 ; GCN: is_dynamic_callstack = 1
 ; NumSgprs: 48
 ; NumVgprs: 24
-; GCN: ScratchSize: 16384
+; GCN: ScratchSize: 0
 define amdgpu_kernel void @usage_external() #0 {
   call void @external()
   ret void
@@ -193,7 +193,7 @@
 ; GCN: is_dynamic_callstack = 1
 ; NumSgprs: 48
 ; NumVgprs: 24
-; GCN: ScratchSize: 16384
+; GCN: ScratchSize: 0
 define amdgpu_kernel void @usage_external_recurse() #0 {
   call void @external_recurse()
   ret void
@@ -219,7 +219,7 @@
 ; GCN-LABEL: {{^}}usage_direct_recursion:
 ; GCN: is_ptr64 = 1
 ; GCN: is_dynamic_callstack = 1
-; GCN: workitem_private_segment_byte_size = 18448{{$}}
+; GCN: workitem_private_segment_byte_size = 0{{$}}
 define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
   call void @direct_recursion_use_stack(i32 %n)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
@@ -480,7 +480,7 @@
 ; GCN: .amdhsa_user_sgpr_dispatch_id 1
 ; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
 ; GCN: .amdhsa_user_sgpr_private_segment_size 0
-; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
 ; GCN: .amdhsa_system_sgpr_workgroup_id_x 1
 ; GCN: .amdhsa_system_sgpr_workgroup_id_y 1
 ; GCN: .amdhsa_system_sgpr_workgroup_id_z 1
@@ -506,7 +506,7 @@
 ; GCN: .amdhsa_user_sgpr_dispatch_id 1
 ; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
 ; GCN: .amdhsa_user_sgpr_private_segment_size 0
-; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
 ; GCN: .amdhsa_system_sgpr_workgroup_id_x 1
 ; GCN: .amdhsa_system_sgpr_workgroup_id_y 1
 ; GCN: .amdhsa_system_sgpr_workgroup_id_z 1
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll
@@ -78,7 +78,7 @@
 ; test a kernel with an external call that occurs before its callee in the module
 
 ; CHECK-LABEL: test3
-; CHECK: .private_segment_fixed_size: 5310
+; CHECK: .private_segment_fixed_size: 0
 
 ; GFX7: .sgpr_count: 37
 ; GFX7: .sgpr_spill_count: 0
@@ -108,7 +108,7 @@
 ; test a kernel without an external call that occurs after its callee in the module
 
 ; CHECK-LABEL: test4
-; CHECK: .private_segment_fixed_size: 5310
+; CHECK: .private_segment_fixed_size: 0
 
 ; GFX7: .sgpr_count: 37
 ; GFX7: .sgpr_spill_count: 0
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
--- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
@@ -66,7 +66,7 @@
 ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0
 ; CHECK-NEXT: .amdhsa_is_dynamic_callstack 1
-; CHECK-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; CHECK-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
 ; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
 ; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
 ; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -27,7 +27,7 @@
 ; GCN-NEXT: enable_wgp_mode = 0
 ; GCN-NEXT: enable_mem_ordered = 0
 ; GCN-NEXT: enable_fwd_progress = 0
-; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 1
+; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
 ; GCN-NEXT: user_sgpr_count = 14
 ; GCN-NEXT: enable_trap_handler = 0
 ; GCN-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -55,7 +55,7 @@
 ; GCN-NEXT: is_dynamic_callstack = 1
 ; GCN-NEXT: is_debug_enabled = 0
 ; GCN-NEXT: is_xnack_enabled = 0
-; GCN-NEXT: workitem_private_segment_byte_size = 16384
+; GCN-NEXT: workitem_private_segment_byte_size = 0
 ; GCN-NEXT: workgroup_group_segment_byte_size = 0
 ; GCN-NEXT: gds_segment_byte_size = 0
 ; GCN-NEXT: kernarg_segment_byte_size = 64
@@ -120,7 +120,7 @@
 ; GISEL-NEXT: enable_wgp_mode = 0
 ; GISEL-NEXT: enable_mem_ordered = 0
 ; GISEL-NEXT: enable_fwd_progress = 0
-; GISEL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 1
+; GISEL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
 ; GISEL-NEXT: user_sgpr_count = 14
 ; GISEL-NEXT: enable_trap_handler = 0
 ; GISEL-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -148,7 +148,7 @@
 ; GISEL-NEXT: is_dynamic_callstack = 1
 ; GISEL-NEXT: is_debug_enabled = 0
 ; GISEL-NEXT: is_xnack_enabled = 0
-; GISEL-NEXT: workitem_private_segment_byte_size = 16384
+; GISEL-NEXT: workitem_private_segment_byte_size = 0
 ; GISEL-NEXT: workgroup_group_segment_byte_size = 0
 ; GISEL-NEXT: gds_segment_byte_size = 0
 ; GISEL-NEXT: kernarg_segment_byte_size = 64
@@ -218,7 +218,7 @@
 ; GCN-NEXT: enable_wgp_mode = 0
 ; GCN-NEXT: enable_mem_ordered = 0
 ; GCN-NEXT: enable_fwd_progress = 0
-; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 1
+; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
 ; GCN-NEXT: user_sgpr_count = 14
 ; GCN-NEXT: enable_trap_handler = 0
 ; GCN-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -246,7 +246,7 @@
 ; GCN-NEXT: is_dynamic_callstack = 1
 ; GCN-NEXT: is_debug_enabled = 0
 ; GCN-NEXT: is_xnack_enabled = 0
-; GCN-NEXT: workitem_private_segment_byte_size = 16384
+; GCN-NEXT: workitem_private_segment_byte_size = 0
 ; GCN-NEXT: workgroup_group_segment_byte_size = 0
 ; GCN-NEXT: gds_segment_byte_size = 0
 ; GCN-NEXT: kernarg_segment_byte_size = 64
@@ -312,7 +312,7 @@
 ; GISEL-NEXT: enable_wgp_mode = 0
 ; GISEL-NEXT: enable_mem_ordered = 0
 ; GISEL-NEXT: enable_fwd_progress = 0
-; GISEL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 1
+; GISEL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
 ; GISEL-NEXT: user_sgpr_count = 14
 ; GISEL-NEXT: enable_trap_handler = 0
 ; GISEL-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -340,7 +340,7 @@
 ; GISEL-NEXT: is_dynamic_callstack = 1
 ; GISEL-NEXT: is_debug_enabled = 0
 ; GISEL-NEXT: is_xnack_enabled = 0
-; GISEL-NEXT: workitem_private_segment_byte_size = 16384
+; GISEL-NEXT: workitem_private_segment_byte_size = 0
 ; GISEL-NEXT: workgroup_group_segment_byte_size = 0
 ; GISEL-NEXT: gds_segment_byte_size = 0
 ; GISEL-NEXT: kernarg_segment_byte_size = 64
diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
--- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -108,11 +108,11 @@
   store volatile i32 0, i32 addrspace(1)* undef
   ret void
 }
-; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112
-; DEFAULTSIZE: ; ScratchSize: 4112
+; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 0
+; DEFAULTSIZE: ; ScratchSize: 0
 
-; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
-; ASSUME1024: ; ScratchSize: 1040
+; ASSUME1024: .amdhsa_private_segment_fixed_size 0
+; ASSUME1024: ; ScratchSize: 0
 
 define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) {
 ; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
@@ -201,11 +201,11 @@
   store volatile i32 0, i32 addrspace(1)* undef
   ret void
 }
 
-; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160
-; DEFAULTSIZE: ; ScratchSize: 4160
+; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 0
+; DEFAULTSIZE: ; ScratchSize: 0
 
-; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
-; ASSUME1024: ; ScratchSize: 1088
+; ASSUME1024: .amdhsa_private_segment_fixed_size 0
+; ASSUME1024: ; ScratchSize: 0
 
 define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) {
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll
@@ -80,7 +80,7 @@
 ; ASM: buffer_store_dword
 ; ASM: buffer_store_dword
 ; ASM: s_swappc_b64
-; ASM: ScratchSize: 16400
+; ASM: ScratchSize: 0
 define amdgpu_kernel void @call_private(i32 addrspace(1)* %out, i32 %in) #0 {
 entry:
   %tmp = alloca [2 x i32], addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/recursion.ll b/llvm/test/CodeGen/AMDGPU/recursion.ll
--- a/llvm/test/CodeGen/AMDGPU/recursion.ll
+++ b/llvm/test/CodeGen/AMDGPU/recursion.ll
@@ -28,9 +28,9 @@
   ret void
 }
 
-; For an arbitrary recursive call, report a large number for unknown stack usage.
+; For an arbitrary recursive call, report stack usage as 0
 ; CHECK-LABEL: {{^}}calls_recursive:
-; CHECK: .amdhsa_private_segment_fixed_size 16400{{$}}
+; CHECK: .amdhsa_private_segment_fixed_size 0{{$}}
 define amdgpu_kernel void @calls_recursive() {
   call void @recursive()
   ret void
@@ -50,14 +50,14 @@
 ; in the kernel.
 
 ; CHECK-LABEL: {{^}}kernel_calls_tail_recursive:
-; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}}
+; CHECK: .amdhsa_private_segment_fixed_size 0{{$}}
 define amdgpu_kernel void @kernel_calls_tail_recursive() {
   call void @tail_recursive()
   ret void
 }
 
 ; CHECK-LABEL: {{^}}kernel_calls_tail_recursive_with_stack:
-; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}}
+; CHECK: .amdhsa_private_segment_fixed_size 0{{$}}
 define amdgpu_kernel void @kernel_calls_tail_recursive_with_stack() {
   call void @tail_recursive_with_stack()
   ret void