Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -550,8 +550,13 @@ // Scratch is allocated in 256 dword per wave blocks for the entire // wavefront. When viewed from the perspecive of an arbitrary workitem, this // is 4-byte aligned. + // + // Only 4-byte alignment is really needed to access anything. Transformations + // on the pointer value itself may rely on the alignment / known low bits of + // the pointer. Set this to something above the minimum to avoid needing + // dynamic realignment in common cases. unsigned getStackAlignment() const { - return 4; + return 16; } bool enableMachineScheduler() const override { Index: test/CodeGen/AMDGPU/byval-frame-setup.ll =================================================================== --- test/CodeGen/AMDGPU/byval-frame-setup.ll +++ test/CodeGen/AMDGPU/byval-frame-setup.ll @@ -34,7 +34,7 @@ ; GCN-DAG: buffer_store_dword v33 ; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32 ; GCN-DAG: v_writelane_b32 -; GCN-DAG: s_add_u32 s32, s32, 0xb00{{$}} +; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}} ; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:4{{$}} ; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]] ; GCN-DAG: buffer_store_dword [[ADD0]], off, s[0:3], s5 offset:4{{$}} @@ -50,7 +50,7 @@ ; GCN-NOT: v_readlane_b32 s32 ; GCN: buffer_load_dword v32, ; GCN: buffer_load_dword v33, -; GCN: s_sub_u32 s32, s32, 0xb00{{$}} +; GCN: s_sub_u32 s32, s32, 0xc00{{$}} ; GCN: s_setpc_b64 define void @void_func_byval_struct_non_leaf(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 { entry: @@ -130,7 +130,7 @@ ; GCN-LABEL: {{^}}call_void_func_byval_struct_kernel: ; GCN: s_mov_b32 s33, s7 -; GCN: s_add_u32 s32, s33, 0xa00{{$}} +; GCN: s_add_u32 s32, s33, 0xc00{{$}} ; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 ; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13 Index: test/CodeGen/AMDGPU/call-argument-types.ll =================================================================== --- test/CodeGen/AMDGPU/call-argument-types.ll +++ test/CodeGen/AMDGPU/call-argument-types.ll @@ -475,8 +475,8 @@ } ; GCN-LABEL: {{^}}test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: -; MESA-DAG: s_add_u32 [[SP:s[0-9]+]], [[FP_REG:s[0-9]+]], 0x600{{$}} -; HSA-DAG: s_add_u32 [[SP:s[0-9]+]], [[FP_REG:s[0-9]+]], 0x600{{$}} +; MESA-DAG: s_add_u32 [[SP:s[0-9]+]], [[FP_REG:s[0-9]+]], 0x800{{$}} +; HSA-DAG: s_add_u32 [[SP:s[0-9]+]], [[FP_REG:s[0-9]+]], 0x800{{$}} ; GCN-DAG: v_mov_b32_e32 [[VAL0:v[0-9]+]], 3 ; GCN-DAG: v_mov_b32_e32 [[VAL1:v[0-9]+]], 8 Index: test/CodeGen/AMDGPU/call-graph-register-usage.ll =================================================================== --- test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -146,7 +146,7 @@ } ; GCN-LABEL: {{^}}indirect_use_stack: -; GCN: ScratchSize: 2124 +; GCN: ScratchSize: 2132 define void @indirect_use_stack() #1 { %alloca = alloca [16 x i32], align 4, addrspace(5) call void asm sideeffect "; use $0", "v"([16 x i32] addrspace(5)* %alloca) #0 @@ -156,7 +156,7 @@ ; GCN-LABEL: {{^}}indirect_2_level_use_stack: ; GCN: is_dynamic_callstack = 0 -; GCN: ScratchSize: 2124 +; GCN: ScratchSize: 2132 define amdgpu_kernel void @indirect_2_level_use_stack() #0 { call void @indirect_use_stack() ret void @@ -199,7 +199,7 @@ } ; GCN-LABEL: {{^}}direct_recursion_use_stack: -; GCN: ScratchSize: 2056 +; GCN: ScratchSize: 2064 define void @direct_recursion_use_stack(i32 %val) #2 { %alloca = alloca [512 x i32], align 4, addrspace(5) call void asm sideeffect "; use $0", "v"([512 x i32] addrspace(5)* %alloca) #0 @@ -218,7 +218,7 @@ ; GCN-LABEL: {{^}}usage_direct_recursion: ; GCN: is_ptr64 = 1 ; GCN: is_dynamic_callstack = 1 -; GCN: workitem_private_segment_byte_size = 2056 +; GCN: workitem_private_segment_byte_size = 2064 define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 { call void @direct_recursion_use_stack(i32 %n) ret void Index: test/CodeGen/AMDGPU/callee-frame-setup.ll =================================================================== --- test/CodeGen/AMDGPU/callee-frame-setup.ll +++ test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -42,7 +42,7 @@ ; GCN-DAG: v_writelane_b32 v32, s33, ; GCN-DAG: v_writelane_b32 v32, s34, ; GCN-DAG: v_writelane_b32 v32, s35, -; GCN-DAG: s_add_u32 s32, s32, 0x300{{$}} +; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}} ; GCN-DAG: v_mov_b32_e32 v0, 0{{$}} ; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}} ; GCN-DAG: s_mov_b32 s33, s5 @@ -82,7 +82,7 @@ ; GCN-DAG: v_readlane_b32 s34, v32, 1 ; GCN-DAG: v_readlane_b32 s33, v32, 0 ; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 -; GCN: s_sub_u32 s32, s32, 0x200 +; GCN: s_sub_u32 s32, s32, 0x400 ; GCN: s_setpc_b64 define void @callee_no_stack_with_call() #0 { Index: test/CodeGen/AMDGPU/callee-special-input-sgprs.ll =================================================================== --- test/CodeGen/AMDGPU/callee-special-input-sgprs.ll +++ test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -558,7 +558,7 @@ ; GCN-LABEL: {{^}}func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill: ; GCN: s_mov_b32 s5, s32 -; GCN: s_add_u32 s32, s32, 0x300 +; GCN: s_add_u32 s32, s32, 0x400 ; GCN-DAG: s_mov_b32 [[SAVE_X:s[0-57-9][0-9]*]], s14 ; GCN-DAG: s_mov_b32 [[SAVE_Y:s[0-68-9][0-9]*]], s15 Index: test/CodeGen/AMDGPU/callee-special-input-vgprs.ll =================================================================== --- test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -423,7 +423,7 @@ ; GCN: enable_vgpr_workitem_id = 0 ; GCN: s_mov_b32 s33, s7 -; GCN: s_add_u32 s32, s33, 0x200{{$}} +; GCN: s_add_u32 s32, s33, 0x400{{$}} ; GCN-NOT: s32 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} Index: test/CodeGen/AMDGPU/nested-calls.ll =================================================================== --- test/CodeGen/AMDGPU/nested-calls.ll +++ test/CodeGen/AMDGPU/nested-calls.ll @@ -12,7 +12,7 @@ ; GCN: s_mov_b32 s5, s32 ; Spill CSR VGPR used for SGPR spilling ; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 -; GCN-DAG: s_add_u32 s32, s32, 0x200 +; GCN-DAG: s_add_u32 s32, s32, 0x400 ; GCN-DAG: v_writelane_b32 v32, s33, 0 ; GCN-DAG: v_writelane_b32 v32, s34, 1 ; GCN-DAG: v_writelane_b32 v32, s35, 2 @@ -23,7 +23,7 @@ ; GCN: v_readlane_b32 s34, v32, 1 ; GCN: v_readlane_b32 s33, v32, 0 ; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 -; GCN: s_sub_u32 s32, s32, 0x200 +; GCN: s_sub_u32 s32, s32, 0x400 ; GCN: s_setpc_b64 define void @test_func_call_external_void_func_i32_imm() #0 { call void @external_void_func_i32(i32 42) @@ -33,10 +33,10 @@ ; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm_stack_use: ; GCN: s_waitcnt ; GCN: s_mov_b32 s5, s32 -; GCN: s_add_u32 s32, s32, 0x1200{{$}} +; GCN: s_add_u32 s32, s32, 0x1400{{$}} ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset ; GCN: s_swappc_b64 -; GCN: s_sub_u32 s32, s32, 0x1200{{$}} +; GCN: s_sub_u32 s32, s32, 0x1400{{$}} ; GCN: s_setpc_b64 define void @test_func_call_external_void_func_i32_imm_stack_use() #0 { %alloca = alloca [16 x i32], align 4, addrspace(5) Index: test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll =================================================================== --- test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll +++ test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll @@ -80,7 +80,7 @@ ; ASM: buffer_store_dword ; ASM: buffer_store_dword ; ASM: s_swappc_b64 -; ASM: ScratchSize: 16396 +; ASM: ScratchSize: 16400 define amdgpu_kernel void @call_private(i32 addrspace(1)* %out, i32 %in) #0 { entry: %tmp = alloca [2 x i32], addrspace(5)