Index: lib/Target/AMDGPU/SIRegisterInfo.h
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.h
+++ lib/Target/AMDGPU/SIRegisterInfo.h
@@ -73,6 +73,7 @@
 
   unsigned getFrameRegister(const MachineFunction &MF) const override;
 
+  bool canRealignStack(const MachineFunction &MF) const override;
   bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
 
   bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -240,6 +240,21 @@
   return Reserved;
 }
 
+/// Entry functions never realign their stack; every other function defers to
+/// the generic TargetRegisterInfo policy.
+bool SIRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  // On entry, the base address is 0, so it can't possibly need any more
+  // alignment.
+
+  // FIXME: Should be able to specify the entry frame alignment per calling
+  // convention instead.
+  if (Info->isEntryFunction())
+    return false;
+
+  return TargetRegisterInfo::canRealignStack(MF);
+}
+
 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
   const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
   if (Info->isEntryFunction()) {
Index: test/CodeGen/AMDGPU/stack-realign-kernel.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/stack-realign-kernel.ll
@@ -0,0 +1,294 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+
+; Make sure the stack is never realigned for entry functions.
+; Kernel whose only stack object is a 128-byte aligned alloca (attributes #0).
+define amdgpu_kernel void @max_alignment_128() #0 {
+; VI-LABEL: max_alignment_128:
+; VI: ; %bb.0:
+; VI-NEXT: s_add_u32 s4, s4, s7
+; VI-NEXT: v_mov_b32_e32 v0, 9
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s5
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s7 offset:128
+; VI-NEXT: s_endpgm
+; VI-NEXT: .section .rodata,#alloc
+; VI-NEXT: .p2align 6
+; VI-NEXT: .amdhsa_kernel max_alignment_128
+; VI-NEXT: .amdhsa_group_segment_fixed_size 0
+; VI-NEXT: .amdhsa_private_segment_fixed_size 256
+; VI-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
+; VI-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
+; VI-NEXT: .amdhsa_user_sgpr_queue_ptr 0
+; VI-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
+; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 0
+; VI-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
+; VI-NEXT: .amdhsa_user_sgpr_private_segment_size 0
+; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
+; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
+; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
+; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0
+; VI-NEXT: .amdhsa_system_vgpr_workitem_id 0
+; VI-NEXT: .amdhsa_next_free_vgpr 1
+; VI-NEXT: .amdhsa_next_free_sgpr 8
+; VI-NEXT: .amdhsa_reserve_vcc 0
+; VI-NEXT: .amdhsa_float_round_mode_32 0
+; VI-NEXT: .amdhsa_float_round_mode_16_64 0
+; VI-NEXT: .amdhsa_float_denorm_mode_32 0
+; VI-NEXT: .amdhsa_float_denorm_mode_16_64 3
+; VI-NEXT: .amdhsa_dx10_clamp 1
+; VI-NEXT: .amdhsa_ieee_mode 1
+; VI-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0
+; VI-NEXT: .amdhsa_exception_fp_denorm_src 0
+; VI-NEXT: .amdhsa_exception_fp_ieee_div_zero 0
+; VI-NEXT: .amdhsa_exception_fp_ieee_overflow 0
+; VI-NEXT: .amdhsa_exception_fp_ieee_underflow 0
+; VI-NEXT: .amdhsa_exception_fp_ieee_inexact 0
+; VI-NEXT: .amdhsa_exception_int_div_zero 0
+; VI-NEXT: .end_amdhsa_kernel
+; VI-NEXT: .text
+;
+; GFX9-LABEL: max_alignment_128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, 9
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s7 offset:128
+; GFX9-NEXT: s_endpgm
+; GFX9-NEXT: .section .rodata,#alloc
+; GFX9-NEXT: .p2align 6
+; GFX9-NEXT: .amdhsa_kernel max_alignment_128
+; GFX9-NEXT: .amdhsa_group_segment_fixed_size 0
+; GFX9-NEXT: .amdhsa_private_segment_fixed_size 256
+; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
+; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
+; GFX9-NEXT: .amdhsa_user_sgpr_queue_ptr 0
+; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
+; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 0
+; GFX9-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
+; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_size 0
+; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
+; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
+; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
+; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0
+; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 0
+; GFX9-NEXT: .amdhsa_next_free_vgpr 1
+; GFX9-NEXT: .amdhsa_next_free_sgpr 8
+; GFX9-NEXT: .amdhsa_reserve_vcc 0
+; GFX9-NEXT: .amdhsa_float_round_mode_32 0
+; GFX9-NEXT: .amdhsa_float_round_mode_16_64 0
+; GFX9-NEXT: .amdhsa_float_denorm_mode_32 0
+; GFX9-NEXT: .amdhsa_float_denorm_mode_16_64 3
+; GFX9-NEXT: .amdhsa_dx10_clamp 1
+; GFX9-NEXT: .amdhsa_ieee_mode 1
+; GFX9-NEXT: .amdhsa_fp16_overflow 0
+; GFX9-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0
+; GFX9-NEXT: .amdhsa_exception_fp_denorm_src 0
+; GFX9-NEXT: .amdhsa_exception_fp_ieee_div_zero 0
+; GFX9-NEXT: .amdhsa_exception_fp_ieee_overflow 0
+; GFX9-NEXT: .amdhsa_exception_fp_ieee_underflow 0
+; GFX9-NEXT: .amdhsa_exception_fp_ieee_inexact 0
+; GFX9-NEXT: .amdhsa_exception_int_div_zero 0
+; GFX9-NEXT: .end_amdhsa_kernel
+; GFX9-NEXT: .text
+  %alloca.align = alloca i32, align 128, addrspace(5)
+  store volatile i32 9, i32 addrspace(5)* %alloca.align, align 128
+  ret void
+}
+; Kernel carrying the "stackrealign" attribute (attributes #1).
+define amdgpu_kernel void @stackrealign_attr() #1 {
+; VI-LABEL: stackrealign_attr:
+; VI: ; %bb.0:
+; VI-NEXT: s_add_u32 s4, s4, s7
+; VI-NEXT: v_mov_b32_e32 v0, 9
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s5
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s7 offset:4
+; VI-NEXT: s_endpgm
+; VI-NEXT: .section .rodata,#alloc
+; VI-NEXT: .p2align 6
+; VI-NEXT: .amdhsa_kernel stackrealign_attr
+; VI-NEXT: .amdhsa_group_segment_fixed_size 0
+; VI-NEXT: .amdhsa_private_segment_fixed_size 8
+; VI-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
+; VI-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
+; VI-NEXT: .amdhsa_user_sgpr_queue_ptr 0
+; VI-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
+; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 0
+; VI-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
+; VI-NEXT: .amdhsa_user_sgpr_private_segment_size 0
+; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
+; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
+; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
+; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0
+; VI-NEXT: .amdhsa_system_vgpr_workitem_id 0
+; VI-NEXT: .amdhsa_next_free_vgpr 1
+; VI-NEXT: .amdhsa_next_free_sgpr 8
+; VI-NEXT: .amdhsa_reserve_vcc 0
+; VI-NEXT: .amdhsa_float_round_mode_32 0
+; VI-NEXT: .amdhsa_float_round_mode_16_64 0
+; VI-NEXT: .amdhsa_float_denorm_mode_32 0
+; VI-NEXT: .amdhsa_float_denorm_mode_16_64 3
+; VI-NEXT: .amdhsa_dx10_clamp 1
+; VI-NEXT: .amdhsa_ieee_mode 1
+; VI-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0
+; VI-NEXT: .amdhsa_exception_fp_denorm_src 0
+; VI-NEXT: .amdhsa_exception_fp_ieee_div_zero 0
+; VI-NEXT: .amdhsa_exception_fp_ieee_overflow 0
+; VI-NEXT: .amdhsa_exception_fp_ieee_underflow 0
+; VI-NEXT: .amdhsa_exception_fp_ieee_inexact 0
+; VI-NEXT: .amdhsa_exception_int_div_zero 0
+; VI-NEXT: .end_amdhsa_kernel
+; VI-NEXT: .text
+;
+; GFX9-LABEL: stackrealign_attr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, 9
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s7 offset:4
+; GFX9-NEXT: s_endpgm
+; GFX9-NEXT: .section .rodata,#alloc
+; GFX9-NEXT: .p2align 6
+; GFX9-NEXT: .amdhsa_kernel stackrealign_attr
+; GFX9-NEXT: .amdhsa_group_segment_fixed_size 0
+; GFX9-NEXT: .amdhsa_private_segment_fixed_size 8
+; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
+; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
+; GFX9-NEXT: .amdhsa_user_sgpr_queue_ptr 0
+; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
+; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 0
+; GFX9-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
+; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_size 0
+; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
+; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
+; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
+; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0
+; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 0
+; GFX9-NEXT: .amdhsa_next_free_vgpr 1
+; GFX9-NEXT: .amdhsa_next_free_sgpr 8
+; GFX9-NEXT: .amdhsa_reserve_vcc 0
+; GFX9-NEXT: .amdhsa_float_round_mode_32 0
+; GFX9-NEXT: .amdhsa_float_round_mode_16_64 0
+; GFX9-NEXT: .amdhsa_float_denorm_mode_32 0
+; GFX9-NEXT: .amdhsa_float_denorm_mode_16_64 3
+; GFX9-NEXT: .amdhsa_dx10_clamp 1
+; GFX9-NEXT: .amdhsa_ieee_mode 1
+; GFX9-NEXT: .amdhsa_fp16_overflow 0
+; GFX9-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0
+; GFX9-NEXT: .amdhsa_exception_fp_denorm_src 0
+; GFX9-NEXT: .amdhsa_exception_fp_ieee_div_zero 0
+; GFX9-NEXT: .amdhsa_exception_fp_ieee_overflow 0
+; GFX9-NEXT: .amdhsa_exception_fp_ieee_underflow 0
+; GFX9-NEXT: .amdhsa_exception_fp_ieee_inexact 0
+; GFX9-NEXT: .amdhsa_exception_int_div_zero 0
+; GFX9-NEXT: .end_amdhsa_kernel
+; GFX9-NEXT: .text
+  %alloca.align = alloca i32, align 4, addrspace(5)
+  store volatile i32 9, i32 addrspace(5)* %alloca.align, align 4
+  ret void
+}
+; Kernel carrying alignstack=128 (attributes #2).
+define amdgpu_kernel void @alignstack_attr() #2 {
+; VI-LABEL: alignstack_attr:
+; VI: ; %bb.0:
+; VI-NEXT: s_add_u32 s4, s4, s7
+; VI-NEXT: v_mov_b32_e32 v0, 9
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s5
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s7 offset:4
+; VI-NEXT: s_endpgm
+; VI-NEXT: .section .rodata,#alloc
+; VI-NEXT: .p2align 6
+; VI-NEXT: .amdhsa_kernel alignstack_attr
+; VI-NEXT: .amdhsa_group_segment_fixed_size 0
+; VI-NEXT: .amdhsa_private_segment_fixed_size 128
+; VI-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
+; VI-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
+; VI-NEXT: .amdhsa_user_sgpr_queue_ptr 0
+; VI-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
+; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 0
+; VI-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
+; VI-NEXT: .amdhsa_user_sgpr_private_segment_size 0
+; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
+; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
+; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
+; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0
+; VI-NEXT: .amdhsa_system_vgpr_workitem_id 0
+; VI-NEXT: .amdhsa_next_free_vgpr 1
+; VI-NEXT: .amdhsa_next_free_sgpr 8
+; VI-NEXT: .amdhsa_reserve_vcc 0
+; VI-NEXT: .amdhsa_float_round_mode_32 0
+; VI-NEXT: .amdhsa_float_round_mode_16_64 0
+; VI-NEXT: .amdhsa_float_denorm_mode_32 0
+; VI-NEXT: .amdhsa_float_denorm_mode_16_64 3
+; VI-NEXT: .amdhsa_dx10_clamp 1
+; VI-NEXT: .amdhsa_ieee_mode 1
+; VI-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0
+; VI-NEXT: .amdhsa_exception_fp_denorm_src 0
+; VI-NEXT: .amdhsa_exception_fp_ieee_div_zero 0
+; VI-NEXT: .amdhsa_exception_fp_ieee_overflow 0
+; VI-NEXT: .amdhsa_exception_fp_ieee_underflow 0
+; VI-NEXT: .amdhsa_exception_fp_ieee_inexact 0
+; VI-NEXT: .amdhsa_exception_int_div_zero 0
+; VI-NEXT: .end_amdhsa_kernel
+; VI-NEXT: .text
+;
+; GFX9-LABEL: alignstack_attr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, 9
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s7 offset:4
+; GFX9-NEXT: s_endpgm
+; GFX9-NEXT: .section .rodata,#alloc
+; GFX9-NEXT: .p2align 6
+; GFX9-NEXT: .amdhsa_kernel alignstack_attr
+; GFX9-NEXT: .amdhsa_group_segment_fixed_size 0
+; GFX9-NEXT: .amdhsa_private_segment_fixed_size 128
+; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
+; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
+; GFX9-NEXT: .amdhsa_user_sgpr_queue_ptr 0
+; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
+; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 0
+; GFX9-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
+; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_size 0
+; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
+; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
+; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
+; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0
+; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 0
+; GFX9-NEXT: .amdhsa_next_free_vgpr 1
+; GFX9-NEXT: .amdhsa_next_free_sgpr 8
+; GFX9-NEXT: .amdhsa_reserve_vcc 0
+; GFX9-NEXT: .amdhsa_float_round_mode_32 0
+; GFX9-NEXT: .amdhsa_float_round_mode_16_64 0
+; GFX9-NEXT: .amdhsa_float_denorm_mode_32 0
+; GFX9-NEXT: .amdhsa_float_denorm_mode_16_64 3
+; GFX9-NEXT: .amdhsa_dx10_clamp 1
+; GFX9-NEXT: .amdhsa_ieee_mode 1
+; GFX9-NEXT: .amdhsa_fp16_overflow 0
+; GFX9-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0
+; GFX9-NEXT: .amdhsa_exception_fp_denorm_src 0
+; GFX9-NEXT: .amdhsa_exception_fp_ieee_div_zero 0
+; GFX9-NEXT: .amdhsa_exception_fp_ieee_overflow 0
+; GFX9-NEXT: .amdhsa_exception_fp_ieee_underflow 0
+; GFX9-NEXT: .amdhsa_exception_fp_ieee_inexact 0
+; GFX9-NEXT: .amdhsa_exception_int_div_zero 0
+; GFX9-NEXT: .end_amdhsa_kernel
+; GFX9-NEXT: .text
+  %alloca.align = alloca i32, align 4, addrspace(5)
+  store volatile i32 9, i32 addrspace(5)* %alloca.align, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "stackrealign" }
+attributes #2 = { nounwind alignstack=128 }
Index: test/CodeGen/AMDGPU/stack-realign.ll
===================================================================
--- test/CodeGen/AMDGPU/stack-realign.ll
+++ test/CodeGen/AMDGPU/stack-realign.ll
@@ -120,6 +120,32 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}default_realign_align128:
+; GCN: s_add_u32 [[TMP:s[0-9]+]], s32, 0x1fc0
+; GCN-NEXT: s_and_b32 s5, [[TMP]], 0xffffe000
+; GCN-NEXT: s_add_u32 s32, s32, 0x6000
+; GCN-NOT: s5
+; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:128
+; GCN: s_sub_u32 s32, s32, 0x6000
+define void @default_realign_align128(i32 %idx) #0 {
+  %alloca.align = alloca i32, align 128, addrspace(5)
+  store volatile i32 9, i32 addrspace(5)* %alloca.align, align 128
+  ret void
+}
+; Non-kernel function with "no-realign-stack" (attributes #3).
+; GCN-LABEL: {{^}}disable_realign_align128:
+; GCN-NOT: s32
+; GCN: s_mov_b32 s5, s32
+; GCN-NOT: s32
+; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:16
+; GCN-NOT: s32
+define void @disable_realign_align128(i32 %idx) #3 {
+  %alloca.align = alloca i32, align 128, addrspace(5)
+  store volatile i32 9, i32 addrspace(5)* %alloca.align, align 128
+  ret void
+}
+
 attributes #0 = { noinline nounwind }
 attributes #1 = { noinline nounwind "stackrealign" }
 attributes #2 = { noinline nounwind alignstack=4 }
+attributes #3 = { noinline nounwind "no-realign-stack" }