Index: llvm/lib/Target/AMDGPU/SIFrameLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -71,6 +71,8 @@
 public:
   bool hasFP(const MachineFunction &MF) const override;
+
+  bool requiresStackPointerReference(const MachineFunction &MF) const;
 };
 
 } // end namespace llvm
Index: llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -447,7 +447,7 @@
   }
 
   assert(ScratchWaveOffsetReg);
-  if (MF.getFrameInfo().hasCalls()) {
+  if (requiresStackPointerReference(MF)) {
     Register SPReg = MFI->getStackPtrOffsetReg();
     assert(SPReg != AMDGPU::SP_REG);
     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
@@ -1262,6 +1262,20 @@
   return MBB.erase(I);
 }
 
+/// Returns true if the frame will require a reference to the stack pointer.
+///
+/// This is the set of conditions common to setting up the stack pointer in a
+/// kernel, and for using a frame pointer in a callable function.
+///
+/// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
+/// references SP.
+static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
+  return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
+}
+
+// The FP for kernels is always known 0, so we never really need to set up an
+// explicit register for it. However, DisableFramePointerElim will force us to
+// use a register for it.
 bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
 
@@ -1277,8 +1291,31 @@
     return MFI.getStackSize() != 0;
   }
 
-  return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
-         MFI.hasStackMap() || MFI.hasPatchPoint() ||
+  return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
          MF.getSubtarget().getRegisterInfo()->needsStackRealignment(MF) ||
          MF.getTarget().Options.DisableFramePointerElim(MF);
 }
+
+// This is essentially a reduced version of hasFP for entry functions. Since the
+// stack pointer is known 0 on entry to kernels, we never really need an FP
+// register. We may need to initialize the stack pointer depending on the frame
+// properties, which logically overlaps many of the cases where an ordinary
+// function would require an FP.
+bool SIFrameLowering::requiresStackPointerReference(
+    const MachineFunction &MF) const {
+  // Callable functions always require a stack pointer reference.
+  assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
+         "only expected to call this for entry points");
+
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // Entry points ordinarily don't need to initialize SP. We have to set it up
+  // for callees if there are any. Also note tail calls are impossible/don't
+  // make any sense for kernels.
+  if (MFI.hasCalls())
+    return true;
+
+  // We still need to initialize the SP if we're doing anything weird that
+  // references the SP, like variable sized stack objects.
+  return frameTriviallyRequiresSP(MFI);
+}
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
@@ -15,6 +15,7 @@
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl2_add_u32 s4, s4, 15
 ; GFX9-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-NEXT:    s_movk_i32 s32, 0x400
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 6
 ; GFX9-NEXT:    s_add_u32 s4, s32, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -26,6 +27,7 @@
 ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align4:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_add_u32 s6, s6, s9
+; GFX10-NEXT:    s_movk_i32 s32, 0x200
 ; GFX10-NEXT:    s_mov_b32 s33, 0
 ; GFX10-NEXT:    s_addc_u32 s7, s7, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
@@ -117,6 +119,7 @@
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl2_add_u32 s4, s4, 15
 ; GFX9-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-NEXT:    s_movk_i32 s32, 0x400
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 6
 ; GFX9-NEXT:    s_add_u32 s4, s32, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -128,6 +131,7 @@
 ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_add_u32 s6, s6, s9
+; GFX10-NEXT:    s_movk_i32 s32, 0x200
 ; GFX10-NEXT:    s_mov_b32 s33, 0
 ; GFX10-NEXT:    s_addc_u32 s7, s7, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
@@ -219,6 +223,7 @@
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl2_add_u32 s4, s4, 15
 ; GFX9-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-NEXT:    s_movk_i32 s32, 0x800
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 6
 ; GFX9-NEXT:    s_add_u32 s4, s32, s4
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0xfffff800
@@ -231,6 +236,7 @@
 ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_add_u32 s6, s6, s9
+; GFX10-NEXT:    s_movk_i32 s32, 0x400
 ; GFX10-NEXT:    s_mov_b32 s33, 0
 ; GFX10-NEXT:    s_addc_u32 s7, s7, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
@@ -18,12 +18,13 @@
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
 ; GCN-NEXT:    s_add_u32 s0, s0, s9
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-NEXT:    s_mov_b32 s33, 0
+; GCN-NEXT:    s_movk_i32 s32, 0x400
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
 ; GCN-NEXT:    s_cselect_b32 s6, 1, 0
 ; GCN-NEXT:    s_and_b32 s6, s6, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
+; GCN-NEXT:    s_mov_b32 s33, 0
 ; GCN-NEXT:    s_cbranch_scc1 BB0_3
 ; GCN-NEXT:  ; %bb.1: ; %bb.0
 ; GCN-NEXT:    s_load_dword s6, s[4:5], 0xc
@@ -99,12 +100,13 @@
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
 ; GCN-NEXT:    s_add_u32 s0, s0, s9
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-NEXT:    s_mov_b32 s33, 0
+; GCN-NEXT:    s_movk_i32 s32, 0x1000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
 ; GCN-NEXT:    s_cselect_b32 s6, 1, 0
 ; GCN-NEXT:    s_and_b32 s6, s6, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
+; GCN-NEXT:    s_mov_b32 s33, 0
 ; GCN-NEXT:    s_cbranch_scc1 BB1_2
 ; GCN-NEXT:  ; %bb.1: ; %bb.0
 ; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
Index: llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -18,6 +18,7 @@
 ; GCN-NEXT:    s_add_u32 s0, s0, s9
 ; GCN-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_movk_i32 s32, 0x400
 ; GCN-NEXT:    s_mov_b32 s33, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s8, 0
@@ -89,6 +90,7 @@
 ; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
 ; GCN-NEXT:    s_add_u32 s0, s0, s9
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_movk_i32 s32, 0x1000
 ; GCN-NEXT:    s_mov_b32 s33, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
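
Note: to illustrate the new requiresStackPointerReference() path, here is a minimal sketch of the kind of kernel the updated tests cover (a hypothetical input, not copied from the test files, and assuming the usual amdgcn-amd-amdhsa run lines). The dynamically sized alloca makes MFI.hasVarSizedObjects() true, so frameTriviallyRequiresSP() forces s32 to be initialized even though the kernel makes no calls and still needs no frame pointer:

; The element count is only known at runtime, so this is a variable-sized
; stack object. MachineFrameInfo::hasVarSizedObjects() becomes true, and the
; prologue must now emit the "s_movk_i32 s32, ..." seen in the updated checks
; above, even though no calls are present.
define amdgpu_kernel void @kernel_dynamic_stackalloc(i32 %n) {
  %alloca = alloca i32, i32 %n, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %alloca
  ret void
}

Before this change, only MFI.hasCalls() triggered the SP setup, so a kernel like this reached the dynamic-alloca expansion (which adds to s32) with s32 uninitialized.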