diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -262,13 +262,10 @@
   if (!LowestAvailableVGPR)
     LowestAvailableVGPR = PreReservedVGPR;

-  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
-  Optional<int> FI;
-  // Check if we are reserving a CSR. Create a stack object for a possible spill
-  // in the function prologue.
-  if (FuncInfo->isCalleeSavedReg(CSRegs, LowestAvailableVGPR))
-    FI = FrameInfo.CreateSpillStackObject(4, Align(4));
+  // Create a stack object for a possible spill in the function prologue.
+  // Note: non-CSR VGPRs also need this, as we may overwrite inactive lanes.
+  Optional<int> FI = FrameInfo.CreateSpillStackObject(4, Align(4));

   // Find saved info about the pre-reserved register.
   const auto *ReservedVGPRInfoItr =
diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -269,6 +269,10 @@
 ; Use a copy to a free SGPR instead of introducing a second CSR VGPR.
 ; GCN-LABEL: {{^}}last_lane_vgpr_for_fp_csr:
 ; GCN: s_waitcnt
+; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
 ; GCN-NEXT: v_writelane_b32 v1, s33, 63
 ; GCN-COUNT-60: v_writelane_b32 v1
 ; GCN: s_mov_b32 s33, s32
@@ -280,11 +284,15 @@
 ; GCN: ;;#ASMSTART
 ; GCN: v_writelane_b32 v1
-; MUBUF: s_add_u32 s32, s32, 0x300
-; MUBUF: s_sub_u32 s32, s32, 0x300
-; FLATSCR: s_add_u32 s32, s32, 12
-; FLATSCR: s_sub_u32 s32, s32, 12
+; MUBUF: s_add_u32 s32, s32, 0x400
+; MUBUF: s_sub_u32 s32, s32, 0x400
+; FLATSCR: s_add_u32 s32, s32, 16
+; FLATSCR: s_sub_u32 s32, s32, 16
 ; GCN-NEXT: v_readlane_b32 s33, v1, 63
+; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define void @last_lane_vgpr_for_fp_csr() #1 {
@@ -306,6 +314,10 @@
 ; Use a copy to a free SGPR instead of introducing a second CSR VGPR.
 ; GCN-LABEL: {{^}}no_new_vgpr_for_fp_csr:
 ; GCN: s_waitcnt
+; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
 ; GCN-COUNT-62: v_writelane_b32 v1,
 ; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
 ; GCN-NEXT: s_mov_b32 s33, s32
@@ -318,12 +330,16 @@
 ; GCN: v_writelane_b32 v1,
 ; MUBUF: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload
-; MUBUF: s_add_u32 s32, s32, 0x300
-; FLATSCR: s_add_u32 s32, s32, 12
+; MUBUF: s_add_u32 s32, s32, 0x400
+; FLATSCR: s_add_u32 s32, s32, 16
 ; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1
-; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300
-; FLATSCR-NEXT: s_sub_u32 s32, s32, 12
+; MUBUF-NEXT: s_sub_u32 s32, s32, 0x400
+; FLATSCR-NEXT: s_sub_u32 s32, s32, 16
 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
+; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define void @no_new_vgpr_for_fp_csr() #1 {
@@ -368,6 +384,10 @@
 ; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp:
 ; GCN: s_waitcnt
+; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
 ; GCN-NEXT: v_writelane_b32 v1, s33, 2
 ; GCN-NEXT: v_writelane_b32 v1, s30, 0
 ; GCN-NEXT: s_mov_b32 s33, s32
@@ -377,15 +397,20 @@
 ; FLATSCR: scratch_store_dword off, [[ZERO]], s33 offset:4
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN: ;;#ASMSTART
-; MUBUF: v_readlane_b32 s4, v1, 0
-; MUBUF-NEXT: s_add_u32 s32, s32, 0x200
+; MUBUF: s_add_u32 s32, s32, 0x300
+; MUBUF-NEXT: v_readlane_b32 s4, v1, 0
 ; MUBUF-NEXT: v_readlane_b32 s5, v1, 1
-; FLATSCR: v_readlane_b32 s0, v1, 0
-; FLATSCR-NEXT: s_add_u32 s32, s32, 8
+; FLATSCR: s_add_u32 s32, s32, 12
+; FLATSCR-NEXT: v_readlane_b32 s0, v1, 0
 ; FLATSCR-NEXT: v_readlane_b32 s1, v1, 1
-; MUBUF-NEXT: s_sub_u32 s32, s32, 0x200
-; FLATSCR-NEXT: s_sub_u32 s32, s32, 8
+; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300
+; FLATSCR-NEXT: s_sub_u32 s32, s32, 12
 ; GCN-NEXT: v_readlane_b32 s33, v1, 2
+; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
+; GCN-NEXT: s_waitcnt vmcnt(0)
 ; MUBUF-NEXT: s_setpc_b64 s[4:5]
 ; FLATSCR-NEXT: s_setpc_b64 s[0:1]
 define void @no_unused_non_csr_sgpr_for_fp() #1 {
@@ -645,9 +670,11 @@
 ; scratch VGPR to hold the offset.
 ; GCN-LABEL: {{^}}spill_fp_to_memory_scratch_reg_needed_mubuf_offset
 ; MUBUF: s_or_saveexec_b64 s[4:5], -1
+; MUBUF: v_mov_b32_e32 v0, 0x1008
+; MUBUF-NEXT: buffer_store_dword v39, v0, s[0:3], s32 offen ; 4-byte Folded Spill
 ; MUBUF: v_mov_b32_e32 v0, s33
-; GCN-NOT: v_mov_b32_e32 v0, 0x1008
-; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1008
+; GCN-NOT: v_mov_b32_e32 v0, 0x100c
+; MUBUF-NEXT: v_mov_b32_e32 v1, 0x100c
 ; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen ; 4-byte Folded Spill
 ; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s33, 0x1004
 ; FLATSCR: v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -293,12 +293,12 @@
 ; GCN-LABEL: spill_bp_to_memory_scratch_reg_needed_mubuf_offset
 ; GCN: s_or_saveexec_b64 s[4:5], -1
 ; GCN: v_mov_b32_e32 v0, s33
-; GCN-NOT: v_mov_b32_e32 v0, 0x1084
-; GCN-NEXT: v_mov_b32_e32 v1, 0x1084
-; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen
-; GCN: v_mov_b32_e32 v0, s34
 ; GCN-NOT: v_mov_b32_e32 v0, 0x1088
 ; GCN-NEXT: v_mov_b32_e32 v1, 0x1088
+; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen
+; GCN: v_mov_b32_e32 v0, s34
+; GCN-NOT: v_mov_b32_e32 v0, 0x108c
+; GCN-NEXT: v_mov_b32_e32 v1, 0x108c
 ; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen
 %local_val = alloca i32, align 128, addrspace(5)
 store volatile i32 %b, i32 addrspace(5)* %local_val, align 128