diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -20,18 +20,10 @@
 
 #define DEBUG_TYPE "frame-info"
 
-
-// Find a scratch register that we can use at the start of the prologue to
-// re-align the stack pointer. We avoid using callee-save registers since they
-// may appear to be free when this is called from canUseAsPrologue (during
-// shrink wrapping), but then no longer be free when this is called from
-// emitPrologue.
-//
-// FIXME: This is a bit conservative, since in the above case we could use one
-// of the callee-save registers as a scratch temp to re-align the stack pointer,
-// but we would then have to make sure that we were in fact saving at least one
-// callee-save register in the prologue, which is additional complexity that
-// doesn't seem worth the benefit.
+// Find a scratch register that we can use in the prologue. We avoid using
+// callee-save registers since they may appear to be free when this is called
+// from canUseAsPrologue (during shrink wrapping), but then no longer be free
+// when this is called from emitPrologue.
 static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
                                                    LivePhysRegs &LiveRegs,
                                                    const TargetRegisterClass &RC,
@@ -55,12 +47,6 @@
     }
   }
 
-  // If we require an unused register, this is used in contexts where failure is
-  // an option and has an alternative plan. In other contexts, this must
-  // succeed.
-  if (!Unused)
-    report_fatal_error("failed to find free scratch register");
-
   return MCRegister();
 }
 
@@ -178,37 +164,72 @@
     MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
         MF->getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0RegClass);
 
+    bool HasOffsetReg = OffsetReg;
+    if (!HasOffsetReg) {
+      // No free register, use stack pointer and restore afterwards.
+      OffsetReg = SPReg;
+    }
+
     BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_ADD_U32), OffsetReg)
         .addReg(SPReg)
         .addImm(Offset);
 
     BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_STORE_DWORD_SADDR))
-        .addReg(SpillReg, RegState::Kill)
-        .addReg(OffsetReg, RegState::Kill)
-        .addImm(0)
-        .addImm(0) // glc
-        .addImm(0) // slc
-        .addImm(0) // dlc
-        .addMemOperand(MMO);
+        .addReg(SpillReg, RegState::Kill)
+        .addReg(OffsetReg, HasOffsetReg ? RegState::Kill : 0)
+        .addImm(0) // offset
+        .addImm(0) // glc
+        .addImm(0) // slc
+        .addImm(0) // dlc
+        .addMemOperand(MMO);
+
+    if (!HasOffsetReg) {
+      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_SUB_U32), OffsetReg)
+          .addReg(SPReg)
+          .addImm(Offset);
+    }
   } else {
     MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
         MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
 
-    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
-        .addImm(Offset);
+    if (OffsetReg) {
+      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
+          .addImm(Offset);
 
-    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN))
-        .addReg(SpillReg, RegState::Kill)
-        .addReg(OffsetReg, RegState::Kill)
-        .addReg(ScratchRsrcReg)
-        .addReg(SPReg)
-        .addImm(0)
-        .addImm(0) // glc
-        .addImm(0) // slc
-        .addImm(0) // tfe
-        .addImm(0) // dlc
-        .addImm(0) // swz
-        .addMemOperand(MMO);
+      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN))
+          .addReg(SpillReg, RegState::Kill)
+          .addReg(OffsetReg, RegState::Kill)
+          .addReg(ScratchRsrcReg)
+          .addReg(SPReg)
+          .addImm(0) // offset
+          .addImm(0) // glc
+          .addImm(0) // slc
+          .addImm(0) // tfe
+          .addImm(0) // dlc
+          .addImm(0) // swz
+          .addMemOperand(MMO);
+    } else {
+      // No free register, use stack pointer and restore afterwards.
+      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_ADD_U32), SPReg)
+          .addReg(SPReg)
+          .addImm(Offset);
+
+      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
+          .addReg(SpillReg, RegState::Kill)
+          .addReg(ScratchRsrcReg)
+          .addReg(SPReg)
+          .addImm(0) // offset
+          .addImm(0) // glc
+          .addImm(0) // slc
+          .addImm(0) // tfe
+          .addImm(0) // dlc
+          .addImm(0) // swz
+          .addMemOperand(MMO);
+
+      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_SUB_U32), SPReg)
+          .addReg(SPReg)
+          .addImm(Offset);
+    }
   }
 
   LiveRegs.removeReg(SpillReg);
@@ -241,19 +262,21 @@
     }
     MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
         MF->getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0RegClass);
+    if (!OffsetReg)
+      report_fatal_error("failed to find free scratch register");
 
-    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_ADD_U32), OffsetReg)
+    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_ADD_U32), OffsetReg)
       .addReg(SPReg)
      .addImm(Offset);
 
-    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_LOAD_DWORD_SADDR),
-            SpillReg)
-      .addReg(OffsetReg, RegState::Kill)
-      .addImm(0)
-      .addImm(0) // glc
-      .addImm(0) // slc
-      .addImm(0) // dlc
-      .addMemOperand(MMO);
-    return;
+    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_LOAD_DWORD_SADDR),
+            SpillReg)
+        .addReg(OffsetReg, RegState::Kill)
+        .addImm(0)
+        .addImm(0) // glc
+        .addImm(0) // slc
+        .addImm(0) // dlc
+        .addMemOperand(MMO);
+    return;
   }
 
   if (SIInstrInfo::isLegalMUBUFImmOffset(Offset)) {
@@ -273,6 +296,8 @@
 
   MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
       MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
+  if (!OffsetReg)
+    report_fatal_error("failed to find free scratch register");
 
   BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
     .addImm(Offset);
@@ -821,6 +846,8 @@
 
       ScratchExecCopy = findScratchNonCalleeSaveRegister(
           MRI, LiveRegs, *TRI.getWaveMaskRegClass());
+      if (!ScratchExecCopy)
+        report_fatal_error("failed to find free scratch register");
 
       if (!IsProlog)
        LiveRegs.removeReg(ScratchExecCopy);
@@ -903,6 +930,8 @@
 
     MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
         MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+    if (!TmpVGPR)
+      report_fatal_error("failed to find free scratch register");
 
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
       .addReg(FramePtrReg);
@@ -920,6 +949,8 @@
 
     MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
         MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+    if (!TmpVGPR)
+      report_fatal_error("failed to find free scratch register");
 
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
       .addReg(BasePtrReg);
@@ -1140,6 +1171,8 @@
 
     MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
         MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+    if (!TempVGPR)
+      report_fatal_error("failed to find free scratch register");
     buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, TempVGPR,
                       FuncInfo->getScratchRSrcReg(), StackPtrReg, FI);
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg)
@@ -1165,6 +1198,8 @@
 
     MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
         MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+    if (!TempVGPR)
+      report_fatal_error("failed to find free scratch register");
     buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, TempVGPR,
                       FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI);
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg)
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -289,8 +289,6 @@
   assert(Size >= 4 && "invalid sgpr spill size");
   assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs");
 
-  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
-
   // Make sure to handle the case where a wide SGPR spill may span between two
   // VGPRs.
   for (unsigned I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
@@ -316,13 +314,14 @@
 
         return false;
       }
 
-      Optional<int> CSRSpillFI;
-      if ((FrameInfo.hasCalls() || !isEntryFunction()) && CSRegs &&
-          isCalleeSavedReg(CSRegs, LaneVGPR)) {
-        CSRSpillFI = FrameInfo.CreateSpillStackObject(4, Align(4));
+      Optional<int> SpillFI;
+      // We need to preserve inactive lanes, so always save, even caller-save
+      // registers.
+      if (!isEntryFunction()) {
+        SpillFI = FrameInfo.CreateSpillStackObject(4, Align(4));
       }
 
-      SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, CSRSpillFI));
+      SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, SpillFI));
 
       // Add this register as live-in to all blocks to avoid machine verifier
       // complaining about use of an undefined physical register.
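The test updates below all check the same new prolog/epilog shape: the VGPR holding SGPR spill lanes is now stored and reloaded with every lane forced active, so lanes that are inactive on entry to the function are preserved for the caller as well. As a rough C++ sketch of the emitted pattern, pieced together from the BuildMI calls in this patch (wave64 opcodes assumed; the scratch wave-mask register comes from findScratchNonCalleeSaveRegister as above, and this is an illustration, not the literal upstream helper):

    // Save exec and turn all lanes on, so the store below also covers the
    // inactive lanes of the spill VGPR.
    MCRegister ScratchExecCopy = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, *TRI.getWaveMaskRegClass());
    if (!ScratchExecCopy)
      report_fatal_error("failed to find free scratch register");
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), ScratchExecCopy)
        .addImm(-1); // ScratchExecCopy = exec; exec |= -1 (all lanes active)
    // ... emit the buffer_store/scratch_store of the spill VGPR here ...
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
        .addReg(ScratchExecCopy, RegState::Kill); // restore the entry exec mask

On wave32 targets the same idea uses the 32-bit forms, which is what the s_or_saveexec_b32/exec_lo sequences in the GFX10 checks below match.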
diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -214,11 +214,20 @@
 
 ; GCN-LABEL: {{^}}spill_only_csr_sgpr:
 ; GCN: s_waitcnt
+; GCN-NEXT: s_or_saveexec_b64
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec,
 ; GCN-NEXT: v_writelane_b32 v0, s42, 0
 ; GCN-NEXT: ;;#ASMSTART
 ; GCN-NEXT: ; clobber s42
 ; GCN-NEXT: ;;#ASMEND
 ; GCN-NEXT: v_readlane_b32 s42, v0, 0
+; GCN-NEXT: s_or_saveexec_b64
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec,
+; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define void @spill_only_csr_sgpr() {
   call void asm sideeffect "; clobber s42", "~{s42}"()
diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir
--- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir
@@ -7,11 +7,12 @@
 
 name: merge_sgpr_spill_into_copy_from_exec_lo
 tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
 body: |
   bb.0:
     ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_lo
     ; CHECK: liveins: $vgpr0
-    ; CHECK: S_WAITCNT 0
     ; CHECK: S_NOP 0, implicit-def $exec_lo
     ; CHECK: $sgpr0 = S_MOV_B32 $exec_lo
     ; CHECK: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
@@ -31,11 +32,12 @@
 
 name: merge_sgpr_spill_into_copy_from_exec_hi
 tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
 body: |
   bb.0:
     ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_hi
     ; CHECK: liveins: $vgpr0
-    ; CHECK: S_WAITCNT 0
     ; CHECK: S_NOP 0, implicit-def $exec_hi
     ; CHECK: $sgpr0 = S_MOV_B32 $exec_hi
     ; CHECK: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
@@ -55,11 +57,12 @@
 
 name: merge_sgpr_spill_into_copy_from_exec
 tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
 body: |
   bb.0:
     ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec
     ; CHECK: liveins: $vgpr0
-    ; CHECK: S_WAITCNT 0
     ; CHECK: S_NOP 0, implicit-def $exec
     ; CHECK: $sgpr0_sgpr1 = S_MOV_B64 $exec
     ; CHECK: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
@@ -85,11 +88,12 @@
 
 name: reload_sgpr_spill_into_copy_to_exec_lo
 tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
 body: |
   bb.0:
     ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_lo
     ; CHECK: liveins: $vgpr0
-    ; CHECK: S_WAITCNT 0
     ; CHECK: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_lo
     ; CHECK: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
     ; CHECK: $sgpr0 = V_READLANE_B32 $vgpr0, 0
@@ -107,11 +111,12 @@
 
 name: reload_sgpr_spill_into_copy_to_exec_hi
 tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
 body: |
   bb.0:
     ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_hi
     ; CHECK: liveins: $vgpr0
-    ; CHECK: S_WAITCNT 0
     ; CHECK: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_hi
     ; CHECK: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
     ; CHECK: $sgpr0 = V_READLANE_B32 $vgpr0, 0
@@ -129,11 +134,12 @@
 
 name: reload_sgpr_spill_into_copy_to_exec
 tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
 body: |
   bb.0:
     ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec
     ; CHECK: liveins: $vgpr0
-    ; CHECK: S_WAITCNT 0
     ; CHECK: S_NOP 0, implicit-def renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def $exec
     ; CHECK: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
     ; CHECK: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir
--- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir
@@ -7,12 +7,13 @@
 
 name: merge_sgpr_spill_into_copy_from_m0
 tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
 body: |
   bb.0:
 
     ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_m0
     ; CHECK: liveins: $vgpr0
-    ; CHECK: S_WAITCNT 0
     ; CHECK: S_NOP 0, implicit-def $m0
     ; CHECK: $sgpr0 = S_MOV_B32 $m0
     ; CHECK: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
@@ -36,12 +37,13 @@
 
 name: reload_sgpr_spill_into_copy_to_m0
 tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
 body: |
   bb.0:
 
     ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_m0
     ; CHECK: liveins: $vgpr0
-    ; CHECK: S_WAITCNT 0
     ; CHECK: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $m0
     ; CHECK: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
     ; CHECK: $sgpr0 = V_READLANE_B32 $vgpr0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
@@ -488,22 +488,39 @@
 ; GFX9-LABEL: void_func_void_clobber_s33:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT: v_writelane_b32 v0, s33, 0
 ; GFX9-NEXT: ;;#ASMSTART
 ; GFX9-NEXT: ; clobber
 ; GFX9-NEXT: ;;#ASMEND
 ; GFX9-NEXT: v_readlane_b32 s33, v0, 0
+; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: void_func_void_clobber_s33:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_or_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
 ; GFX10-NEXT: v_writelane_b32 v0, s33, 0
 ; GFX10-NEXT: ;;#ASMSTART
 ; GFX10-NEXT: ; clobber
 ; GFX10-NEXT: ;;#ASMEND
 ; GFX10-NEXT: v_readlane_b32 s33, v0, 0
+; GFX10-NEXT: s_or_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   call void asm sideeffect "; clobber", "~{s33}"() #0
   ret void
@@ -513,22 +530,39 @@
 ; GFX9-LABEL: void_func_void_clobber_s34:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT: v_writelane_b32 v0, s34, 0
 ; GFX9-NEXT: ;;#ASMSTART
 ; GFX9-NEXT: ; clobber
 ; GFX9-NEXT: ;;#ASMEND
 ; GFX9-NEXT: v_readlane_b32 s34, v0, 0
+; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: void_func_void_clobber_s34:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_or_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
 ; GFX10-NEXT: v_writelane_b32 v0, s34, 0
 ; GFX10-NEXT: ;;#ASMSTART
 ; GFX10-NEXT: ; clobber
 ; GFX10-NEXT: ;;#ASMEND
 ; GFX10-NEXT: v_readlane_b32 s34, v0, 0
+; GFX10-NEXT: s_or_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   call void asm sideeffect "; clobber", "~{s34}"() #0
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-csr-vgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-csr-vgpr-spill.ll
--- a/llvm/test/CodeGen/AMDGPU/need-fp-from-csr-vgpr-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-csr-vgpr-spill.ll
@@ -73,6 +73,9 @@
 ; CHECK-LABEL: csr_vgpr_spill_fp_tailcall_callee:
 ; CHECK: ; %bb.0: ; %bb
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1
+; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; CHECK-NEXT: ;;#ASMSTART
 ; CHECK-NEXT: ; clobber csr v40
@@ -83,6 +86,9 @@
 ; CHECK-NEXT: s_add_u32 s4, s4, callee_has_fp@rel32@lo+4
 ; CHECK-NEXT: s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12
 ; CHECK-NEXT: v_readlane_b32 s33, v1, 0
+; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: s_mov_b64 exec, s[6:7]
 ; CHECK-NEXT: s_setpc_b64 s[4:5]
 bb:
   call void asm sideeffect "; clobber csr v40", "~{v40}"()
diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir
--- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir
@@ -25,37 +25,59 @@
     ; GFX8-LABEL: name: pei_scavenge_vgpr_spill
     ; GFX8: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2
+    ; GFX8: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; GFX8: $sgpr32 = S_ADD_U32 $sgpr32, 8196, implicit-def $scc
+    ; GFX8: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5)
+    ; GFX8: $sgpr32 = S_SUB_U32 $sgpr32, 8196, implicit-def $scc
+    ; GFX8: $exec = S_MOV_B64 killed $sgpr4_sgpr5
     ; GFX8: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2
     ; GFX8: $sgpr33 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
     ; GFX8: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def $scc
     ; GFX8: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc
     ; GFX8: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
-    ; GFX8: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 12, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5)
+    ; GFX8: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 20, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
     ; GFX8: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
     ; GFX8: $vcc_lo = S_MOV_B32 8192
     ; GFX8: $vgpr3, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr3, 0, implicit $exec
     ; GFX8: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec
     ; GFX8: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc
     ; GFX8: $sgpr33 = V_READLANE_B32 $vgpr2, 0
-    ; GFX8: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 12, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5)
+    ; GFX8: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; GFX8: $vgpr0 = V_MOV_B32_e32 8196, implicit $exec
+    ; GFX8: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5)
+    ; GFX8: $exec = S_MOV_B64 killed $sgpr4_sgpr5
+    ; GFX8: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 20, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5)
     ; GFX8: S_ENDPGM 0, csr_amdgpu_allvgprs
     ; GFX9-LABEL: name: pei_scavenge_vgpr_spill
     ; GFX9: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2
+    ; GFX9: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; GFX9: $sgpr32 = S_ADD_U32 $sgpr32, 8196, implicit-def $scc
+    ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5)
+    ; GFX9: $sgpr32 = S_SUB_U32 $sgpr32, 8196, implicit-def $scc
+    ; GFX9: $exec = S_MOV_B64 killed $sgpr4_sgpr5
     ; GFX9: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2
     ; GFX9: $sgpr33 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
     ; GFX9: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def $scc
     ; GFX9: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc
     ; GFX9: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
-    ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 12, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5)
+    ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 20, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
     ; GFX9: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
     ; GFX9: $vgpr3 = V_ADD_U32_e32 8192, killed $vgpr3, implicit $exec
     ; GFX9: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec
     ; GFX9: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc
     ; GFX9: $sgpr33 = V_READLANE_B32 $vgpr2, 0
-    ; GFX9: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 12, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5)
+    ; GFX9: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; GFX9: $vgpr0 = V_MOV_B32_e32 8196, implicit $exec
+    ; GFX9: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5)
+    ; GFX9: $exec = S_MOV_B64 killed $sgpr4_sgpr5
+    ; GFX9: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 20, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5)
     ; GFX9: S_ENDPGM 0, csr_amdgpu_allvgprs
     ; GFX9-FLATSCR-LABEL: name: pei_scavenge_vgpr_spill
     ; GFX9-FLATSCR: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2
+    ; GFX9-FLATSCR: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; GFX9-FLATSCR: $sgpr4 = S_ADD_U32 $sgpr32, 8196, implicit-def $scc
+    ; GFX9-FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, killed $sgpr4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.3, addrspace 5)
+    ; GFX9-FLATSCR: $exec = S_MOV_B64 killed $sgpr4_sgpr5
     ; GFX9-FLATSCR: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2
     ; GFX9-FLATSCR: $sgpr33 = frame-setup S_ADD_U32 $sgpr32, 8191, implicit-def $scc
     ; GFX9-FLATSCR: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def $scc
@@ -65,6 +87,10 @@
     ; GFX9-FLATSCR: $vgpr0 = V_OR_B32_e32 killed $vcc_hi, $vgpr1, implicit $exec
     ; GFX9-FLATSCR: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 24576, implicit-def $scc
     ; GFX9-FLATSCR: $sgpr33 = V_READLANE_B32 $vgpr2, 0
+    ; GFX9-FLATSCR: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; GFX9-FLATSCR: $sgpr4 = S_ADD_U32 $sgpr32, 8196, implicit-def $scc
+    ; GFX9-FLATSCR: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.3, addrspace 5)
+    ; GFX9-FLATSCR: $exec = S_MOV_B64 killed $sgpr4_sgpr5
     ; GFX9-FLATSCR: S_ENDPGM 0, csr_amdgpu_allvgprs
     $vgpr0 = V_MOV_B32_e32 %stack.0, implicit $exec
     $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec
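One detail worth calling out in the pei-scavenge-vgpr-spill.mir checks: when findScratchNonCalleeSaveRegister comes up empty, the fallback added in buildPrologSpill folds the offset into the stack pointer itself and then undoes the addition, rather than failing. A condensed sketch of that path, using the same S_ADD_U32/S_SUB_U32 BuildMI calls as the SIFrameLowering.cpp hunk above (illustrative only):

    // No free offset register: temporarily point SP at the spill slot.
    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_ADD_U32), SPReg)
        .addReg(SPReg)
        .addImm(Offset); // SP += Offset
    // ... the buffer_store/buffer_load addressed off SPReg goes here ...
    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_SUB_U32), SPReg)
        .addReg(SPReg)
        .addImm(Offset); // SP -= Offset, restoring the original value

This also explains the HasOffsetReg ? RegState::Kill : 0 operand flag in the flat-scratch store path: when the offset register is really the stack pointer, it must not be marked killed, since it stays live after the spill.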