diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -606,23 +606,13 @@
   llvm_unreachable("Invalid TargetStackID::Value");
 }
 
-// Activate all lanes, returns saved exec.
-static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
-                                     MachineFunction &MF,
-                                     MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator MBBI,
-                                     bool IsProlog) {
-  Register ScratchExecCopy;
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  const SIInstrInfo *TII = ST.getInstrInfo();
-  const SIRegisterInfo &TRI = TII->getRegisterInfo();
-  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
-  DebugLoc DL;
-
+static void initLiveRegs(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI,
+                         const SIMachineFunctionInfo *FuncInfo,
+                         MachineFunction &MF, MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator MBBI, bool IsProlog) {
   if (LiveRegs.empty()) {
+    LiveRegs.init(TRI);
     if (IsProlog) {
-      LiveRegs.init(TRI);
       LiveRegs.addLiveIns(MBB);
       if (FuncInfo->SGPRForFPSaveRestoreCopy)
         LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
@@ -631,11 +621,27 @@
         LiveRegs.removeReg(FuncInfo->SGPRForBPSaveRestoreCopy);
     } else {
       // In epilog.
-      LiveRegs.init(*ST.getRegisterInfo());
       LiveRegs.addLiveOuts(MBB);
       LiveRegs.stepBackward(*MBBI);
     }
   }
+}
+
+// Activate all lanes, returns saved exec.
+static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
+                                     MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MBBI,
+                                     bool IsProlog) {
+  Register ScratchExecCopy;
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+  DebugLoc DL;
+
+  initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
 
   ScratchExecCopy = findScratchNonCalleeSaveRegister(
       MRI, LiveRegs, *TRI.getWaveMaskRegClass());
@@ -723,12 +729,20 @@
                              /*UseSp*/ true);
   }
 
+  if (ScratchExecCopy) {
+    // FIXME: Split block and make terminator.
+    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
+        .addReg(ScratchExecCopy, RegState::Kill);
+    LiveRegs.addReg(ScratchExecCopy);
+  }
+
   if (FPSaveIndex && spilledToMemory(MF, *FPSaveIndex)) {
     const int FramePtrFI = *FPSaveIndex;
     assert(!MFI.isDeadObjectIndex(FramePtrFI));
 
-    if (!ScratchExecCopy)
-      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
+    initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, true);
 
     MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
         MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
@@ -747,8 +761,7 @@
     const int BasePtrFI = *BPSaveIndex;
     assert(!MFI.isDeadObjectIndex(BasePtrFI));
 
-    if (!ScratchExecCopy)
-      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
+    initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, true);
 
     MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
         MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
@@ -763,15 +776,6 @@
                              /*UseSp*/ true);
   }
 
-  if (ScratchExecCopy) {
-    // FIXME: Split block and make terminator.
-    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
-        .addReg(ScratchExecCopy, RegState::Kill);
-    LiveRegs.addReg(ScratchExecCopy);
-  }
-
   // In this case, spill the FP to a reserved VGPR.
   if (FPSaveIndex && !spilledToMemory(MF, *FPSaveIndex)) {
     const int FramePtrFI = *FPSaveIndex;
@@ -951,13 +955,11 @@
         .setMIFlag(MachineInstr::FrameDestroy);
   }
 
-  Register ScratchExecCopy;
   if (FPSaveIndex) {
     const int FramePtrFI = *FPSaveIndex;
     assert(!MFI.isDeadObjectIndex(FramePtrFI));
     if (spilledToMemory(MF, FramePtrFI)) {
-      if (!ScratchExecCopy)
-        ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
+      initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, false);
 
       MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
           MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
@@ -984,8 +986,7 @@
     const int BasePtrFI = *BPSaveIndex;
     assert(!MFI.isDeadObjectIndex(BasePtrFI));
     if (spilledToMemory(MF, BasePtrFI)) {
-      if (!ScratchExecCopy)
-        ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
+      initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, false);
 
       MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
           MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
@@ -1008,6 +1009,7 @@
     }
   }
 
+  Register ScratchExecCopy;
   for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg :
        FuncInfo->getSGPRSpillVGPRs()) {
     if (!Reg.FI)
diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -563,18 +563,14 @@
 ; With no free registers, we must spill the FP to memory.
 ; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory:
-; MUBUF: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
 ; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:4
-; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]]
 ; FLATSCR: s_mov_b32 s0, s33
 ; GCN: s_mov_b32 s33, s32
-; MUBUF: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:4
 ; FLATSCR: s_mov_b32 s33, s0
 ; MUBUF: s_waitcnt vmcnt(0)
 ; MUBUF: v_readfirstlane_b32 s33, [[TMP_VGPR2]]
-; MUBUF: s_mov_b64 exec, [[COPY_EXEC2]]
 ; GCN: s_setpc_b64
 ; MUBUF: ScratchSize: 8
 ; FLATSCR: ScratchSize: 0
@@ -598,16 +594,16 @@
 ; VGPR.
 ; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory_full_reserved_vgpr:
 ; MUBUF: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]]
 ; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
 ; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:[[OFF:[0-9]+]]
-; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]]
 ; GCN-NOT: v_writelane_b32 v40, s33
 ; MUBUF: s_mov_b32 s33, s32
 ; FLATSCR: s_mov_b32 s33, s0
 ; GCN-NOT: v_readlane_b32 s33, v40
-; MUBUF: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:[[OFF]]
 ; MUBUF: v_readfirstlane_b32 s33, [[TMP_VGPR2]]
+; MUBUF: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF: s_mov_b64 exec, [[COPY_EXEC2]]
 ; GCN: s_setpc_b64
 define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 {
@@ -675,7 +671,7 @@
 ; MUBUF: v_mov_b32_e32 v0, s33
 ; GCN-NOT: v_mov_b32_e32 v0, 0x100c
 ; MUBUF-NEXT: s_add_u32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40300
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
+; MUBUF: buffer_store_dword v0, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
 ; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s33, 0x1004
 ; FLATSCR: v_mov_b32_e32 v0, 0
 ; FLATSCR: scratch_store_dword off, v0, [[SOFF]]
diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
--- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
@@ -38,13 +38,11 @@
 ; NO-SPILL-TO-VGPR-LABEL: callee_with_stack_and_call:
 ; NO-SPILL-TO-VGPR: ; %bb.0:
 ; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; NO-SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, s33
-; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5]
 ; NO-SPILL-TO-VGPR-NEXT: v_writelane_b32 v1, s30, 0
-; NO-SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s32
 ; NO-SPILL-TO-VGPR-NEXT: v_writelane_b32 v1, s31, 1
+; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; NO-SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s32
 ; NO-SPILL-TO-VGPR-NEXT: s_add_u32 s32, s32, 0x800
 ; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[30:31], exec
 ; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 3
@@ -63,15 +61,13 @@
 ; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 3
 ; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5]
-; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s4, v1, 0
-; NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s5, v1, 1
 ; NO-SPILL-TO-VGPR-NEXT: s_sub_u32 s32, s32, 0x800
-; NO-SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(1)
+; NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s4, v1, 0
+; NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s5, v1, 1
 ; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
 ; NO-SPILL-TO-VGPR-NEXT: v_readfirstlane_b32 s33, v0
-; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[6:7]
 ; NO-SPILL-TO-VGPR-NEXT: s_setpc_b64 s[4:5]
   %alloca = alloca i32, addrspace(5)
   store volatile i32 0, i32 addrspace(5)* %alloca
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -294,14 +294,16 @@
 ; GCN: s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT: s_add_u32 s6, s32, 0x42100
 ; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s6 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN-NEXT: v_mov_b32_e32 v0, s33
 ; GCN-NOT: v_mov_b32_e32 v0, 0x1088
-; GCN-NEXT: s_add_u32 s6, s32, 0x42200
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
+; GCN-NEXT: s_add_u32 s4, s32, 0x42200
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
 ; GCN-NEXT: v_mov_b32_e32 v0, s34
 ; GCN-NOT: v_mov_b32_e32 v0, 0x108c
-; GCN-NEXT: s_add_u32 s6, s32, 0x42300
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
+; GCN-NEXT: s_add_u32 s4, s32, 0x42300
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
   %local_val = alloca i32, align 128, addrspace(5)
   store volatile i32 %b, i32 addrspace(5)* %local_val, align 128