diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -648,6 +648,22 @@
   llvm_unreachable("Invalid TargetStackID::Value");
 }
 
+static void initLiveRegs(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI,
+                         const SIMachineFunctionInfo *FuncInfo,
+                         MachineFunction &MF, MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator MBBI, bool IsProlog) {
+  if (LiveRegs.empty()) {
+    LiveRegs.init(TRI);
+    if (IsProlog) {
+      LiveRegs.addLiveIns(MBB);
+    } else {
+      // In epilog.
+      LiveRegs.addLiveOuts(MBB);
+      LiveRegs.stepBackward(*MBBI);
+    }
+  }
+}
+
 // Activate all lanes, returns saved exec.
 static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
                                      MachineFunction &MF,
@@ -659,19 +675,10 @@
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
   DebugLoc DL;
 
-  if (LiveRegs.empty()) {
-    if (IsProlog) {
-      LiveRegs.init(TRI);
-      LiveRegs.addLiveIns(MBB);
-    } else {
-      // In epilog.
-      LiveRegs.init(*ST.getRegisterInfo());
-      LiveRegs.addLiveOuts(MBB);
-      LiveRegs.stepBackward(*MBBI);
-    }
-  }
+  initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
 
   ScratchExecCopy = findScratchNonCalleeSaveRegister(
       MRI, LiveRegs, *TRI.getWaveMaskRegClass());
@@ -740,13 +747,20 @@
     buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBBI, Reg.VGPR, *Reg.FI);
   }
 
+  if (ScratchExecCopy) {
+    // FIXME: Split block and make terminator.
+    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
+        .addReg(ScratchExecCopy, RegState::Kill);
+    LiveRegs.addReg(ScratchExecCopy);
+  }
+
   if (FPSaveIndex && spilledToMemory(MF, *FPSaveIndex)) {
     const int FramePtrFI = *FPSaveIndex;
     assert(!MFI.isDeadObjectIndex(FramePtrFI));
 
-    if (!ScratchExecCopy)
-      ScratchExecCopy =
-          buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ true);
+    initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
 
     MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
         MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
@@ -764,9 +778,7 @@
     const int BasePtrFI = *BPSaveIndex;
     assert(!MFI.isDeadObjectIndex(BasePtrFI));
 
-    if (!ScratchExecCopy)
-      ScratchExecCopy =
-          buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ true);
+    initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
 
     MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
         MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
@@ -780,15 +792,6 @@
                      BasePtrFI);
   }
 
-  if (ScratchExecCopy) {
-    // FIXME: Split block and make terminator.
-    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
-        .addReg(ScratchExecCopy, RegState::Kill);
-    LiveRegs.addReg(ScratchExecCopy);
-  }
-
   // In this case, spill the FP to a reserved VGPR.
   if (FPSaveIndex && !spilledToMemory(MF, *FPSaveIndex)) {
     const int FramePtrFI = *FPSaveIndex;
@@ -968,14 +971,11 @@
         .setMIFlag(MachineInstr::FrameDestroy);
   }
 
-  Register ScratchExecCopy;
   if (FPSaveIndex) {
     const int FramePtrFI = *FPSaveIndex;
     assert(!MFI.isDeadObjectIndex(FramePtrFI));
     if (spilledToMemory(MF, FramePtrFI)) {
-      if (!ScratchExecCopy)
-        ScratchExecCopy =
-            buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false);
+      initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
 
       MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
           MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
@@ -1001,9 +1001,7 @@
     const int BasePtrFI = *BPSaveIndex;
     assert(!MFI.isDeadObjectIndex(BasePtrFI));
     if (spilledToMemory(MF, BasePtrFI)) {
-      if (!ScratchExecCopy)
-        ScratchExecCopy =
-            buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false);
+      initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
 
       MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
           MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
@@ -1025,6 +1023,7 @@
     }
   }
 
+  Register ScratchExecCopy;
   for (const SIMachineFunctionInfo::SGPRSpillVGPR &Reg :
        FuncInfo->getSGPRSpillVGPRs()) {
     if (!Reg.FI.hasValue())
diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -563,18 +563,14 @@
 
 ; With no free registers, we must spill the FP to memory.
 ; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory:
-; MUBUF: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
 ; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:4
-; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]]
 ; FLATSCR: s_mov_b32 s0, s33
 ; GCN: s_mov_b32 s33, s32
-; MUBUF: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:4
 ; FLATSCR: s_mov_b32 s33, s0
 ; MUBUF: s_waitcnt vmcnt(0)
 ; MUBUF: v_readfirstlane_b32 s33, [[TMP_VGPR2]]
-; MUBUF: s_mov_b64 exec, [[COPY_EXEC2]]
 ; GCN: s_setpc_b64
 ; MUBUF: ScratchSize: 8
 ; FLATSCR: ScratchSize: 0
@@ -598,16 +594,16 @@
 ; VGPR.
 ; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory_full_reserved_vgpr:
 ; MUBUF: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]]
 ; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
 ; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:[[OFF:[0-9]+]]
-; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]]
 ; GCN-NOT: v_writelane_b32 v40, s33
 ; MUBUF: s_mov_b32 s33, s32
 ; FLATSCR: s_mov_b32 s33, s0
 ; GCN-NOT: v_readlane_b32 s33, v40
-; MUBUF: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:[[OFF]]
 ; MUBUF: v_readfirstlane_b32 s33, [[TMP_VGPR2]]
+; MUBUF: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF: s_mov_b64 exec, [[COPY_EXEC2]]
 ; GCN: s_setpc_b64
 define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 {
@@ -672,10 +668,10 @@
 ; MUBUF: s_or_saveexec_b64 s[4:5], -1
 ; MUBUF-NEXT: s_add_u32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40200
 ; MUBUF-NEXT: buffer_store_dword v39, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
-; MUBUF-NEXT: v_mov_b32_e32 v0, s33
+; MUBUF: v_mov_b32_e32 v0, s33
 ; GCN-NOT: v_mov_b32_e32 v0, 0x100c
 ; MUBUF-NEXT: s_add_u32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40300
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
+; MUBUF: buffer_store_dword v0, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
 ; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s33, 0x1004
 ; FLATSCR: v_mov_b32_e32 v0, 0
 ; FLATSCR: scratch_store_dword off, v0, [[SOFF]]
diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
--- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
@@ -38,10 +38,8 @@
 ; NO-SPILL-TO-VGPR-LABEL: callee_with_stack_and_call:
 ; NO-SPILL-TO-VGPR: ; %bb.0:
 ; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; NO-SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, s33
 ; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5]
 ; NO-SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s32
 ; NO-SPILL-TO-VGPR-NEXT: s_add_u32 s32, s32, 0x800
 ; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[6:7], exec
@@ -60,7 +58,7 @@
 ; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33
 ; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
 ; NO-SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[8:9], exec
+; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[6:7], exec
 ; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 3
 ; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
@@ -69,13 +67,11 @@
 ; NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s5, v2, 1
 ; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[8:9]
+; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[6:7]
 ; NO-SPILL-TO-VGPR-NEXT: s_sub_u32 s32, s32, 0x800
-; NO-SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
 ; NO-SPILL-TO-VGPR-NEXT: v_readfirstlane_b32 s33, v0
-; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[6:7]
 ; NO-SPILL-TO-VGPR-NEXT: s_setpc_b64 s[4:5]
   %alloca = alloca i32, addrspace(5)
   store volatile i32 0, i32 addrspace(5)* %alloca
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -294,6 +294,7 @@
 ; GCN: s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT: s_add_u32 s6, s32, 0x42100
 ; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s6 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN-NEXT: v_mov_b32_e32 v0, s33
 ; GCN-NOT: v_mov_b32_e32 v0, 0x1088
 ; GCN-NEXT: s_add_u32 s6, s32, 0x42200
@@ -301,6 +302,7 @@
 ; GCN-NEXT: v_mov_b32_e32 v0, s34
 ; GCN-NOT: v_mov_b32_e32 v0, 0x108c
 ; GCN-NEXT: s_add_u32 s6, s32, 0x42300
+; GCN-NEXT: s_mov_b32 s34, s32
 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
   %local_val = alloca i32, align 128, addrspace(5)
   store volatile i32 %b, i32 addrspace(5)* %local_val, align 128