diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -115,187 +115,43 @@
 // We need to specially emit stack operations here because a different frame
 // register is used than in the rest of the function, as getFrameRegister would
 // use.
-static void buildPrologSpill(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
-                             MachineBasicBlock &MBB,
-                             MachineBasicBlock::iterator I,
-                             const SIInstrInfo *TII, Register SpillReg,
-                             Register ScratchRsrcReg, Register SPReg, int FI) {
-  MachineFunction *MF = MBB.getParent();
-  MachineFrameInfo &MFI = MF->getFrameInfo();
-
-  int64_t Offset = MFI.getObjectOffset(FI);
+static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
+                             const SIMachineFunctionInfo &FuncInfo,
+                             LivePhysRegs &LiveRegs, MachineFunction &MF,
+                             MachineBasicBlock::iterator I, Register SpillReg,
+                             int FI) {
+  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
+                                        : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
-  MachineMemOperand *MMO = MF->getMachineMemOperand(
-      MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
-      MFI.getObjectAlign(FI));
-
-  if (ST.enableFlatScratch()) {
-    if (TII->isLegalFLATOffset(Offset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
-      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_STORE_DWORD_SADDR))
-          .addReg(SpillReg, RegState::Kill)
-          .addReg(SPReg)
-          .addImm(Offset)
-          .addImm(0) // cpol
-          .addMemOperand(MMO);
-      return;
-    }
-  } else if (SIInstrInfo::isLegalMUBUFImmOffset(Offset)) {
-    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
-        .addReg(SpillReg, RegState::Kill)
-        .addReg(ScratchRsrcReg)
-        .addReg(SPReg)
-        .addImm(Offset)
-        .addImm(0) // cpol
-        .addImm(0) // tfe
-        .addImm(0) // swz
-        .addMemOperand(MMO);
-    return;
-  }
-
-  // Don't clobber the TmpVGPR if we also need a scratch reg for the stack
-  // offset in the spill.
+
+  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI),
+      FrameInfo.getObjectAlign(FI));
   LiveRegs.addReg(SpillReg);
-
-  if (ST.enableFlatScratch()) {
-    MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
-        MF->getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0RegClass);
-
-    bool HasOffsetReg = OffsetReg;
-    if (!HasOffsetReg) {
-      // No free register, use stack pointer and restore afterwards.
-      OffsetReg = SPReg;
-    }
-
-    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_ADD_U32), OffsetReg)
-        .addReg(SPReg)
-        .addImm(Offset);
-
-    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_STORE_DWORD_SADDR))
-        .addReg(SpillReg, RegState::Kill)
-        .addReg(OffsetReg, HasOffsetReg ? RegState::Kill : 0)
-        .addImm(0) // offset
-        .addImm(0) // cpol
-        .addMemOperand(MMO);
-
-    if (!HasOffsetReg) {
-      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_SUB_U32), OffsetReg)
-          .addReg(SPReg)
-          .addImm(Offset);
-    }
-  } else {
-    MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
-        MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
-
-    if (OffsetReg) {
-      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
-          .addImm(Offset);
-
-      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN))
-          .addReg(SpillReg, RegState::Kill)
-          .addReg(OffsetReg, RegState::Kill)
-          .addReg(ScratchRsrcReg)
-          .addReg(SPReg)
-          .addImm(0) // offset
-          .addImm(0) // cpol
-          .addImm(0) // tfe
-          .addImm(0) // swz
-          .addMemOperand(MMO);
-    } else {
-      // No free register, use stack pointer and restore afterwards.
-      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_ADD_U32), SPReg)
-          .addReg(SPReg)
-          .addImm(Offset);
-
-      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
-          .addReg(SpillReg, RegState::Kill)
-          .addReg(ScratchRsrcReg)
-          .addReg(SPReg)
-          .addImm(0) // offset
-          .addImm(0) // cpol
-          .addImm(0) // tfe
-          .addImm(0) // swz
-          .addMemOperand(MMO);
-
-      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_SUB_U32), SPReg)
-          .addReg(SPReg)
-          .addImm(Offset);
-    }
-  }
-
+  TRI.buildSpillLoadStore(I, Opc, FI, SpillReg, true,
+                          FuncInfo.getStackPtrOffsetReg(), 0, MMO, nullptr,
+                          &LiveRegs);
   LiveRegs.removeReg(SpillReg);
 }
 
-static void buildEpilogReload(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
-                              MachineBasicBlock &MBB,
-                              MachineBasicBlock::iterator I,
-                              const SIInstrInfo *TII, Register SpillReg,
-                              Register ScratchRsrcReg, Register SPReg, int FI) {
-  MachineFunction *MF = MBB.getParent();
-  MachineFrameInfo &MFI = MF->getFrameInfo();
-  int64_t Offset = MFI.getObjectOffset(FI);
-
-  MachineMemOperand *MMO = MF->getMachineMemOperand(
-      MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4,
-      MFI.getObjectAlign(FI));
-
-  if (ST.enableFlatScratch()) {
-    if (TII->isLegalFLATOffset(Offset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
-      BuildMI(MBB, I, DebugLoc(),
-              TII->get(AMDGPU::SCRATCH_LOAD_DWORD_SADDR), SpillReg)
-          .addReg(SPReg)
-          .addImm(Offset)
-          .addImm(0) // cpol
-          .addMemOperand(MMO);
-      return;
-    }
-    MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
-        MF->getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0RegClass);
-    if (!OffsetReg)
-      report_fatal_error("failed to find free scratch register");
+static void buildEpilogRestore(const GCNSubtarget &ST,
+                               const SIRegisterInfo &TRI,
+                               const SIMachineFunctionInfo &FuncInfo,
+                               LivePhysRegs &LiveRegs, MachineFunction &MF,
+                               MachineBasicBlock::iterator I, Register SpillReg,
+                               int FI) {
+  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
+                                        : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
-    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_ADD_U32), OffsetReg)
-        .addReg(SPReg)
-        .addImm(Offset);
-    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_LOAD_DWORD_SADDR),
-            SpillReg)
-        .addReg(OffsetReg, RegState::Kill)
-        .addImm(0)
-        .addImm(0) // cpol
-        .addMemOperand(MMO);
-    return;
-  }
-
-  if (SIInstrInfo::isLegalMUBUFImmOffset(Offset)) {
-    BuildMI(MBB, I, DebugLoc(),
-            TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg)
-        .addReg(ScratchRsrcReg)
-        .addReg(SPReg)
-        .addImm(Offset)
-        .addImm(0) // cpol
-        .addImm(0) // tfe
-        .addImm(0) // swz
-        .addMemOperand(MMO);
-    return;
-  }
-
-  MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
-      MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
-  if (!OffsetReg)
-    report_fatal_error("failed to find free scratch register");
-
-  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
-      .addImm(Offset);
-
-  BuildMI(MBB, I, DebugLoc(),
-          TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), SpillReg)
-      .addReg(OffsetReg, RegState::Kill)
-      .addReg(ScratchRsrcReg)
-      .addReg(SPReg)
-      .addImm(0)
-      .addImm(0) // cpol
-      .addImm(0) // tfe
-      .addImm(0) // swz
-      .addMemOperand(MMO);
+
+  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
+      FrameInfo.getObjectAlign(FI));
+  TRI.buildSpillLoadStore(I, Opc, FI, SpillReg, false,
+                          FuncInfo.getStackPtrOffsetReg(), 0, MMO, nullptr,
+                          &LiveRegs);
 }
 
 static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
@@ -878,12 +734,10 @@
       continue;
 
     if (!ScratchExecCopy)
-      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
+      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI,
+                                             /*IsProlog*/ true);
 
-    buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, Reg.VGPR,
-                     FuncInfo->getScratchRSrcReg(),
-                     StackPtrReg,
-                     Reg.FI.getValue());
+    buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBBI, Reg.VGPR, *Reg.FI);
   }
 
   if (FPSaveIndex && spilledToMemory(MF, *FPSaveIndex)) {
@@ -891,7 +745,8 @@
     assert(!MFI.isDeadObjectIndex(FramePtrFI));
 
     if (!ScratchExecCopy)
-      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
+      ScratchExecCopy =
+          buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ true);
 
     MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
         MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
@@ -901,8 +756,8 @@
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
         .addReg(FramePtrReg);
 
-    buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR,
-                     FuncInfo->getScratchRSrcReg(), StackPtrReg, FramePtrFI);
+    buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBBI, TmpVGPR,
+                     FramePtrFI);
   }
 
   if (BPSaveIndex && spilledToMemory(MF, *BPSaveIndex)) {
@@ -910,7 +765,8 @@
    assert(!MFI.isDeadObjectIndex(BasePtrFI));
 
     if (!ScratchExecCopy)
-      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
+      ScratchExecCopy =
+          buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ true);
 
     MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
         MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
@@ -920,8 +776,8 @@
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
         .addReg(BasePtrReg);
 
-    buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR,
-                     FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI);
+    buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBBI, TmpVGPR,
+                     BasePtrFI);
   }
 
   if (ScratchExecCopy) {
@@ -1118,16 +974,17 @@
     assert(!MFI.isDeadObjectIndex(FramePtrFI));
     if (spilledToMemory(MF, FramePtrFI)) {
       if (!ScratchExecCopy)
-        ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
+        ScratchExecCopy =
+            buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false);
 
-      MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
+      MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
           MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
-      if (!TempVGPR)
+      if (!TmpVGPR)
        report_fatal_error("failed to find free scratch register");
-      buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, TempVGPR,
-                        FuncInfo->getScratchRSrcReg(), StackPtrReg, FramePtrFI);
+      buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBBI, TmpVGPR,
+                         FramePtrFI);
       BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg)
-          .addReg(TempVGPR, RegState::Kill);
+          .addReg(TmpVGPR, RegState::Kill);
     } else {
       // Reload from VGPR spill.
       assert(MFI.getStackID(FramePtrFI) == TargetStackID::SGPRSpill);
@@ -1145,16 +1002,17 @@
     assert(!MFI.isDeadObjectIndex(BasePtrFI));
    if (spilledToMemory(MF, BasePtrFI)) {
       if (!ScratchExecCopy)
-        ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
+        ScratchExecCopy =
+            buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false);
 
-      MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
+      MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
          MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
-      if (!TempVGPR)
+      if (!TmpVGPR)
        report_fatal_error("failed to find free scratch register");
-      buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, TempVGPR,
-                        FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI);
+      buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBBI, TmpVGPR,
+                         BasePtrFI);
       BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg)
-          .addReg(TempVGPR, RegState::Kill);
+          .addReg(TmpVGPR, RegState::Kill);
     } else {
       // Reload from VGPR spill.
       assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
@@ -1173,11 +1031,11 @@
       continue;
 
     if (!ScratchExecCopy)
-      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
+      ScratchExecCopy =
+          buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false);
 
-    buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, Reg.VGPR,
-                      FuncInfo->getScratchRSrcReg(), StackPtrReg,
-                      Reg.FI.getValue());
+    buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBBI, Reg.VGPR,
+                       *Reg.FI);
   }
 
   if (ScratchExecCopy) {
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -21,6 +21,7 @@
 
 class GCNSubtarget;
 class LiveIntervals;
+class LivePhysRegs;
 class RegisterBank;
 class SIMachineFunctionInfo;
 
@@ -342,16 +343,15 @@
   /// of the subtarget.
   ArrayRef<MCPhysReg> getAllSGPR32(const MachineFunction &MF) const;
 
-private:
-  void buildSpillLoadStore(MachineBasicBlock::iterator MI,
-                           unsigned LoadStoreOp,
-                           int Index,
-                           Register ValueReg,
-                           bool ValueIsKill,
-                           MCRegister ScratchOffsetReg,
-                           int64_t InstrOffset,
-                           MachineMemOperand *MMO,
-                           RegScavenger *RS) const;
+  // Insert spill or restore instructions.
+  // When lowering spill pseudos, the RegScavenger should be set.
+  // For creating spill instructions during frame lowering, where no scavenger
+  // is available, LiveRegs can be used.
+  void buildSpillLoadStore(MachineBasicBlock::iterator MI, unsigned LoadStoreOp,
+                           int Index, Register ValueReg, bool ValueIsKill,
+                           MCRegister ScratchOffsetReg, int64_t InstrOffset,
+                           MachineMemOperand *MMO, RegScavenger *RS,
+                           LivePhysRegs *LiveRegs = nullptr) const;
 };
 
 } // End namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -790,15 +790,13 @@
   return LoadStoreOp;
 }
 
-void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
-                                         unsigned LoadStoreOp,
-                                         int Index,
-                                         Register ValueReg,
-                                         bool IsKill,
-                                         MCRegister ScratchOffsetReg,
-                                         int64_t InstOffset,
-                                         MachineMemOperand *MMO,
-                                         RegScavenger *RS) const {
+void SIRegisterInfo::buildSpillLoadStore(
+    MachineBasicBlock::iterator MI, unsigned LoadStoreOp, int Index,
+    Register ValueReg, bool IsKill, MCRegister ScratchOffsetReg,
+    int64_t InstOffset, MachineMemOperand *MMO, RegScavenger *RS,
+    LivePhysRegs *LiveRegs) const {
+  assert((!RS || !LiveRegs) && "Only RS or LiveRegs can be set but not both");
+
   MachineBasicBlock *MBB = MI->getParent();
   MachineFunction *MF = MI->getParent()->getParent();
   const SIInstrInfo *TII = ST.getInstrInfo();
@@ -853,9 +851,17 @@
   Offset *= ST.getWavefrontSize();
 
   // We don't have access to the register scavenger if this function is called
-  // during PEI::scavengeFrameVirtualRegs().
-  if (RS)
+  // during PEI::scavengeFrameVirtualRegs() so use LiveRegs in this case.
+  if (RS) {
     SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false);
+  } else if (LiveRegs) {
+    for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
+      if (LiveRegs->available(MF->getRegInfo(), Reg)) {
+        SOffset = Reg;
+        break;
+      }
+    }
+  }
 
   if (!SOffset) {
     // There are no free SGPRs, and since we are in the process of spilling
@@ -1502,6 +1508,7 @@
   }
 
   default: {
+    // Other access to frame index
     const DebugLoc &DL = MI->getDebugLoc();
 
    int64_t Offset = FrameInfo.getObjectOffset(Index);
diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -483,8 +483,8 @@
 ; GCN-LABEL: {{^}}scratch_reg_needed_mubuf_offset:
 ; GCN: s_waitcnt
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008
-; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Spill
+; MUBUF-NEXT: s_add_u32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40200
+; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
 ; FLATSCR-NEXT: s_add_u32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x1008
 ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
@@ -506,8 +506,8 @@
 ; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x100c{{$}}
 ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008
-; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Reload
+; MUBUF-NEXT: s_add_u32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40200
+; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Reload
 ; FLATSCR-NEXT: s_add_u32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x1008
 ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, [[SCRATCH_SGPR]] ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
@@ -670,12 +670,12 @@
 ; scratch VGPR to hold the offset.
 ; GCN-LABEL: {{^}}spill_fp_to_memory_scratch_reg_needed_mubuf_offset
 ; MUBUF: s_or_saveexec_b64 s[4:5], -1
-; MUBUF: v_mov_b32_e32 v0, 0x1008
-; MUBUF-NEXT: buffer_store_dword v39, v0, s[0:3], s32 offen ; 4-byte Folded Spill
-; MUBUF: v_mov_b32_e32 v0, s33
+; MUBUF-NEXT: s_add_u32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40200
+; MUBUF-NEXT: buffer_store_dword v39, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
+; MUBUF-NEXT: v_mov_b32_e32 v0, s33
 ; GCN-NOT: v_mov_b32_e32 v0, 0x100c
-; MUBUF-NEXT: v_mov_b32_e32 v1, 0x100c
-; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen ; 4-byte Folded Spill
+; MUBUF-NEXT: s_add_u32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40300
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
 ; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s33, 0x1004
 ; FLATSCR: v_mov_b32_e32 v0, 0
 ; FLATSCR: scratch_store_dword off, v0, [[SOFF]]
diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir
--- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir
@@ -26,17 +26,16 @@
     ; GFX8-LABEL: name: pei_scavenge_vgpr_spill
    ; GFX8: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2
    ; GFX8: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec
-    ; GFX8: $sgpr32 = S_ADD_U32 $sgpr32, 8196, implicit-def $scc
-    ; GFX8: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5)
-    ; GFX8: $sgpr32 = S_SUB_U32 $sgpr32, 8196, implicit-def $scc
+    ; GFX8: $sgpr6 = S_ADD_U32 $sgpr32, 524544, implicit-def $scc
+    ; GFX8: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5)
    ; GFX8: $exec = S_MOV_B64 killed $sgpr4_sgpr5
    ; GFX8: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2
    ; GFX8: $sgpr33 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
    ; GFX8: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def $scc
    ; GFX8: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc
    ; GFX8: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
-    ; GFX8: $sgpr6 = S_ADD_U32 $sgpr33, 524800, implicit-def $scc
-    ; GFX8: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
+    ; GFX8: $sgpr7 = S_ADD_U32 $sgpr33, 524800, implicit-def $scc
+    ; GFX8: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr7, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
    ; GFX8: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
    ; GFX8: $vcc_lo = S_MOV_B32 8192
    ; GFX8: $vgpr3, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr3, 0, implicit $exec
@@ -44,8 +43,8 @@
    ; GFX8: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc
    ; GFX8: $sgpr33 = V_READLANE_B32 $vgpr2, 0
    ; GFX8: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec
-    ; GFX8: $vgpr0 = V_MOV_B32_e32 8196, implicit $exec
-    ; GFX8: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5)
+    ; GFX8: $sgpr6 = S_ADD_U32 $sgpr32, 524544, implicit-def $scc
+    ; GFX8: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5)
    ; GFX8: $exec = S_MOV_B64 killed $sgpr4_sgpr5
    ; GFX8: $sgpr4 = S_ADD_U32 $sgpr33, 524800, implicit-def $scc
    ; GFX8: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5)
@@ -53,25 +52,24 @@
    ; GFX9-LABEL: name: pei_scavenge_vgpr_spill
    ; GFX9: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2
    ; GFX9: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec
-    ; GFX9: $sgpr32 = S_ADD_U32 $sgpr32, 8196, implicit-def $scc
-    ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5)
-    ; GFX9: $sgpr32 = S_SUB_U32 $sgpr32, 8196, implicit-def $scc
+    ; GFX9: $sgpr6 = S_ADD_U32 $sgpr32, 524544, implicit-def $scc
+    ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5)
    ; GFX9: $exec = S_MOV_B64 killed $sgpr4_sgpr5
    ; GFX9: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2
    ; GFX9: $sgpr33 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
    ; GFX9: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def $scc
    ; GFX9: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc
    ; GFX9: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
-    ; GFX9: $sgpr6 = S_ADD_U32 $sgpr33, 524800, implicit-def $scc
-    ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
+    ; GFX9: $sgpr7 = S_ADD_U32 $sgpr33, 524800, implicit-def $scc
+    ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr7, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
    ; GFX9: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
    ; GFX9: $vgpr3 = V_ADD_U32_e32 8192, killed $vgpr3, implicit $exec
    ; GFX9: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec
    ; GFX9: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc
    ; GFX9: $sgpr33 = V_READLANE_B32 $vgpr2, 0
    ; GFX9: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec
-    ; GFX9: $vgpr0 = V_MOV_B32_e32 8196, implicit $exec
-    ; GFX9: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5)
+    ; GFX9: $sgpr6 = S_ADD_U32 $sgpr32, 524544, implicit-def $scc
+    ; GFX9: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5)
    ; GFX9: $exec = S_MOV_B64 killed $sgpr4_sgpr5
    ; GFX9: $sgpr4 = S_ADD_U32 $sgpr33, 524800, implicit-def $scc
    ; GFX9: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5)
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -292,14 +292,16 @@
 
 ; GCN-LABEL: spill_bp_to_memory_scratch_reg_needed_mubuf_offset
 ; GCN: s_or_saveexec_b64 s[4:5], -1
-; GCN: v_mov_b32_e32 v0, s33
+; GCN-NEXT: s_add_u32 s6, s32, 0x42100
+; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s6 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v0, s33
 ; GCN-NOT: v_mov_b32_e32 v0, 0x1088
-; GCN-NEXT: v_mov_b32_e32 v1, 0x1088
-; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen
-; GCN: v_mov_b32_e32 v0, s34
+; GCN-NEXT: s_add_u32 s6, s32, 0x42200
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v0, s34
 ; GCN-NOT: v_mov_b32_e32 v0, 0x108c
-; GCN-NEXT: v_mov_b32_e32 v1, 0x108c
-; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen
+; GCN-NEXT: s_add_u32 s6, s32, 0x42300
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
 
   %local_val = alloca i32, align 128, addrspace(5)
   store volatile i32 %b, i32 addrspace(5)* %local_val, align 128
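
Side note, not part of the patch: buildSpillLoadStore now takes either a
RegScavenger (when lowering spill pseudos) or a LivePhysRegs set (during
prolog/epilog insertion, where no scavenger exists), never both, and the
LiveRegs path simply takes the first SGPR that the liveness set reports as
available. A minimal standalone C++ sketch of that fallback, using stand-in
types rather than the LLVM API:

  #include <cassert>
  #include <optional>
  #include <set>
  #include <vector>

  using Reg = unsigned;

  // First register in the class that is not currently live. This mirrors the
  // loop over AMDGPU::SGPR_32RegClass in the patch, where
  // LivePhysRegs::available plays the role of the !count() test below.
  std::optional<Reg> firstAvailable(const std::vector<Reg> &RegClass,
                                    const std::set<Reg> &Live) {
    for (Reg R : RegClass)
      if (!Live.count(R))
        return R;
    return std::nullopt; // no free register; the caller must handle this
  }

  int main() {
    const std::vector<Reg> SGPR32 = {4, 5, 6, 7}; // stand-in register class
    std::set<Reg> Live = {4, 5};                  // s4/s5 hold the saved exec
    std::optional<Reg> R = firstAvailable(SGPR32, Live);
    assert(R && *R == 6); // the first free SGPR wins, as in the updated tests
    return 0;
  }

This is also why the MIR checks now expect $sgpr6: $sgpr4_sgpr5 holds the
saved exec mask at that point, so s6 is the first SGPR the liveness set
reports as free.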
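
Side note, not part of the patch: the changed immediates in the MUBUF check
lines are the old per-lane byte offsets scaled by the wave64 wavefront size,
matching the `Offset *= ST.getWavefrontSize();` context line visible in the
SIRegisterInfo.cpp hunk; the offset now rides in the SGPR soffset operand
instead of a per-lane VGPR index. A standalone check of the arithmetic
(assuming wave64, as in these run lines):

  #include <cassert>

  int main() {
    const long long WaveSize = 64; // wave64
    assert(0x1008 * WaveSize == 0x40200); // callee-frame-setup.ll CSR spill
    assert(0x100c * WaveSize == 0x40300); // callee-frame-setup.ll FP spill
    assert(0x1084 * WaveSize == 0x42100); // stack-realign.ll v39 spill
    assert(0x1088 * WaveSize == 0x42200); // stack-realign.ll FP (s33) spill
    assert(0x108c * WaveSize == 0x42300); // stack-realign.ll BP (s34) spill
    assert(8196 * WaveSize == 524544);    // pei-scavenge-vgpr-spill.mir
    return 0;
  }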