diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -119,7 +119,8 @@ MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const SIInstrInfo *TII, Register SpillReg, - Register ScratchRsrcReg, Register SPReg, int FI) { + Register ScratchRsrcReg, Register SPReg, int FI, + bool IsKill = true) { MachineFunction *MF = MBB.getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); @@ -132,27 +133,27 @@ if (ST.enableFlatScratch()) { if (TII->isLegalFLATOffset(Offset, AMDGPUAS::PRIVATE_ADDRESS, true)) { BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_STORE_DWORD_SADDR)) - .addReg(SpillReg, RegState::Kill) + .addReg(SpillReg, getKillRegState(IsKill)) + .addReg(SPReg) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // dlc + .addMemOperand(MMO); + return; + } + } else if (SIInstrInfo::isLegalMUBUFImmOffset(Offset)) { + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET)) + .addReg(SpillReg, getKillRegState(IsKill)) + .addReg(ScratchRsrcReg) .addReg(SPReg) .addImm(Offset) .addImm(0) // glc .addImm(0) // slc + .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(MMO); - return; - } - } else if (SIInstrInfo::isLegalMUBUFImmOffset(Offset)) { - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET)) - .addReg(SpillReg, RegState::Kill) - .addReg(ScratchRsrcReg) - .addReg(SPReg) - .addImm(Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .addImm(0) // dlc - .addImm(0) // swz - .addMemOperand(MMO); return; } @@ -175,8 +176,8 @@ .addImm(Offset); BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_STORE_DWORD_SADDR)) - .addReg(SpillReg, RegState::Kill) - .addReg(OffsetReg, HasOffsetReg ? RegState::Kill : 0) + .addReg(SpillReg, getKillRegState(IsKill)) + .addReg(OffsetReg, getKillRegState(HasOffsetReg)) .addImm(0) // offset .addImm(0) // glc .addImm(0) // slc @@ -197,7 +198,7 @@ .addImm(Offset); BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN)) - .addReg(SpillReg, RegState::Kill) + .addReg(SpillReg, getKillRegState(IsKill)) .addReg(OffsetReg, RegState::Kill) .addReg(ScratchRsrcReg) .addReg(SPReg) @@ -215,7 +216,7 @@ .addImm(Offset); BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET)) - .addReg(SpillReg, RegState::Kill) + .addReg(SpillReg, getKillRegState(IsKill)) .addReg(ScratchRsrcReg) .addReg(SPReg) .addImm(0) // offset @@ -859,6 +860,64 @@ return ScratchExecCopy; } +// Save or restore the frame or base pointer to memory. +// +// 1. Save exec into ScratchExecCopy if not already saved +// 2. Scavenge a TmpVGPR and save it to scratch if not already done +// 3. 
When IsProlog: Write SpillSGPR into TmpVGPR and save to FrameIndex +// When !IsProlog: Load TmpVGPR from FrameIndex and read into SpillSGPR +static void +buildSpillLoadStore(LivePhysRegs &LiveRegs, SIMachineFunctionInfo *FuncInfo, + const GCNSubtarget &ST, const SIInstrInfo *TII, + const MachineFrameInfo &MFI, MachineRegisterInfo &MRI, + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, bool IsProlog, + Register StackPtrReg, Register &ScratchExecCopy, + MCRegister &TmpVGPR, Register SpillSGPR, int FrameIndex) { + assert(!MFI.isDeadObjectIndex(FrameIndex)); + + if (!ScratchExecCopy) + ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, IsProlog); + + if (!TmpVGPR) { + TmpVGPR = findScratchNonCalleeSaveRegister(MRI, LiveRegs, + AMDGPU::VGPR_32RegClass); + if (!TmpVGPR) + report_fatal_error("failed to find free scratch register"); + // Reserve temporary stack slot + if (!FuncInfo->SpillSGPRTmpIndex) { + assert(IsProlog && "Frame index should have been reserved in the prolog"); + MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + FuncInfo->SpillSGPRTmpIndex = + FrameInfo.CreateSpillStackObject(4, Align(4)); + } + BuildMI(MBB, MBBI, DebugLoc(), TII->get(AMDGPU::COPY_INACTIVE_LANES), + TmpVGPR); + + // Save TmpVGPR + buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, + FuncInfo->getScratchRSrcReg(), StackPtrReg, + *FuncInfo->SpillSGPRTmpIndex, false); + } + + if (IsProlog) { + BuildMI(MBB, MBBI, DebugLoc(), TII->get(AMDGPU::V_WRITELANE_B32), TmpVGPR) + .addReg(SpillSGPR) + .addImm(0) + .addReg(TmpVGPR); + + buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, + FuncInfo->getScratchRSrcReg(), StackPtrReg, FrameIndex, + false); + } else { + buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, + FuncInfo->getScratchRSrcReg(), StackPtrReg, FrameIndex); + BuildMI(MBB, MBBI, DebugLoc(), TII->get(AMDGPU::V_READFIRSTLANE_B32), + SpillSGPR) + .addReg(TmpVGPR); + } +} + void SIFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { SIMachineFunctionInfo *FuncInfo = MF.getInfo(); @@ -889,6 +948,7 @@ // To avoid clobbering VGPRs in lanes that weren't active on function entry, // turn on all lanes before doing the spill to memory. 
Register ScratchExecCopy; + MCRegister TmpVGPR; bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue(); bool SpillFPToMemory = false; @@ -910,68 +970,52 @@ for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg : FuncInfo->getSGPRSpillVGPRs()) { - if (!Reg.FI.hasValue()) + if (!Reg.FI) continue; if (!ScratchExecCopy) ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, Reg.VGPR, - FuncInfo->getScratchRSrcReg(), - StackPtrReg, - Reg.FI.getValue()); + FuncInfo->getScratchRSrcReg(), StackPtrReg, *Reg.FI); } if (HasFPSaveIndex && SpillFPToMemory) { - assert(!MFI.isDeadObjectIndex(FuncInfo->FramePointerSaveIndex.getValue())); - - if (!ScratchExecCopy) - ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); - - MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( - MRI, LiveRegs, AMDGPU::VGPR_32RegClass); - if (!TmpVGPR) - report_fatal_error("failed to find free scratch register"); - - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) - .addReg(FramePtrReg); - - buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, - FuncInfo->getScratchRSrcReg(), StackPtrReg, - FuncInfo->FramePointerSaveIndex.getValue()); + buildSpillLoadStore(LiveRegs, FuncInfo, ST, TII, MFI, MRI, MF, MBB, MBBI, + /*IsProlog*/ true, StackPtrReg, ScratchExecCopy, + TmpVGPR, FramePtrReg, *FuncInfo->FramePointerSaveIndex); } if (HasBPSaveIndex && SpillBPToMemory) { - assert(!MFI.isDeadObjectIndex(*FuncInfo->BasePointerSaveIndex)); - - if (!ScratchExecCopy) - ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); - - MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( - MRI, LiveRegs, AMDGPU::VGPR_32RegClass); - if (!TmpVGPR) - report_fatal_error("failed to find free scratch register"); - - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) - .addReg(BasePtrReg); + buildSpillLoadStore(LiveRegs, FuncInfo, ST, TII, MFI, MRI, MF, MBB, MBBI, + /*IsProlog*/ true, StackPtrReg, ScratchExecCopy, + TmpVGPR, BasePtrReg, *FuncInfo->BasePointerSaveIndex); + } - buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, - FuncInfo->getScratchRSrcReg(), StackPtrReg, - *FuncInfo->BasePointerSaveIndex); + if (TmpVGPR) { + // Restore TmpVGPR + buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, + FuncInfo->getScratchRSrcReg(), StackPtrReg, + *FuncInfo->SpillSGPRTmpIndex); + // If TmpVGPR is set, ScratchExecCopy must also be set. We add an implicit + // use and kill to the exec restore below, so this reload cannot get + // eliminated. } if (ScratchExecCopy) { // FIXME: Split block and make terminator. unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) - .addReg(ScratchExecCopy, RegState::Kill); + auto I = BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) + .addReg(ScratchExecCopy, RegState::Kill); + if (TmpVGPR) + I.addReg(TmpVGPR, RegState::ImplicitKill); LiveRegs.addReg(ScratchExecCopy); } // In this case, spill the FP to a reserved VGPR. if (HasFPSaveIndex && !SpillFPToMemory) { - const int FI = FuncInfo->FramePointerSaveIndex.getValue(); + const int FI = *FuncInfo->FramePointerSaveIndex; assert(!MFI.isDeadObjectIndex(FI)); assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); @@ -980,7 +1024,6 @@ assert(Spill.size() == 1); // Save FP before setting it up. 
- // FIXME: This should respect spillSGPRToVGPR; BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR) .addReg(FramePtrReg) .addImm(Spill[0].Lane) @@ -998,7 +1041,6 @@ assert(Spill.size() == 1); // Save BP before setting it up. - // FIXME: This should respect spillSGPRToVGPR; BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR) .addReg(BasePtrReg) .addImm(Spill[0].Lane) @@ -1106,7 +1148,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { - const SIMachineFunctionInfo *FuncInfo = MF.getInfo(); + SIMachineFunctionInfo *FuncInfo = MF.getInfo(); if (FuncInfo->isEntryFunction()) return; @@ -1162,21 +1204,14 @@ } Register ScratchExecCopy; + MCRegister TmpVGPR; if (HasFPSaveIndex) { - const int FI = FuncInfo->FramePointerSaveIndex.getValue(); + const int FI = *FuncInfo->FramePointerSaveIndex; assert(!MFI.isDeadObjectIndex(FI)); if (SpillFPToMemory) { - if (!ScratchExecCopy) - ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false); - - MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister( - MRI, LiveRegs, AMDGPU::VGPR_32RegClass); - if (!TempVGPR) - report_fatal_error("failed to find free scratch register"); - buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, TempVGPR, - FuncInfo->getScratchRSrcReg(), StackPtrReg, FI); - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg) - .addReg(TempVGPR, RegState::Kill); + buildSpillLoadStore(LiveRegs, FuncInfo, ST, TII, MFI, MRI, MF, MBB, MBBI, + /*IsProlog*/ false, StackPtrReg, ScratchExecCopy, + TmpVGPR, FramePtrReg, FI); } else { // Reload from VGPR spill. assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); @@ -1190,25 +1225,17 @@ } if (HasBPSaveIndex) { - const int BasePtrFI = *FuncInfo->BasePointerSaveIndex; - assert(!MFI.isDeadObjectIndex(BasePtrFI)); + const int FI = *FuncInfo->BasePointerSaveIndex; + assert(!MFI.isDeadObjectIndex(FI)); if (SpillBPToMemory) { - if (!ScratchExecCopy) - ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false); - - MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister( - MRI, LiveRegs, AMDGPU::VGPR_32RegClass); - if (!TempVGPR) - report_fatal_error("failed to find free scratch register"); - buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, TempVGPR, - FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI); - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg) - .addReg(TempVGPR, RegState::Kill); + buildSpillLoadStore(LiveRegs, FuncInfo, ST, TII, MFI, MRI, MF, MBB, MBBI, + /*IsProlog*/ false, StackPtrReg, ScratchExecCopy, + TmpVGPR, BasePtrReg, FI); } else { // Reload from VGPR spill. 
- assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill); + assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); ArrayRef Spill = - FuncInfo->getSGPRToVGPRSpills(BasePtrFI); + FuncInfo->getSGPRToVGPRSpills(FI); assert(Spill.size() == 1); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), BasePtrReg) .addReg(Spill[0].VGPR) @@ -1218,23 +1245,34 @@ for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg : FuncInfo->getSGPRSpillVGPRs()) { - if (!Reg.FI.hasValue()) + if (!Reg.FI) continue; if (!ScratchExecCopy) ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false); buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, Reg.VGPR, + FuncInfo->getScratchRSrcReg(), StackPtrReg, *Reg.FI); + } + + if (TmpVGPR) { + // Restore TmpVGPR + buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, FuncInfo->getScratchRSrcReg(), StackPtrReg, - Reg.FI.getValue()); + *FuncInfo->SpillSGPRTmpIndex); + // If TmpVGPR is set, ScratchExecCopy must also be set. We add an implicit + // use and kill to the exec restore below, so this reload cannot get + // eliminated. } if (ScratchExecCopy) { // FIXME: Split block and make terminator. unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) - .addReg(ScratchExecCopy, RegState::Kill); + auto I = BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) + .addReg(ScratchExecCopy, RegState::Kill); + if (TmpVGPR) + I.addReg(TmpVGPR, RegState::ImplicitKill); } } diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -539,16 +539,24 @@ ; With no free registers, we must spill the FP to memory. ; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory: ; MUBUF: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33 -; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:4 -; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]] +; MUBUF-NEXT: ;;#COPY_INACTIVE_LANES [[TMP_VGPR1:v[0-9]+]] +; MUBUF-NEXT: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: v_writelane_b32 [[TMP_VGPR1]], s33, 0 +; MUBUF-NEXT: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_load_dword [[TMP_VGPR1]], off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; FLATSCR: s_mov_b32 s0, s33 ; GCN: s_mov_b32 s33, s32 ; MUBUF: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:4 +; MUBUF-NEXT: ;;#COPY_INACTIVE_LANES [[TMP_VGPR2:v[0-9]+]] +; MUBUF-NEXT: buffer_store_dword [[TMP_VGPR2]], off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_load_dword [[TMP_VGPR2]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_readfirstlane_b32 s33, [[TMP_VGPR2]] +; MUBUF-NEXT: buffer_load_dword [[TMP_VGPR2]], off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: s_waitcnt vmcnt(0) ; FLATSCR: s_mov_b32 s33, s0 -; MUBUF: s_waitcnt vmcnt(0) -; MUBUF: v_readfirstlane_b32 s33, [[TMP_VGPR2]] ; MUBUF: s_mov_b64 exec, [[COPY_EXEC2]] ; GCN: s_setpc_b64 ; MUBUF: ScratchSize: 8 @@ -573,7 +581,7 @@ ; VGPR. 
; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory_full_reserved_vgpr: ; MUBUF: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33 +; MUBUF: v_writelane_b32 [[TMP_VGPR1:v[0-9]+]], s33, 0 ; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:[[OFF:[0-9]+]] ; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NOT: v_writelane_b32 v40, s33 @@ -645,13 +653,27 @@ ; scratch VGPR to hold the offset. ; GCN-LABEL: {{^}}spill_fp_to_memory_scratch_reg_needed_mubuf_offset ; MUBUF: s_or_saveexec_b64 s[4:5], -1 -; MUBUF: v_mov_b32_e32 v0, s33 +; MUBUF-NEXT: ;;#COPY_INACTIVE_LANES v0 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: v_writelane_b32 v0, s33, 0 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1008 +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: s_waitcnt vmcnt(0) ; GCN-NOT: v_mov_b32_e32 v0, 0x1008 -; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1008 -; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen ; 4-byte Folded Spill ; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s33, 0x1004 ; FLATSCR: v_mov_b32_e32 v0, 0 ; FLATSCR: scratch_store_dword off, v0, [[SOFF]] +; MUBUF: s_or_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: ;;#COPY_INACTIVE_LANES v0 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: v_mov_b32_e32 v0, 0x1008 +; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], s32 offen ; 4-byte Folded Reload +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_readfirstlane_b32 s33, v0 +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset([4096 x i8] addrspace(5)* byval([4096 x i8]) align 4 %arg) #3 { %alloca = alloca i32, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll @@ -1,27 +1,105 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-spill-sgpr-to-vgpr=true < %s | FileCheck -check-prefixes=GCN,SPILL-TO-VGPR %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-spill-sgpr-to-vgpr=false < %s | FileCheck -check-prefixes=GCN,NO-SPILL-TO-VGPR %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-spill-sgpr-to-vgpr=true < %s | FileCheck -check-prefix=SPILL-TO-VGPR %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-spill-sgpr-to-vgpr=false < %s | FileCheck -check-prefix=NO-SPILL-TO-VGPR %s ; Check frame setup where SGPR spills to VGPRs are disabled or enabled. 
declare hidden void @external_void_func_void() #0 -; GCN-LABEL: {{^}}callee_with_stack_and_call: -; SPILL-TO-VGPR: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SPILL-TO-VGPR: v_writelane_b32 v40, s33, 2 -; NO-SPILL-TO-VGPR: v_mov_b32_e32 v0, s33 -; NO-SPILL-TO-VGPR: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill - -; GCN: s_swappc_b64 s[30:31], s[4:5] - -; SPILL-TO-VGPR: v_readlane_b32 s4, v40, 0 -; SPILL-TO-VGPR: v_readlane_b32 s5, v40, 1 -; NO-SPILL-TO-VGPR: v_readlane_b32 s4, v0, 0 -; NO-SPILL-TO-VGPR: v_readlane_b32 s5, v0, 1 - -; SPILL-TO-VGPR: v_readlane_b32 s33, v40, 2 -; NO-SPILL-TO-VGPR: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; NO-SPILL-TO-VGPR: v_readfirstlane_b32 s33, v0 define void @callee_with_stack_and_call() #0 { +; SPILL-TO-VGPR-LABEL: callee_with_stack_and_call: +; SPILL-TO-VGPR: ; %bb.0: +; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SPILL-TO-VGPR-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] +; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s33, 2 +; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s30, 0 +; SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s32 +; SPILL-TO-VGPR-NEXT: s_add_u32 s32, s32, 0x400 +; SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0 +; SPILL-TO-VGPR-NEXT: s_getpc_b64 s[4:5] +; SPILL-TO-VGPR-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; SPILL-TO-VGPR-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s31, 1 +; SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) +; SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SPILL-TO-VGPR-NEXT: v_readlane_b32 s4, v40, 0 +; SPILL-TO-VGPR-NEXT: v_readlane_b32 s5, v40, 1 +; SPILL-TO-VGPR-NEXT: s_sub_u32 s32, s32, 0x400 +; SPILL-TO-VGPR-NEXT: v_readlane_b32 s33, v40, 2 +; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[6:7], -1 +; SPILL-TO-VGPR-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[6:7] +; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) +; SPILL-TO-VGPR-NEXT: s_setpc_b64 s[4:5] +; +; NO-SPILL-TO-VGPR-LABEL: callee_with_stack_and_call: +; NO-SPILL-TO-VGPR: ; %bb.0: +; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; NO-SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[4:5], -1 +; NO-SPILL-TO-VGPR-NEXT: ;;#COPY_INACTIVE_LANES v0 +; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; NO-SPILL-TO-VGPR-NEXT: v_writelane_b32 v0, s33, 0 +; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) +; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] +; NO-SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s32 +; NO-SPILL-TO-VGPR-NEXT: ;;#COPY_INACTIVE_LANES v0 +; NO-SPILL-TO-VGPR-NEXT: s_add_u32 s32, s32, 0x800 +; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill +; NO-SPILL-TO-VGPR-NEXT: s_not_b64 exec, exec +; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill +; NO-SPILL-TO-VGPR-NEXT: s_not_b64 exec, exec +; NO-SPILL-TO-VGPR-NEXT: v_writelane_b32 v0, s30, 0 +; NO-SPILL-TO-VGPR-NEXT: v_writelane_b32 v0, s31, 1 +; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[30:31], exec +; 
NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 3 +; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[30:31] +; NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v0, 1 +; NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v0, 0 +; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload +; NO-SPILL-TO-VGPR-NEXT: s_not_b64 exec, exec +; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload +; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) +; NO-SPILL-TO-VGPR-NEXT: s_not_b64 exec, exec +; NO-SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0 +; NO-SPILL-TO-VGPR-NEXT: s_getpc_b64 s[4:5] +; NO-SPILL-TO-VGPR-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; NO-SPILL-TO-VGPR-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) +; NO-SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5] +; NO-SPILL-TO-VGPR-NEXT: ;;#COPY_INACTIVE_LANES v0 +; NO-SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[4:5], -1 +; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill +; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] +; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[4:5], exec +; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 3 +; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] +; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) +; NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s4, v0, 0 +; NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s5, v0, 1 +; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload +; NO-SPILL-TO-VGPR-NEXT: s_not_b64 exec, exec +; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload +; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) +; NO-SPILL-TO-VGPR-NEXT: s_not_b64 exec, exec +; NO-SPILL-TO-VGPR-NEXT: s_sub_u32 s32, s32, 0x800 +; NO-SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[6:7], -1 +; NO-SPILL-TO-VGPR-NEXT: ;;#COPY_INACTIVE_LANES v0 +; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) +; NO-SPILL-TO-VGPR-NEXT: v_readfirstlane_b32 s33, v0 +; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) +; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[6:7] +; NO-SPILL-TO-VGPR-NEXT: s_setpc_b64 s[4:5] %alloca = alloca i32, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca call void @external_void_func_void() diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -258,10 +258,14 @@ ; GCN-LABEL: no_free_regs_spill_bp_to_mem ; GCN: s_or_saveexec_b64 s[4:5], -1 -; GCN: v_mov_b32_e32 v0, s33 -; GCN: buffer_store_dword v0, off, s[0:3], s32 -; GCN: v_mov_b32_e32 v0, s34 -; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 +; GCN-NEXT: ;;#COPY_INACTIVE_LANES v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v0, s33, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v0, s34, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:136 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) %local_val = alloca i32, align 128, addrspace(5) store volatile i32 %b, i32 addrspace(5)* %local_val, align 128 @@ -291,14 +295,16 @@ ; GCN-LABEL: spill_bp_to_memory_scratch_reg_needed_mubuf_offset ; GCN: s_or_saveexec_b64 s[4:5], -1 -; GCN: v_mov_b32_e32 v0, s33 -; GCN-NOT: v_mov_b32_e32 v0, 0x1084 +; GCN-NEXT: ;;#COPY_INACTIVE_LANES v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v0, s33, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0x1084 -; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen -; GCN: v_mov_b32_e32 v0, s34 -; GCN-NOT: v_mov_b32_e32 v0, 0x1088 +; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v0, s34, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0x1088 -; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen +; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) %local_val = alloca i32, align 128, addrspace(5) store volatile i32 %b, i32 addrspace(5)* %local_val, align 128
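
The net effect of the SIFrameLowering changes, spelled out for the prologue FP case, is the sequence below. This is a condensed sketch that only restates calls already made in this patch, using its own helpers and names; SpillSGPRTmpIndex is the scratch-slot field this change is assumed to add to SIMachineFunctionInfo, and COPY_INACTIVE_LANES is the pseudo introduced alongside it.

  // All lanes must be on while touching scratch, so exec is saved first.
  ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog=*/true);

  // Preserve the inactive lanes of the scavenged VGPR, then spill its current
  // contents.  IsKill=false is the new buildPrologSpill parameter: it keeps
  // TmpVGPR live so it can be reused below.
  BuildMI(MBB, MBBI, DebugLoc(), TII->get(AMDGPU::COPY_INACTIVE_LANES), TmpVGPR);
  buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR,
                   FuncInfo->getScratchRSrcReg(), StackPtrReg,
                   *FuncInfo->SpillSGPRTmpIndex, /*IsKill=*/false);

  // Move s33 into lane 0 and store it to the FP save slot; again keep TmpVGPR
  // live so its original value can be reloaded before exec is restored.
  BuildMI(MBB, MBBI, DebugLoc(), TII->get(AMDGPU::V_WRITELANE_B32), TmpVGPR)
      .addReg(FramePtrReg)
      .addImm(0)
      .addReg(TmpVGPR);
  buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR,
                   FuncInfo->getScratchRSrcReg(), StackPtrReg,
                   *FuncInfo->FramePointerSaveIndex, /*IsKill=*/false);

  // Reload TmpVGPR's original value and restore exec.  The ImplicitKill use of
  // TmpVGPR on the exec restore is what keeps this reload from being deleted
  // as dead.
  buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR,
                    FuncInfo->getScratchRSrcReg(), StackPtrReg,
                    *FuncInfo->SpillSGPRTmpIndex);
  BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
      .addReg(ScratchExecCopy, RegState::Kill)
      .addReg(TmpVGPR, RegState::ImplicitKill);

The epilogue mirrors this through the same buildSpillLoadStore helper with IsProlog=false: it reloads the saved word from the FP/BP frame index and uses V_READFIRSTLANE_B32 to move it back into s33 (or s34 for the base pointer) before restoring TmpVGPR and exec.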