diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -33,6 +33,8 @@ RegScavenger *RS = nullptr) const override; void determineCalleeSavesSGPR(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS = nullptr) const; + void determinePrologEpilogSGPRSaves(MachineFunction &MF, + BitVector &SavedRegs) const; bool assignCalleeSavedSpillSlots(MachineFunction &MF, const TargetRegisterInfo *TRI, diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -26,6 +26,18 @@ cl::ReallyHidden, cl::init(true)); +// Find a register matching \p RC from \p LiveRegs which is unused and available +// throughout the function. On failure, returns AMDGPU::NoRegister. +static MCRegister findUnusedRegister(MachineRegisterInfo &MRI, + const LivePhysRegs &LiveRegs, + const TargetRegisterClass &RC) { + for (MCRegister Reg : RC) { + if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg)) + return Reg; + } + return MCRegister(); +} + // Find a scratch register that we can use in the prologue. We avoid using // callee-save registers since they may appear to be free when this is called // from canUseAsPrologue (during shrink wrapping), but then no longer be free @@ -39,65 +51,70 @@ for (unsigned i = 0; CSRegs[i]; ++i) LiveRegs.addReg(CSRegs[i]); - if (Unused) { - // We are looking for a register that can be used throughout the entire - // function, so any use is unacceptable. - for (MCRegister Reg : RC) { - if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg)) - return Reg; - } - } else { - for (MCRegister Reg : RC) { - if (LiveRegs.available(MRI, Reg)) - return Reg; - } + // We are looking for a register that can be used throughout the entire + // function, so any use is unacceptable. + if (Unused) + return findUnusedRegister(MRI, LiveRegs, RC); + + for (MCRegister Reg : RC) { + if (LiveRegs.available(MRI, Reg)) + return Reg; } return MCRegister(); } -static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF, - LivePhysRegs &LiveRegs, - Register &TempSGPR, - std::optional &FrameIndex, - bool IsFP) { +static void getVGPRSpillLaneOrTempRegister( + MachineFunction &MF, LivePhysRegs &LiveRegs, Register SGPR, + const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass) { SIMachineFunctionInfo *MFI = MF.getInfo(); MachineFrameInfo &FrameInfo = MF.getFrameInfo(); const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); + unsigned Size = TRI->getSpillSize(RC); + Align Alignment = TRI->getSpillAlign(RC); - // We need to save and restore the current FP/BP. + // We need to save and restore the given SGPR. - // 1: Try to save the FP/BP in an unused SGPR. - TempSGPR = findScratchNonCalleeSaveRegister( - MF.getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true); + // 1: Try to save the given register into an unused scratch SGPR. The LiveRegs + // should have all the callee saved registers marked as used. + Register ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveRegs, RC); - if (!TempSGPR) { - int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr, - TargetStackID::SGPRSpill); + if (!ScratchSGPR) { + int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr, + TargetStackID::SGPRSpill); - if (TRI->spillSGPRToVGPR() && MFI->allocateSGPRSpillToVGPRLane( - MF, NewFI, /* IsPrologEpilog */ true)) { - // 2: There's no free lane to spill, and no free register to save FP/BP, - // so we're forced to spill another VGPR to use for the spill. - FrameIndex = NewFI; + if (TRI->spillSGPRToVGPR() && + MFI->allocateSGPRSpillToVGPRLane(MF, FI, /* IsPrologEpilog */ true)) { + // 2: There's no free lane to spill, and no free register to save the + // SGPR, so we're forced to take another VGPR to use for the spill. + MFI->addToPrologEpilogSGPRSpills( + SGPR, PrologEpilogSGPRSaveRestoreInfo( + SGPRSaveKind::SPILL_TO_VGPR_LANE, FI)); LLVM_DEBUG( - auto Spill = MFI->getPrologEpilogSGPRSpillToVGPRLanes(NewFI).front(); - dbgs() << (IsFP ? "FP" : "BP") << " requires fallback spill to " + auto Spill = MFI->getPrologEpilogSGPRSpillToVGPRLanes(FI).front(); + dbgs() << printReg(SGPR, TRI) << " requires fallback spill to " << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';); } else { - // Remove dead index - MF.getFrameInfo().RemoveStackObject(NewFI); - // 3: If all else fails, spill the FP/BP to memory. - FrameIndex = FrameInfo.CreateSpillStackObject(4, Align(4)); - LLVM_DEBUG(dbgs() << "Reserved FI " << FrameIndex << " for spilling " - << (IsFP ? "FP" : "BP") << '\n'); + // Remove dead index + MF.getFrameInfo().RemoveStackObject(FI); + // 3: If all else fails, spill the register to memory. + FI = FrameInfo.CreateSpillStackObject(Size, Alignment); + MFI->addToPrologEpilogSGPRSpills( + SGPR, + PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI)); + LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling " + << printReg(SGPR, TRI) << '\n'); } } else { - LLVM_DEBUG(dbgs() << "Saving " << (IsFP ? "FP" : "BP") << " with copy to " - << printReg(TempSGPR, TRI) << '\n'); + MFI->addToPrologEpilogSGPRSpills( + SGPR, PrologEpilogSGPRSaveRestoreInfo( + SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR)); + LiveRegs.addReg(ScratchSGPR); + LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to " + << printReg(ScratchSGPR, TRI) << '\n'); } } @@ -109,7 +126,7 @@ LivePhysRegs &LiveRegs, MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, - Register SpillReg, int FI) { + Register SpillReg, int FI, int64_t DwordOff = 0) { unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR : AMDGPU::BUFFER_STORE_DWORD_OFFSET; @@ -121,19 +138,17 @@ LiveRegs.addReg(SpillReg); bool IsKill = !MBB.isLiveIn(SpillReg); TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, - FuncInfo.getStackPtrOffsetReg(), 0, MMO, nullptr, - &LiveRegs); + FuncInfo.getStackPtrOffsetReg(), DwordOff, MMO, + nullptr, &LiveRegs); if (IsKill) LiveRegs.removeReg(SpillReg); } -static void buildEpilogRestore(const GCNSubtarget &ST, - const SIRegisterInfo &TRI, - const SIMachineFunctionInfo &FuncInfo, - LivePhysRegs &LiveRegs, MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - const DebugLoc &DL, Register SpillReg, int FI) { +static void buildEpilogRestore( + const GCNSubtarget &ST, const SIRegisterInfo &TRI, + const SIMachineFunctionInfo &FuncInfo, LivePhysRegs &LiveRegs, + MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + const DebugLoc &DL, Register SpillReg, int FI, int64_t DwordOff = 0) { unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; @@ -143,8 +158,8 @@ PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI), FrameInfo.getObjectAlign(FI)); TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, - FuncInfo.getStackPtrOffsetReg(), 0, MMO, nullptr, - &LiveRegs); + FuncInfo.getStackPtrOffsetReg(), DwordOff, MMO, + nullptr, &LiveRegs); } static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, @@ -172,6 +187,182 @@ .addReg(GitPtrLo); } +static void initLiveRegs(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI, + const SIMachineFunctionInfo *FuncInfo, + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, bool IsProlog) { + if (LiveRegs.empty()) { + LiveRegs.init(TRI); + if (IsProlog) { + LiveRegs.addLiveIns(MBB); + } else { + // In epilog. + LiveRegs.addLiveOuts(MBB); + LiveRegs.stepBackward(*MBBI); + } + } +} + +namespace llvm { + +// SpillBuilder to save/restore special SGPR spills like the one needed for FP, +// BP, etc. These spills are delayed until the current function's frame is +// finalized. For a given register, the builder uses the +// PrologEpilogSGPRSaveRestoreInfo to decide the spill method. +class PrologEpilogSGPRSpillBuilder { + MachineBasicBlock::iterator MI; + MachineBasicBlock &MBB; + MachineFunction &MF; + const GCNSubtarget &ST; + MachineFrameInfo &MFI; + SIMachineFunctionInfo *FuncInfo; + const SIInstrInfo *TII; + const SIRegisterInfo &TRI; + Register SuperReg; + const PrologEpilogSGPRSaveRestoreInfo SI; + LivePhysRegs &LiveRegs; + const DebugLoc &DL; + ArrayRef SplitParts; + unsigned NumSubRegs; + unsigned EltSize = 4; + + void saveToMemory(const int FI) const { + MachineRegisterInfo &MRI = MF.getRegInfo(); + assert(!MFI.isDeadObjectIndex(FI)); + + initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true); + + MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( + MRI, LiveRegs, AMDGPU::VGPR_32RegClass); + if (!TmpVGPR) + report_fatal_error("failed to find free scratch register"); + + for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) { + Register SubReg = NumSubRegs == 1 + ? SuperReg + : Register(TRI.getSubReg(SuperReg, SplitParts[I])); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) + .addReg(SubReg); + + buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MI, DL, TmpVGPR, + FI, DwordOff); + DwordOff += 4; + } + } + + void saveToVGPRLane(const int FI) const { + assert(!MFI.isDeadObjectIndex(FI)); + + assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); + ArrayRef Spill = + FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI); + assert(Spill.size() == NumSubRegs); + + for (unsigned I = 0; I < NumSubRegs; ++I) { + Register SubReg = NumSubRegs == 1 + ? SuperReg + : Register(TRI.getSubReg(SuperReg, SplitParts[I])); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[I].VGPR) + .addReg(SubReg) + .addImm(Spill[I].Lane) + .addReg(Spill[I].VGPR, RegState::Undef); + } + } + + void copyToScratchSGPR(Register DstReg) const { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), DstReg) + .addReg(SuperReg) + .setMIFlag(MachineInstr::FrameSetup); + } + + void restoreFromMemory(const int FI) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + + initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false); + MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( + MRI, LiveRegs, AMDGPU::VGPR_32RegClass); + if (!TmpVGPR) + report_fatal_error("failed to find free scratch register"); + + for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) { + Register SubReg = NumSubRegs == 1 + ? SuperReg + : Register(TRI.getSubReg(SuperReg, SplitParts[I])); + + buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MI, DL, TmpVGPR, + FI, DwordOff); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) + .addReg(TmpVGPR, RegState::Kill); + DwordOff += 4; + } + } + + void restoreFromVGPRLane(const int FI) { + assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); + ArrayRef Spill = + FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI); + assert(Spill.size() == NumSubRegs); + + for (unsigned I = 0; I < NumSubRegs; ++I) { + Register SubReg = NumSubRegs == 1 + ? SuperReg + : Register(TRI.getSubReg(SuperReg, SplitParts[I])); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg) + .addReg(Spill[I].VGPR) + .addImm(Spill[I].Lane); + } + } + + void copyFromScratchSGPR(Register SrcReg) const { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), SuperReg) + .addReg(SrcReg) + .setMIFlag(MachineInstr::FrameDestroy); + } + +public: + PrologEpilogSGPRSpillBuilder(Register Reg, + const PrologEpilogSGPRSaveRestoreInfo SI, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const DebugLoc &DL, const SIInstrInfo *TII, + const SIRegisterInfo &TRI, + LivePhysRegs &LiveRegs) + : MI(MI), MBB(MBB), MF(*MBB.getParent()), + ST(MF.getSubtarget()), MFI(MF.getFrameInfo()), + FuncInfo(MF.getInfo()), TII(TII), TRI(TRI), + SuperReg(Reg), SI(SI), LiveRegs(LiveRegs), DL(DL) { + const TargetRegisterClass *RC = TRI.getPhysRegClass(SuperReg); + SplitParts = TRI.getRegSplitParts(RC, EltSize); + NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); + + assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); + } + + void save() { + switch (SI.getKind()) { + case SGPRSaveKind::SPILL_TO_MEM: + return saveToMemory(SI.getIndex()); + case SGPRSaveKind::SPILL_TO_VGPR_LANE: + return saveToVGPRLane(SI.getIndex()); + case SGPRSaveKind::COPY_TO_SCRATCH_SGPR: + return copyToScratchSGPR(SI.getReg()); + } + } + + void restore() { + switch (SI.getKind()) { + case SGPRSaveKind::SPILL_TO_MEM: + return restoreFromMemory(SI.getIndex()); + case SGPRSaveKind::SPILL_TO_VGPR_LANE: + return restoreFromVGPRLane(SI.getIndex()); + case SGPRSaveKind::COPY_TO_SCRATCH_SGPR: + return copyFromScratchSGPR(SI.getReg()); + } + } +}; + +} // namespace llvm + // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()` void SIFrameLowering::emitEntryFunctionFlatScratchInit( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, @@ -671,22 +862,6 @@ llvm_unreachable("Invalid TargetStackID::Value"); } -static void initLiveRegs(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI, - const SIMachineFunctionInfo *FuncInfo, - MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, bool IsProlog) { - if (LiveRegs.empty()) { - LiveRegs.init(TRI); - if (IsProlog) { - LiveRegs.addLiveIns(MBB); - } else { - // In epilog. - LiveRegs.addLiveOuts(MBB); - LiveRegs.stepBackward(*MBBI); - } - } -} - // Activate all lanes, returns saved exec. static Register buildScratchExecCopy(LivePhysRegs &LiveRegs, MachineFunction &MF, @@ -718,13 +893,6 @@ return ScratchExecCopy; } -// A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR. -// Otherwise we are spilling to memory. -static bool spilledToMemory(const MachineFunction &MF, int SaveIndex) { - const MachineFrameInfo &MFI = MF.getFrameInfo(); - return MFI.getStackID(SaveIndex) != TargetStackID::SGPRSpill; -} - void SIFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { SIMachineFunctionInfo *FuncInfo = MF.getInfo(); @@ -734,7 +902,6 @@ } MachineFrameInfo &MFI = MF.getFrameInfo(); - MachineRegisterInfo &MRI = MF.getRegInfo(); const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); @@ -758,9 +925,6 @@ // turn on all lanes before doing the spill to memory. Register ScratchExecCopy; - std::optional FPSaveIndex = FuncInfo->FramePointerSaveIndex; - std::optional BPSaveIndex = FuncInfo->BasePointerSaveIndex; - // Spill Whole-Wave Mode VGPRs. for (const auto &Reg : FuncInfo->getWWMSpills()) { Register VGPR = Reg.first; @@ -781,86 +945,26 @@ LiveRegs.addReg(ScratchExecCopy); } - auto SaveSGPRToMemory = [&](Register Reg, const int FI) { - assert(!MFI.isDeadObjectIndex(FI)); - - initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true); - - MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( - MRI, LiveRegs, AMDGPU::VGPR_32RegClass); - if (!TmpVGPR) - report_fatal_error("failed to find free scratch register"); - - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) - .addReg(Reg); - - buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, TmpVGPR, - FI); - }; - - auto SaveSGPRToVGPRLane = [&](Register Reg, const int FI) { - assert(!MFI.isDeadObjectIndex(FI)); - - assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); - ArrayRef Spill = - FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI); - assert(Spill.size() == 1); - - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR) - .addReg(Reg) - .addImm(Spill[0].Lane) - .addReg(Spill[0].VGPR, RegState::Undef); - }; - - if (FPSaveIndex) { - if (spilledToMemory(MF, *FPSaveIndex)) - SaveSGPRToMemory(FramePtrReg, *FPSaveIndex); - else - SaveSGPRToVGPRLane(FramePtrReg, *FPSaveIndex); - } - - // Emit the copy if we need an FP, and are using a free SGPR to save it. - if (FuncInfo->SGPRForFPSaveRestoreCopy) { - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), - FuncInfo->SGPRForFPSaveRestoreCopy) - .addReg(FramePtrReg) - .setMIFlag(MachineInstr::FrameSetup); + for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) { + PrologEpilogSGPRSpillBuilder SB(Spill.first, Spill.second, MBB, MBBI, DL, + TII, TRI, LiveRegs); + SB.save(); } - if (BPSaveIndex) { - if (spilledToMemory(MF, *BPSaveIndex)) - SaveSGPRToMemory(BasePtrReg, *BPSaveIndex); - else - SaveSGPRToVGPRLane(BasePtrReg, *BPSaveIndex); - } - - // Emit the copy if we need a BP, and are using a free SGPR to save it. - if (FuncInfo->SGPRForBPSaveRestoreCopy) { - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), - FuncInfo->SGPRForBPSaveRestoreCopy) - .addReg(BasePtrReg) - .setMIFlag(MachineInstr::FrameSetup); - } - - // If a copy has been emitted for FP and/or BP, Make the SGPRs - // used in the copy instructions live throughout the function. - SmallVector TempSGPRs; - if (FuncInfo->SGPRForFPSaveRestoreCopy) - TempSGPRs.push_back(FuncInfo->SGPRForFPSaveRestoreCopy); - - if (FuncInfo->SGPRForBPSaveRestoreCopy) - TempSGPRs.push_back(FuncInfo->SGPRForBPSaveRestoreCopy); - - if (!TempSGPRs.empty()) { + // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make + // such scratch registers live throughout the function. + SmallVector ScratchSGPRs; + FuncInfo->getAllScratchSGPRCopyDstRegs(ScratchSGPRs); + if (!ScratchSGPRs.empty()) { for (MachineBasicBlock &MBB : MF) { - for (MCPhysReg Reg : TempSGPRs) + for (MCPhysReg Reg : ScratchSGPRs) MBB.addLiveIn(Reg); MBB.sortUniqueLiveIns(); } if (!LiveRegs.empty()) { - LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy); - LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy); + for (MCPhysReg Reg : ScratchSGPRs) + LiveRegs.addReg(Reg); } } @@ -910,24 +1014,20 @@ Add->getOperand(3).setIsDead(); // Mark SCC as dead. } - assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy || - FuncInfo->FramePointerSaveIndex)) && + bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg); + assert((!HasFP || FPSaved) && "Needed to save FP but didn't save it anywhere"); // If we allow spilling to AGPRs we may have saved FP but then spill // everything into AGPRs instead of the stack. - assert((HasFP || (!FuncInfo->SGPRForFPSaveRestoreCopy && - !FuncInfo->FramePointerSaveIndex) || - EnableSpillVGPRToAGPR) && + assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) && "Saved FP but didn't need it"); - assert((!HasBP || (FuncInfo->SGPRForBPSaveRestoreCopy || - FuncInfo->BasePointerSaveIndex)) && + bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(BasePtrReg); + assert((!HasBP || BPSaved) && "Needed to save BP but didn't save it anywhere"); - assert((HasBP || (!FuncInfo->SGPRForBPSaveRestoreCopy && - !FuncInfo->BasePointerSaveIndex)) && - "Saved BP but didn't need it"); + assert((HasBP || !BPSaved) && "Saved BP but didn't need it"); } void SIFrameLowering::emitEpilogue(MachineFunction &MF, @@ -938,7 +1038,6 @@ const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); - MachineRegisterInfo &MRI = MF.getRegInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); LivePhysRegs LiveRegs; // Get the insert location for the epilogue. If there were no terminators in @@ -959,12 +1058,6 @@ ? NumBytes + MFI.getMaxAlign().value() : NumBytes; const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); - const Register FramePtrReg = FuncInfo->getFrameOffsetReg(); - const Register BasePtrReg = - TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register(); - - std::optional FPSaveIndex = FuncInfo->FramePointerSaveIndex; - std::optional BPSaveIndex = FuncInfo->BasePointerSaveIndex; if (RoundedSize != 0 && hasFP(MF)) { auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg) @@ -974,56 +1067,10 @@ Add->getOperand(3).setIsDead(); // Mark SCC as dead. } - if (FuncInfo->SGPRForFPSaveRestoreCopy) { - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) - .addReg(FuncInfo->SGPRForFPSaveRestoreCopy) - .setMIFlag(MachineInstr::FrameDestroy); - } - - if (FuncInfo->SGPRForBPSaveRestoreCopy) { - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg) - .addReg(FuncInfo->SGPRForBPSaveRestoreCopy) - .setMIFlag(MachineInstr::FrameDestroy); - } - - auto RestoreSGPRFromMemory = [&](Register Reg, const int FI) { - initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); - MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( - MRI, LiveRegs, AMDGPU::VGPR_32RegClass); - if (!TmpVGPR) - report_fatal_error("failed to find free scratch register"); - buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, TmpVGPR, - FI); - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), Reg) - .addReg(TmpVGPR, RegState::Kill); - }; - - auto RestoreSGPRFromVGPRLane = [&](Register Reg, const int FI) { - assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); - ArrayRef Spill = - FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI); - assert(Spill.size() == 1); - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), Reg) - .addReg(Spill[0].VGPR) - .addImm(Spill[0].Lane); - }; - - if (FPSaveIndex) { - const int FramePtrFI = *FPSaveIndex; - assert(!MFI.isDeadObjectIndex(FramePtrFI)); - if (spilledToMemory(MF, FramePtrFI)) - RestoreSGPRFromMemory(FramePtrReg, FramePtrFI); - else - RestoreSGPRFromVGPRLane(FramePtrReg, FramePtrFI); - } - - if (BPSaveIndex) { - const int BasePtrFI = *BPSaveIndex; - assert(!MFI.isDeadObjectIndex(BasePtrFI)); - if (spilledToMemory(MF, BasePtrFI)) - RestoreSGPRFromMemory(BasePtrReg, BasePtrFI); - else - RestoreSGPRFromVGPRLane(BasePtrReg, BasePtrFI); + for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) { + PrologEpilogSGPRSpillBuilder SB(Spill.first, Spill.second, MBB, MBBI, DL, + TII, TRI, LiveRegs); + SB.restore(); } Register ScratchExecCopy; @@ -1055,8 +1102,7 @@ I != E; ++I) { if (!MFI.isDeadObjectIndex(I) && MFI.getStackID(I) == TargetStackID::SGPRSpill && - (I != FuncInfo->FramePointerSaveIndex && - I != FuncInfo->BasePointerSaveIndex)) { + !FuncInfo->checkIndexInPrologEpilogSGPRSpills(I)) { return false; } } @@ -1215,6 +1261,49 @@ } } +// The special SGPR spills like the one needed for FP, BP or any reserved +// registers delayed until frame lowering. +void SIFrameLowering::determinePrologEpilogSGPRSaves( + MachineFunction &MF, BitVector &SavedVGPRs) const { + MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + SIMachineFunctionInfo *MFI = MF.getInfo(); + const GCNSubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + LivePhysRegs LiveRegs; + LiveRegs.init(*TRI); + // Initially mark callee saved registers as used so we will not choose them + // while looking for scratch SGPRs. + const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs(); + for (unsigned I = 0; CSRegs[I]; ++I) + LiveRegs.addReg(CSRegs[I]); + + // hasFP only knows about stack objects that already exist. We're now + // determining the stack slots that will be created, so we have to predict + // them. Stack objects force FP usage with calls. + // + // Note a new VGPR CSR may be introduced if one is used for the spill, but we + // don't want to report it here. + // + // FIXME: Is this really hasReservedCallFrame? + const bool WillHaveFP = + FrameInfo.hasCalls() && + (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo)); + + if (WillHaveFP || hasFP(MF)) { + Register FramePtrReg = MFI->getFrameOffsetReg(); + assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) && + "Re-reserving spill slot for FP"); + getVGPRSpillLaneOrTempRegister(MF, LiveRegs, FramePtrReg); + } + + if (TRI->hasBasePointer(MF)) { + Register BasePtrReg = TRI->getBaseRegister(); + assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) && + "Re-reserving spill slot for BP"); + getVGPRSpillLaneOrTempRegister(MF, LiveRegs, BasePtrReg); + } +} + // Only report VGPRs to generic code. void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedVGPRs, @@ -1224,7 +1313,6 @@ if (MFI->isEntryFunction()) return; - MachineFrameInfo &FrameInfo = MF.getFrameInfo(); const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -1256,43 +1344,13 @@ if (!ST.hasGFX90AInsts()) SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask()); - // hasFP only knows about stack objects that already exist. We're now - // determining the stack slots that will be created, so we have to predict - // them. Stack objects force FP usage with calls. - // - // Note a new VGPR CSR may be introduced if one is used for the spill, but we - // don't want to report it here. - // - // FIXME: Is this really hasReservedCallFrame? - const bool WillHaveFP = - FrameInfo.hasCalls() && - (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo)); + determinePrologEpilogSGPRSaves(MF, SavedVGPRs); // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't // allow the default insertion to handle them. for (auto &Reg : MFI->getWWMSpills()) SavedVGPRs.reset(Reg.first); - LivePhysRegs LiveRegs; - LiveRegs.init(*TRI); - - if (WillHaveFP || hasFP(MF)) { - assert(!MFI->SGPRForFPSaveRestoreCopy && !MFI->FramePointerSaveIndex && - "Re-reserving spill slot for FP"); - getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForFPSaveRestoreCopy, - MFI->FramePointerSaveIndex, true); - } - - if (TRI->hasBasePointer(MF)) { - if (MFI->SGPRForFPSaveRestoreCopy) - LiveRegs.addReg(MFI->SGPRForFPSaveRestoreCopy); - - assert(!MFI->SGPRForBPSaveRestoreCopy && - !MFI->BasePointerSaveIndex && "Re-reserving spill slot for BP"); - getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForBPSaveRestoreCopy, - MFI->BasePointerSaveIndex, false); - } - // Mark all lane VGPRs as BB LiveIns. for (MachineBasicBlock &MBB : MF) { for (auto &Reg : MFI->getWWMSpills()) @@ -1354,29 +1412,31 @@ return true; // Early exit if no callee saved registers are modified! const SIMachineFunctionInfo *FuncInfo = MF.getInfo(); - if (!FuncInfo->SGPRForFPSaveRestoreCopy && - !FuncInfo->SGPRForBPSaveRestoreCopy) - return false; - const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *RI = ST.getRegisterInfo(); Register FramePtrReg = FuncInfo->getFrameOffsetReg(); Register BasePtrReg = RI->getBaseRegister(); + Register SGPRForFPSaveRestoreCopy = + FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg); + Register SGPRForBPSaveRestoreCopy = + FuncInfo->getScratchSGPRCopyDstReg(BasePtrReg); + if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy) + return false; + unsigned NumModifiedRegs = 0; - if (FuncInfo->SGPRForFPSaveRestoreCopy) + if (SGPRForFPSaveRestoreCopy) NumModifiedRegs++; - if (FuncInfo->SGPRForBPSaveRestoreCopy) + if (SGPRForBPSaveRestoreCopy) NumModifiedRegs++; for (auto &CS : CSI) { - if (CS.getReg() == FramePtrReg && FuncInfo->SGPRForFPSaveRestoreCopy) { - CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy); + if (CS.getReg() == FramePtrReg && SGPRForFPSaveRestoreCopy) { + CS.setDstReg(SGPRForFPSaveRestoreCopy); if (--NumModifiedRegs) break; - } else if (CS.getReg() == BasePtrReg && - FuncInfo->SGPRForBPSaveRestoreCopy) { - CS.setDstReg(FuncInfo->SGPRForBPSaveRestoreCopy); + } else if (CS.getReg() == BasePtrReg && SGPRForBPSaveRestoreCopy) { + CS.setDstReg(SGPRForBPSaveRestoreCopy); if (--NumModifiedRegs) break; } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -321,6 +321,35 @@ } // end namespace yaml +// A CSR SGPR value can be preserved inside a callee using one of the following +// methods. +// 1. Copy to an unused scratch SGPR. +// 2. Spill to a VGPR lane. +// 3. Spill to memory via. a scratch VGPR. +// class PrologEpilogSGPRSaveRestoreInfo represents the save/restore method used +// for an SGPR at function prolog/epilog. +enum class SGPRSaveKind : uint8_t { + COPY_TO_SCRATCH_SGPR, + SPILL_TO_VGPR_LANE, + SPILL_TO_MEM +}; + +class PrologEpilogSGPRSaveRestoreInfo { + SGPRSaveKind Kind; + union { + int Index; + Register Reg; + }; + +public: + PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind K, int I) : Kind(K), Index(I) {} + PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind K, Register R) + : Kind(K), Reg(R) {} + Register getReg() const { return Reg; } + int getIndex() const { return Index; } + SGPRSaveKind getKind() const { return Kind; } +}; + /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which /// tells the hardware which interpolation parameters to load. class SIMachineFunctionInfo final : public AMDGPUMachineFunction { @@ -464,6 +493,14 @@ // the serialization easier. ReservedRegSet WWMReservedRegs; + using PrologEpilogSGPRSpillsMap = + DenseMap; + // To track the SGPR spill method used for a CSR SGPR register during + // frame lowering. Even though the SGPR spills are handled during + // SILowerSGPRSpills pass, some special handling needed later during the + // PrologEpilogInserter. + PrologEpilogSGPRSpillsMap PrologEpilogSGPRSpills; + DenseMap VGPRToAGPRSpills; // AGPRs used for VGPR spills. @@ -493,17 +530,6 @@ VGPRForAGPRCopy = NewVGPRForAGPRCopy; } -public: // FIXME - /// If this is set, an SGPR used for save/restore of the register used for the - /// frame pointer. - Register SGPRForFPSaveRestoreCopy; - std::optional FramePointerSaveIndex; - - /// If this is set, an SGPR used for save/restore of the register used for the - /// base pointer. - Register SGPRForBPSaveRestoreCopy; - std::optional BasePointerSaveIndex; - bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg); public: @@ -538,6 +564,50 @@ const WWMSpillsMap &getWWMSpills() const { return WWMSpills; } const ReservedRegSet &getWWMReservedRegs() const { return WWMReservedRegs; } + const PrologEpilogSGPRSpillsMap &getPrologEpilogSGPRSpills() const { + return PrologEpilogSGPRSpills; + } + + void addToPrologEpilogSGPRSpills(Register Reg, + PrologEpilogSGPRSaveRestoreInfo SI) { + PrologEpilogSGPRSpills.insert(std::make_pair(Reg, SI)); + } + + // Check if an entry created for \p Reg in PrologEpilogSGPRSpills. Return true + // on success and false otherwise. + bool hasPrologEpilogSGPRSpillEntry(Register Reg) const { + return PrologEpilogSGPRSpills.find(Reg) != PrologEpilogSGPRSpills.end(); + } + + // Get the scratch SGPR if allocated to save/restore \p Reg. + Register getScratchSGPRCopyDstReg(Register Reg) const { + auto I = PrologEpilogSGPRSpills.find(Reg); + if (I != PrologEpilogSGPRSpills.end() && + I->second.getKind() == SGPRSaveKind::COPY_TO_SCRATCH_SGPR) + return I->second.getReg(); + + return AMDGPU::NoRegister; + } + + // Get all scratch SGPRs allocated to copy/restore the SGPR spills. + void getAllScratchSGPRCopyDstRegs(SmallVectorImpl &Regs) const { + for (const auto &SI : PrologEpilogSGPRSpills) { + if (SI.second.getKind() == SGPRSaveKind::COPY_TO_SCRATCH_SGPR) + Regs.push_back(SI.second.getReg()); + } + } + + // Check if \p FI is allocated for any SGPR spill to a VGPR lane during PEI. + bool checkIndexInPrologEpilogSGPRSpills(int FI) const { + return find_if(PrologEpilogSGPRSpills, + [FI](const std::pair &SI) { + return SI.second.getKind() == + SGPRSaveKind::SPILL_TO_VGPR_LANE && + SI.second.getIndex() == FI; + }) != PrologEpilogSGPRSpills.end(); + } + ArrayRef getPrologEpilogSGPRSpillToVGPRLanes(int FrameIndex) const { auto I = PrologEpilogSGPRSpillToVGPRLanes.find(FrameIndex); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -472,13 +472,13 @@ bool HaveSGPRToMemory = false; if (ResetSGPRSpillStackIDs) { - // All other SPGRs must be allocated on the default stack, so reset the + // All other SGPRs must be allocated on the default stack, so reset the // stack ID. - for (int i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); i != e; - ++i) { - if (i != FramePointerSaveIndex && i != BasePointerSaveIndex) { - if (MFI.getStackID(i) == TargetStackID::SGPRSpill) { - MFI.setStackID(i, TargetStackID::Default); + for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E; + ++I) { + if (!checkIndexInPrologEpilogSGPRSpills(I)) { + if (MFI.getStackID(I) == TargetStackID::SGPRSpill) { + MFI.setStackID(I, TargetStackID::Default); HaveSGPRToMemory = true; } } diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -159,11 +159,11 @@ ; GCN: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword [[VGPR_REG_1:v[0-9]+]], off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-DAG: v_writelane_b32 [[VGPR_REG_1]], s34, 1 ; GCN-NEXT: v_writelane_b32 [[VGPR_REG_1]], s33, 0 ; GCN-DAG: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0 ; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000 ; GCN: v_mov_b32_e32 v32, 0 -; GCN-DAG: v_writelane_b32 [[VGPR_REG_1]], s34, 1 ; GCN: s_mov_b32 s34, s32 ; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -175,8 +175,8 @@ ; GCN: v_readlane_b32 s31, [[VGPR_REG]], 1 ; GCN: v_readlane_b32 s30, [[VGPR_REG]], 0 ; GCN: s_add_i32 s32, s32, 0xfffd0000 -; GCN-NEXT: v_readlane_b32 s33, [[VGPR_REG_1]], 0 ; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG_1]], 1 +; GCN-NEXT: v_readlane_b32 s33, [[VGPR_REG_1]], 0 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword [[VGPR_REG_1]], off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload @@ -197,8 +197,8 @@ ; The BP value will get saved/restored in an SGPR at the prolgoue/epilogue. ; GCN-LABEL: needs_align1024_stack_args_used_inside_loop: -; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_mov_b32 [[BP_COPY:s[0-9]+]], s34 +; GCN: s_mov_b32 [[BP_COPY:s[0-9]+]], s34 +; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 ; GCN-NEXT: s_add_i32 s33, s32, 0xffc0 ; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: s_and_b32 s33, s33, 0xffff0000 @@ -209,8 +209,8 @@ ; GCN: buffer_load_dword v{{[0-9]+}}, [[VGPR_REG]], s[0:3], 0 offen ; GCN: v_add_u32_e32 [[VGPR_REG]], vcc, 4, [[VGPR_REG]] ; GCN: s_add_i32 s32, s32, 0xfffd0000 -; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] ; GCN-NEXT: s_mov_b32 s34, [[BP_COPY]] +; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] ; GCN-NEXT: s_setpc_b64 s[30:31] begin: %local_var = alloca i32, align 1024, addrspace(5) @@ -262,9 +262,9 @@ ; GCN-LABEL: no_free_regs_spill_bp_to_mem ; GCN: s_or_saveexec_b64 s[4:5], -1 -; GCN: v_mov_b32_e32 v0, s33 -; GCN: buffer_store_dword v0, off, s[0:3], s32 ; GCN: v_mov_b32_e32 v0, s34 +; GCN: buffer_store_dword v0, off, s[0:3], s32 +; GCN: v_mov_b32_e32 v0, s33 ; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 %local_val = alloca i32, align 128, addrspace(5) store volatile i32 %b, ptr addrspace(5) %local_val, align 128 @@ -298,13 +298,13 @@ ; GCN-NEXT: s_add_i32 s6, s32, 0x42100 ; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s6 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, s33 +; GCN-NEXT: v_mov_b32_e32 v0, s34 ; GCN-NOT: v_mov_b32_e32 v0, 0x1088 -; GCN-NEXT: s_add_i32 s6, s32, 0x42200 +; GCN-NEXT: s_add_i32 s6, s32, 0x42300 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v0, s34 +; GCN-NEXT: v_mov_b32_e32 v0, s33 ; GCN-NOT: v_mov_b32_e32 v0, 0x108c -; GCN-NEXT: s_add_i32 s6, s32, 0x42300 +; GCN-NEXT: s_add_i32 s6, s32, 0x42200 ; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill %local_val = alloca i32, align 128, addrspace(5)