diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -55,26 +55,23 @@ MachineBasicBlock::iterator MI) const override; private: - void emitFlatScratchInit(const GCNSubtarget &ST, - MachineFunction &MF, - MachineBasicBlock &MBB) const; - - unsigned getReservedPrivateSegmentBufferReg( - const GCNSubtarget &ST, - const SIInstrInfo *TII, - const SIRegisterInfo *TRI, - SIMachineFunctionInfo *MFI, - MachineFunction &MF) const; - - std::pair getReservedPrivateSegmentWaveByteOffsetReg( - const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, - SIMachineFunctionInfo *MFI, MachineFunction &MF) const; - - // Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set. - void emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MachineFunction &MF, - MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI, - MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg, - unsigned ScratchRsrcReg) const; + void emitEntryFunctionFlatScratchInit(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + DebugLoc DL, + Register ScratchWaveOffsetReg) const; + + Register getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF) const; + + Register + getEntryFunctionReservedScratchWaveOffsetReg(MachineFunction &MF) const; + + void emitEntryFunctionScratchRsrcRegSetup(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + DebugLoc DL, + Register PreloadedPrivateBufferReg, + Register ScratchRsrcReg) const; public: bool hasFP(const MachineFunction &MF) const override; diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -184,11 +184,13 @@ .addMemOperand(MMO); } -void SIFrameLowering::emitFlatScratchInit(const 
GCNSubtarget &ST, - MachineFunction &MF, - MachineBasicBlock &MBB) const { +// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()` +void SIFrameLowering::emitEntryFunctionFlatScratchInit( + MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + DebugLoc DL, Register ScratchWaveOffsetReg) const { + const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); - const SIRegisterInfo* TRI = &TII->getRegisterInfo(); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); const SIMachineFunctionInfo *MFI = MF.getInfo(); // We don't need this if we only have spills since there is no user facing @@ -201,11 +203,6 @@ // pointer. Because we only detect if flat instructions are used at all, // this will be used more often than necessary on VI. - // Debug location must be unknown since the first debug location is used to - // determine the end of the prologue. - DebugLoc DL; - MachineBasicBlock::iterator I = MBB.begin(); - Register FlatScratchInitReg = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); @@ -216,8 +213,6 @@ Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); - unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); - // Do a 64-bit pointer add. if (ST.flatScratchIsPointer()) { if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { @@ -266,16 +261,20 @@ .addImm(8); } -unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( - const GCNSubtarget &ST, - const SIInstrInfo *TII, - const SIRegisterInfo *TRI, - SIMachineFunctionInfo *MFI, - MachineFunction &MF) const { +// Shift down registers reserved for the scratch RSRC. 
+Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg( + MachineFunction &MF) const { + + const GCNSubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); + SIMachineFunctionInfo *MFI = MF.getInfo(); + + assert(MFI->isEntryFunction()); - // We need to insert initialization of the scratch resource descriptor. unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); + if (ScratchRsrcReg == AMDGPU::NoRegister || !MRI.isPhysRegUsed(ScratchRsrcReg)) return AMDGPU::NoRegister; @@ -315,29 +314,35 @@ } // Shift down registers reserved for the scratch wave offset. -std::pair -SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( - const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, - SIMachineFunctionInfo *MFI, MachineFunction &MF) const { +Register SIFrameLowering::getEntryFunctionReservedScratchWaveOffsetReg( + MachineFunction &MF) const { + + const GCNSubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + SIMachineFunctionInfo *MFI = MF.getInfo(); assert(MFI->isEntryFunction()); - // No replacement necessary. 
+ unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + if (ScratchWaveOffsetReg == AMDGPU::NoRegister || - (!hasFP(MF) && !MRI.isPhysRegUsed(ScratchWaveOffsetReg))) { - return std::make_pair(AMDGPU::NoRegister, false); + (!MRI.isPhysRegUsed(ScratchWaveOffsetReg) && !hasFP(MF) && + !MFI->hasFlatScratchInit())) { + assert(!hasFP(MF) && !MFI->hasFlatScratchInit()); + return AMDGPU::NoRegister; } - if (ST.hasSGPRInitBug()) - return std::make_pair(ScratchWaveOffsetReg, false); + if (ST.hasSGPRInitBug() || + ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) + return ScratchWaveOffsetReg; unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); ArrayRef AllSGPRs = getAllSGPRs(ST, MF); if (NumPreloaded > AllSGPRs.size()) - return std::make_pair(ScratchWaveOffsetReg, false); + return ScratchWaveOffsetReg; AllSGPRs = AllSGPRs.slice(NumPreloaded); @@ -358,90 +363,78 @@ unsigned ReservedRegCount = 13; if (AllSGPRs.size() < ReservedRegCount) - return std::make_pair(ScratchWaveOffsetReg, false); - - bool HandledScratchWaveOffsetReg = - ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); - bool FPAdjusted = false; + return ScratchWaveOffsetReg; for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) { // Pick the first unallocated SGPR. Be careful not to pick an alias of the // scratch descriptor, since we haven’t added its uses yet. 
if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) { - if (!HandledScratchWaveOffsetReg) { - HandledScratchWaveOffsetReg = true; - - MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); - if (MFI->getScratchWaveOffsetReg() == MFI->getStackPtrOffsetReg()) { - assert(!hasFP(MF)); - MFI->setStackPtrOffsetReg(Reg); - } - - MFI->setScratchWaveOffsetReg(Reg); - MFI->setFrameOffsetReg(Reg); - ScratchWaveOffsetReg = Reg; - FPAdjusted = true; - break; + MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); + if (MFI->getScratchWaveOffsetReg() == MFI->getStackPtrOffsetReg()) { + assert(!hasFP(MF)); + MFI->setStackPtrOffsetReg(Reg); } + MFI->setScratchWaveOffsetReg(Reg); + MFI->setFrameOffsetReg(Reg); + return Reg; } } - return std::make_pair(ScratchWaveOffsetReg, FPAdjusted); + return ScratchWaveOffsetReg; } void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); - SIMachineFunctionInfo *MFI = MF.getInfo(); + // FIXME: If we only have SGPR spills, we won't actually be using scratch + // memory since these spill to VGPRs. We should be cleaning up these unused + // SGPR spill frame indices somewhere. - // If we only have SGPR spills, we won't actually be using scratch memory - // since these spill to VGPRs. - // - // FIXME: We should be cleaning up these unused SGPR spill frame indices - // somewhere. + // FIXME: We still have implicit uses on SGPR spill instructions in case they + // need to spill to vector memory. It's likely that will not happen, but at + // this point it appears we need the setup. This part of the prolog should be + // emitted after frame indices are eliminated. 
+ + // FIXME: Remove all of the isPhysRegUsed checks + SIMachineFunctionInfo *MFI = MF.getInfo(); const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); const Function &F = MF.getFunction(); + assert(MFI->isEntryFunction()); + // We need to do the replacement of the private segment buffer and wave offset // register even if there are no stack objects. There could be stores to undef // or a constant without an associated object. + // + // These calls will return `AMDGPU::NoRegister` in cases where there are no + // actual uses of the respective registers. + Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF); + Register ScratchWaveOffsetReg = + getEntryFunctionReservedScratchWaveOffsetReg(MF); - // FIXME: We still have implicit uses on SGPR spill instructions in case they - // need to spill to vector memory. It's likely that will not happen, but at - // this point it appears we need the setup. This part of the prolog should be - // emitted after frame indices are eliminated. - - if (MFI->hasFlatScratchInit()) - emitFlatScratchInit(ST, MF, MBB); + // Make the selected registers live throughout the function. + for (MachineBasicBlock &OtherBB : MF) { + if (&OtherBB == &MBB) + continue; - unsigned ScratchRsrcReg - = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF); + if (ScratchWaveOffsetReg != AMDGPU::NoRegister) + OtherBB.addLiveIn(ScratchWaveOffsetReg); - unsigned ScratchWaveOffsetReg; - bool FPAdjusted; - std::tie(ScratchWaveOffsetReg, FPAdjusted) = - getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF); + if (ScratchRsrcReg != AMDGPU::NoRegister) + OtherBB.addLiveIn(ScratchRsrcReg); + } - // We need to insert initialization of the scratch resource descriptor. + // Now that we have fixed the reserved registers we need to locate the + // (potentially) preloaded registers. 
We should always have a preloaded + // scratch wave offset register, but we only have a preloaded scratch rsrc + // register for HSA. Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); - - unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; - if (ST.isAmdHsaOrMesa(F)) { - PreloadedPrivateBufferReg = MFI->getPreloadedReg( - AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); - } - - bool OffsetRegUsed = ScratchWaveOffsetReg != AMDGPU::NoRegister && - MRI.isPhysRegUsed(ScratchWaveOffsetReg); - bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister && - MRI.isPhysRegUsed(ScratchRsrcReg); - // FIXME: Hack to not crash in situations which emitted an error. if (PreloadedScratchWaveOffsetReg == AMDGPU::NoRegister) return; @@ -451,72 +444,59 @@ MRI.addLiveIn(PreloadedScratchWaveOffsetReg); MBB.addLiveIn(PreloadedScratchWaveOffsetReg); - if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) { - assert(ST.isAmdHsaOrMesa(F) || ST.isMesaGfxShader(F)); - MRI.addLiveIn(PreloadedPrivateBufferReg); - MBB.addLiveIn(PreloadedPrivateBufferReg); - } - - // Make the register selected live throughout the function. - for (MachineBasicBlock &OtherBB : MF) { - if (&OtherBB == &MBB) - continue; - - if (OffsetRegUsed || FPAdjusted) - OtherBB.addLiveIn(ScratchWaveOffsetReg); - - if (ResourceRegUsed) - OtherBB.addLiveIn(ScratchRsrcReg); + Register PreloadedScratchRsrcReg = AMDGPU::NoRegister; + if (ST.isAmdHsaOrMesa(F)) { + PreloadedScratchRsrcReg = + MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); + if (ScratchRsrcReg != AMDGPU::NoRegister && + PreloadedScratchRsrcReg != AMDGPU::NoRegister) { + MRI.addLiveIn(PreloadedScratchRsrcReg); + MBB.addLiveIn(PreloadedScratchRsrcReg); + } } DebugLoc DL; MachineBasicBlock::iterator I = MBB.begin(); - // If we reserved the original input registers, we don't need to copy to the - // reserved registers. 
+ const bool HasFP = hasFP(MF); - bool CopyBuffer = ResourceRegUsed && - PreloadedPrivateBufferReg != AMDGPU::NoRegister && - ST.isAmdHsaOrMesa(F) && - ScratchRsrcReg != PreloadedPrivateBufferReg; + // If we are not HSA or we happened to reserve the original input registers, + // we don't need to copy to the reserved registers. + const bool CopyBuffer = ST.isAmdHsaOrMesa(F) && + ScratchRsrcReg != AMDGPU::NoRegister && + PreloadedScratchRsrcReg != AMDGPU::NoRegister && + ScratchRsrcReg != PreloadedScratchRsrcReg; // This needs to be careful of the copying order to avoid overwriting one of // the input registers before it's been copied to it's final // destination. Usually the offset should be copied first. - bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg, - ScratchWaveOffsetReg); - if (CopyBuffer && CopyBufferFirst) { - BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg) - .addReg(PreloadedPrivateBufferReg, RegState::Kill); - } + const bool CopyBufferFirst = + TRI->isSubRegisterEq(PreloadedScratchRsrcReg, ScratchWaveOffsetReg); - unsigned SPReg = MFI->getStackPtrOffsetReg(); - assert(SPReg != AMDGPU::SP_REG); - - // FIXME: Remove the isPhysRegUsed checks - const bool HasFP = hasFP(MF); + if (CopyBuffer && CopyBufferFirst) + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg) + .addReg(PreloadedScratchRsrcReg, RegState::Kill); - if (HasFP || OffsetRegUsed) { - assert(ScratchWaveOffsetReg); + if (ScratchWaveOffsetReg != AMDGPU::NoRegister) BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg) - .addReg(PreloadedScratchWaveOffsetReg, HasFP ? 
RegState::Kill : 0); - if (CopyBuffer && !CopyBufferFirst) { + if (CopyBuffer && !CopyBufferFirst) BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg) - .addReg(PreloadedPrivateBufferReg, RegState::Kill); - } + .addReg(PreloadedScratchRsrcReg, RegState::Kill); - if (ResourceRegUsed) { - emitEntryFunctionScratchSetup(ST, MF, MBB, MFI, I, - PreloadedPrivateBufferReg, ScratchRsrcReg); - } + // FIXME: This should also implement the setup path for HSA. + if (ScratchRsrcReg != AMDGPU::NoRegister) + emitEntryFunctionScratchRsrcRegSetup( + MF, MBB, I, DL, PreloadedScratchRsrcReg, ScratchRsrcReg); if (HasFP) { - DebugLoc DL; const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); int64_t StackSize = FrameInfo.getStackSize(); + unsigned SPReg = MFI->getStackPtrOffsetReg(); + assert(SPReg != AMDGPU::SP_REG); + // On kernel entry, the private scratch wave offset is the SP value. if (StackSize == 0) { BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), SPReg) @@ -527,18 +507,23 @@ .addImm(StackSize * ST.getWavefrontSize()); } } + + if (MFI->hasFlatScratchInit()) + emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, + MFI->getScratchWaveOffsetReg()); } -// Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set. 
-void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, - MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI, - MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg, - unsigned ScratchRsrcReg) const { +// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoRegister` +void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( + MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + DebugLoc DL, Register PreloadedScratchRsrcReg, + Register ScratchRsrcReg) const { + const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + const SIMachineFunctionInfo *MFI = MF.getInfo(); const Function &Fn = MF.getFunction(); - DebugLoc DL; if (ST.isAmdPalOS()) { // The pointer to the GIT is formed from the offset passed in and either @@ -595,10 +580,8 @@ .addImm(0) // dlc .addReg(ScratchRsrcReg, RegState::ImplicitDefine) .addMemOperand(MMO); - return; - } - if (ST.isMesaGfxShader(Fn) - || (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) { + } else if (ST.isMesaGfxShader(Fn) || + (PreloadedScratchRsrcReg == AMDGPU::NoRegister)) { assert(!ST.isAmdHsaOrMesa(Fn)); const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);