diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -10,11 +10,11 @@ #define LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H #include "AMDGPUFrameLowering.h" +#include "SIMachineFunctionInfo.h" namespace llvm { class SIInstrInfo; -class SIMachineFunctionInfo; class SIRegisterInfo; class GCNSubtarget; @@ -82,6 +82,14 @@ bool hasFP(const MachineFunction &MF) const override; bool requiresStackPointerReference(const MachineFunction &MF) const; + + /// Emit the RA/EXEC saves (used by '-amdgpu-spill-cfi-saved-regs') and their + /// CFI rules: memory slots if emitSpillsToMem is set, VGPR lanes otherwise. + void emitCFISavedRegSpills(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + LivePhysRegs &LiveRegs, Register &ScratchExecCopy, + bool emitSpillsToMem) const; + /// Create a CFI index for CFIInst and build a MachineInstr around it. void buildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const MCCFIInstruction &CFIInst) const; @@ -91,6 +99,12 @@ MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const Register SGPR, const Register VGPR, const int Lane) const; + /// Create a CFI index describing a spill of an SGPR to multiple lanes of + /// VGPRs and build a MachineInstr around it. + void buildCFIForSGPRToVGPRSpill( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register SGPR, + ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills) const; /// Create a CFI index describing a spill of a VGPR to VMEM and /// build a MachineInstr around it.
void buildCFIForVGPRToVMEMSpill(MachineBasicBlock &MBB, diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -141,11 +141,13 @@ MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const SIInstrInfo *TII, Register SpillReg, - Register ScratchRsrcReg, Register SPReg, int FI) { + Register ScratchRsrcReg, Register SPReg, int FI, + int DwordOff) { MachineFunction *MF = MBB.getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); - int64_t Offset = MFI.getObjectOffset(FI); + // 'DwordOff' is the byte offset of the lower/upper dword within object FI. + int64_t Offset = MFI.getObjectOffset(FI) + DwordOff; MachineMemOperand *MMO = MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4, @@ -778,6 +780,166 @@ return ScratchExecCopy; } +// A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR. +// Otherwise we are spilling to memory. +static bool spilledToMemory(const MachineFunction &MF, int SaveIndex) { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + return MFI.getStackID(SaveIndex) != TargetStackID::SGPRSpill; +} + +// Emit the RA and EXEC saves for the non-kernel functions in the +// prologue and the corresponding CFI rules.
+void SIFrameLowering::emitCFISavedRegSpills(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + LivePhysRegs &LiveRegs, + Register &ScratchExecCopy, + bool emitSpillsToMem) const { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + const MCRegisterInfo *MCRI = MF.getMMI().getContext().getRegisterInfo(); + + SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); + + Optional<int> RASaveIndex = FuncInfo->ReturnAddressSaveIndex; + Optional<int> EXECSaveIndex = FuncInfo->EXECSaveIndex; + Register RetAddrReg = TRI.getReturnAddressReg(MF); + DebugLoc DL; + + if (emitSpillsToMem) { + // Return address is being spilled into memory at the frame + // index RASaveIndex and consumes two double words. And + // build the corresponding CFI rule. + if (RASaveIndex && spilledToMemory(MF, *RASaveIndex)) { + const int FI = *RASaveIndex; + assert(!MFI.isDeadObjectIndex(FI)); + + if (!ScratchExecCopy) + ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); + + MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( + MRI, LiveRegs, AMDGPU::VGPR_32RegClass); + + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) + .addReg(TRI.getSubReg(RetAddrReg, AMDGPU::sub0)); + + int DwordOff = 0; + buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, + FuncInfo->getScratchRSrcReg(), StackPtrReg, FI, + DwordOff); + + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) + .addReg(TRI.getSubReg(RetAddrReg, AMDGPU::sub1)); + + DwordOff = 4; + buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, + FuncInfo->getScratchRSrcReg(), StackPtrReg, FI, + DwordOff); + + buildCFI(MBB, MBBI, DL, + MCCFIInstruction::createOffset( + nullptr, MCRI->getDwarfRegNum(AMDGPU::PC_REG, false), + MFI.getObjectOffset(FI) *
+ ST.getWavefrontSize())); + } + + // Spill the saved EXEC mask to memory at frame index EXECSaveIndex (two + // dwords in wave64, one in wave32) and build the corresponding CFI rule. + // NOTE(review): in wave32 ScratchExecCopy is presumably a 32-bit SGPR, so + // taking its sub0 below looks suspect; confirm (only wave64 is tested). + if (EXECSaveIndex && spilledToMemory(MF, *EXECSaveIndex)) { + const int FI = *EXECSaveIndex; + assert(!MFI.isDeadObjectIndex(FI)); + + if (!ScratchExecCopy) + ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); + + MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( + MRI, LiveRegs, AMDGPU::VGPR_32RegClass); + + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) + .addReg(TRI.getSubReg(ScratchExecCopy, AMDGPU::sub0)); + + int DwordOff = 0; + buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, + FuncInfo->getScratchRSrcReg(), StackPtrReg, FI, + DwordOff); + + if (!ST.isWave32()) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) + .addReg(TRI.getSubReg(ScratchExecCopy, AMDGPU::sub1)); + + DwordOff = 4; + buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, + FuncInfo->getScratchRSrcReg(), StackPtrReg, FI, + DwordOff); + } + + buildCFI(MBB, MBBI, DL, + MCCFIInstruction::createOffset( + nullptr, MCRI->getDwarfRegNum(AMDGPU::EXEC, false), + MFI.getObjectOffset(FI) * ST.getWavefrontSize())); + } + } + + if (!emitSpillsToMem) { + // Return address is being spilled into free VGPR lanes + // and consumes two lanes, build the corresponding CFI rule.
+ if (RASaveIndex && !spilledToMemory(MF, *RASaveIndex)) { + MCRegister RetAddrReg = TRI.getReturnAddressReg(MF); + if (!MBB.isLiveIn(RetAddrReg)) + MBB.addLiveIn(RetAddrReg); + + ArrayRef<SIMachineFunctionInfo::SpilledReg> ReturnAddressSpill = + FuncInfo->getSGPRToVGPRSpills(*RASaveIndex); + assert(ReturnAddressSpill.size() == 2); + BuildMI(MBB, MBBI, DL, + TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), + ReturnAddressSpill[0].VGPR) + .addReg(TRI.getSubReg(RetAddrReg, AMDGPU::sub0)) + .addImm(ReturnAddressSpill[0].Lane) + .addReg(ReturnAddressSpill[0].VGPR, RegState::Undef); + BuildMI(MBB, MBBI, DL, + TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), + ReturnAddressSpill[1].VGPR) + .addReg(TRI.getSubReg(RetAddrReg, AMDGPU::sub1)) + .addImm(ReturnAddressSpill[1].Lane) + .addReg(ReturnAddressSpill[1].VGPR, RegState::Undef); + buildCFIForSGPRToVGPRSpill(MBB, MBBI, DL, AMDGPU::PC_REG, + ReturnAddressSpill); + } + + // EXEC mask is being spilled into free VGPR lanes and consumes + // two lanes in wave64 mode and one lane in wave32 mode, build + // the corresponding CFI rule.
+ if (EXECSaveIndex && !spilledToMemory(MF, *EXECSaveIndex)) { + ArrayRef<SIMachineFunctionInfo::SpilledReg> EXECSpill = + FuncInfo->getSGPRToVGPRSpills(*EXECSaveIndex); + assert(EXECSpill.size()); + BuildMI(MBB, MBBI, DL, + TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), + EXECSpill[0].VGPR) + .addReg(AMDGPU::EXEC_LO) + .addImm(EXECSpill[0].Lane) + .addReg(EXECSpill[0].VGPR, RegState::Undef); + if (!ST.isWave32()) { + assert(EXECSpill.size() == 2); + BuildMI(MBB, MBBI, DL, + TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), + EXECSpill[1].VGPR) + .addReg(AMDGPU::EXEC_HI) + .addImm(EXECSpill[1].Lane) + .addReg(EXECSpill[1].VGPR, RegState::Undef); + } + buildCFIForSGPRToVGPRSpill(MBB, MBBI, DL, AMDGPU::EXEC, EXECSpill); + } + } +} + +// Emit the CFI rules for non-kernel functions in the prologue void SIFrameLowering::emitPrologueEntryCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL) const { @@ -870,23 +1032,8 @@ emitPrologueEntryCFI(MBB, MBBI, DL); - bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue(); - bool SpillFPToMemory = false; - // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR. - // Otherwise we are spilling the FP to memory. - if (HasFPSaveIndex) { - SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) != - TargetStackID::SGPRSpill; - } - - bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue(); - bool SpillBPToMemory = false; - // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR. - // Otherwise we are spilling the BP to memory. - if (HasBPSaveIndex) { - SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) != - TargetStackID::SGPRSpill; - } + Optional<int> FPSaveIndex = FuncInfo->FramePointerSaveIndex; + Optional<int> BPSaveIndex = FuncInfo->BasePointerSaveIndex; // Emit the copy if we need an FP, and are using a free SGPR to save it.
if (FuncInfo->SGPRForFPSaveRestoreCopy) { @@ -940,9 +1087,10 @@ ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); int FI = Reg.FI.getValue(); + int DwordOff = 0; buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, Reg.VGPR, - FuncInfo->getScratchRSrcReg(), StackPtrReg, FI); + FuncInfo->getScratchRSrcReg(), StackPtrReg, FI, DwordOff); // We spill the entire VGPR, so we can get away with just cfi_offset buildCFI(MBB, MBBI, DL, @@ -951,8 +1099,14 @@ MFI.getObjectOffset(FI) * ST.getWavefrontSize())); } - if (HasFPSaveIndex && SpillFPToMemory) { - const int FI = *FuncInfo->FramePointerSaveIndex; + if (TRI.isCFISavedRegsSpillEnabled()) { + bool emitSpillsToMem = true; + emitCFISavedRegSpills(MF, MBB, MBBI, LiveRegs, ScratchExecCopy, + emitSpillsToMem); + } + + if (FPSaveIndex && spilledToMemory(MF, *FPSaveIndex)) { + const int FI = *FPSaveIndex; assert(!MFI.isDeadObjectIndex(FI)); if (!ScratchExecCopy) @@ -964,16 +1118,17 @@ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) .addReg(FramePtrReg); + int DwordOff = 0; buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, - FuncInfo->getScratchRSrcReg(), StackPtrReg, FI); + FuncInfo->getScratchRSrcReg(), StackPtrReg, FI, DwordOff); buildCFI(MBB, MBBI, DL, MCCFIInstruction::createOffset( nullptr, MCRI->getDwarfRegNum(FramePtrReg, false), MFI.getObjectOffset(FI) * ST.getWavefrontSize())); } - if (HasBPSaveIndex && SpillBPToMemory) { - const int BasePtrFI = *FuncInfo->BasePointerSaveIndex; + if (BPSaveIndex && spilledToMemory(MF, *BPSaveIndex)) { + const int BasePtrFI = *BPSaveIndex; assert(!MFI.isDeadObjectIndex(BasePtrFI)); if (!ScratchExecCopy) @@ -985,8 +1140,10 @@ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) .addReg(BasePtrReg); + int DwordOff = 0; buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, - FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI); + FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI, + DwordOff); buildCFI(MBB, MBBI, DL, 
MCCFIInstruction::createOffset( nullptr, MCRI->getDwarfRegNum(BasePtrReg, false), @@ -1002,9 +1159,15 @@ LiveRegs.addReg(ScratchExecCopy); } + if (TRI.isCFISavedRegsSpillEnabled()) { + bool emitSpillsToMem = false; + emitCFISavedRegSpills(MF, MBB, MBBI, LiveRegs, ScratchExecCopy, + emitSpillsToMem); + } + // In this case, spill the FP to a reserved VGPR. - if (HasFPSaveIndex && !SpillFPToMemory) { - const int FI = *FuncInfo->FramePointerSaveIndex; + if (FPSaveIndex && !spilledToMemory(MF, *FPSaveIndex)) { + const int FI = *FPSaveIndex; assert(!MFI.isDeadObjectIndex(FI)); assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); @@ -1025,8 +1188,8 @@ } // In this case, spill the BP to a reserved VGPR. - if (HasBPSaveIndex && !SpillBPToMemory) { - const int BasePtrFI = *FuncInfo->BasePointerSaveIndex; + if (BPSaveIndex && !spilledToMemory(MF, *BPSaveIndex)) { + const int BasePtrFI = *BPSaveIndex; assert(!MFI.isDeadObjectIndex(BasePtrFI)); assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill); @@ -1144,19 +1307,8 @@ const Register BasePtrReg = TRI.hasBasePointer(MF) ? 
TRI.getBaseRegister() : Register(); - bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue(); - bool SpillFPToMemory = false; - if (HasFPSaveIndex) { - SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) != - TargetStackID::SGPRSpill; - } - - bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue(); - bool SpillBPToMemory = false; - if (HasBPSaveIndex) { - SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) != - TargetStackID::SGPRSpill; - } + Optional<int> FPSaveIndex = FuncInfo->FramePointerSaveIndex; + Optional<int> BPSaveIndex = FuncInfo->BasePointerSaveIndex; if (RoundedSize != 0 && hasFP(MF)) { BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg) @@ -1178,10 +1330,10 @@ } Register ScratchExecCopy; - if (HasFPSaveIndex) { - const int FI = *FuncInfo->FramePointerSaveIndex; + if (FPSaveIndex) { + const int FI = *FPSaveIndex; assert(!MFI.isDeadObjectIndex(FI)); - if (SpillFPToMemory) { + if (spilledToMemory(MF, *FPSaveIndex)) { if (!ScratchExecCopy) ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false); @@ -1209,10 +1361,10 @@ MCCFIInstruction::createDefCfaRegister( nullptr, MCRI->getDwarfRegNum(StackPtrReg, false))); - if (HasBPSaveIndex) { - const int BasePtrFI = *FuncInfo->BasePointerSaveIndex; + if (BPSaveIndex) { + const int BasePtrFI = *BPSaveIndex; assert(!MFI.isDeadObjectIndex(BasePtrFI)); - if (SpillBPToMemory) { + if (spilledToMemory(MF, *BPSaveIndex)) { if (!ScratchExecCopy) ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false); @@ -1272,14 +1424,25 @@ #ifndef NDEBUG static bool allSGPRSpillsAreDead(const MachineFunction &MF) { const MachineFrameInfo &MFI = MF.getFrameInfo(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E; ++I) { if (!MFI.isDeadObjectIndex(I) && - MFI.getStackID(I) ==
TargetStackID::SGPRSpill && - (I != FuncInfo->FramePointerSaveIndex && - I != FuncInfo->BasePointerSaveIndex)) { - return false; + MFI.getStackID(I) == TargetStackID::SGPRSpill) { + // Found a non dead SGPR spill. + // This is a hack. The FP, BP and (if '-amdgpu-spill-cfi-saved-regs' is + // enabled) RA/EXEC save slots are kept until their spills are inserted, + // so treat them as dead and keep scanning: returning true for them here + // would hide any other live SGPR spill that follows. + if (I != FuncInfo->FramePointerSaveIndex && + I != FuncInfo->BasePointerSaveIndex && + (!TRI->isCFISavedRegsSpillEnabled() || + (I != FuncInfo->ReturnAddressSaveIndex && + I != FuncInfo->EXECSaveIndex))) { + return false; + } } } @@ -1304,7 +1467,7 @@ const SIRegisterInfo *TRI = ST.getRegisterInfo(); SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); - FuncInfo->removeDeadFrameIndices(MFI); + FuncInfo->removeDeadFrameIndices(MF); assert(allSGPRSpillsAreDead(MF) && "SGPR spill should have been removed in SILowerSGPRSpills"); @@ -1327,6 +1490,35 @@ } } +// Find a register/memory location for RA and EXEC saves. +// +// Prefer lanes of a VGPR used for SGPR spills; if none can be allocated, fall +// back to a regular stack slot so that FI always names a valid frame index +// on return. +static void allocateCFISave(MachineFunction &MF, int &FI, Register Reg) { + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + + // The VGPR lane allocation is attempted whether or not lanes are currently + // free, so there is no need to query haveFreeLanesForSGPRSpill() first. + int NewFI = FrameInfo.CreateStackObject( + TRI->getSpillSize(*RC), TRI->getSpillAlign(*RC), true, nullptr, + TargetStackID::SGPRSpill); + if (MFI->allocateSGPRSpillToVGPR(MF, NewFI)) { + FI = NewFI; + return; + } + + // No VGPR lanes could be allocated: remove the dead SGPR spill index and + // spill to memory instead. + FrameInfo.RemoveStackObject(NewFI); + FI = FrameInfo.CreateSpillStackObject( + TRI->getSpillSize(*RC), Align(TRI->getSpillAlign(*RC))); +} + //
Only report VGPRs to generic code. void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedVGPRs, @@ -1360,6 +1552,13 @@ for (auto SSpill : MFI->getSGPRSpillVGPRs()) SavedVGPRs.reset(SSpill.VGPR); + if (TRI->isCFISavedRegsSpillEnabled()) { + allocateCFISave(MF, MFI->ReturnAddressSaveIndex, + TRI->getReturnAddressReg(MF)); + allocateCFISave(MF, MFI->EXECSaveIndex, + ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC); + } + LivePhysRegs LiveRegs; LiveRegs.init(*TRI); @@ -1473,6 +1672,7 @@ return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint(); } +// Emit the spill instructions for CSRs bool SIFrameLowering::spillCalleeSavedRegisters( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { @@ -1565,6 +1765,7 @@ } } +// Emit CFI for an SGPR spilled to a single lane of a VGPR void SIFrameLowering::buildCFIForSGPRToVGPRSpill( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const Register SGPR, const Register VGPR, @@ -1609,6 +1810,60 @@ MCCFIInstruction::createEscape(nullptr, OSCFIInst.str())); } +// Emit CFI for an SGPR spilled to multiple lanes of VGPRs +void SIFrameLowering::buildCFIForSGPRToVGPRSpill( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register SGPR, + ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills) const { + MachineFunction &MF = *MBB.getParent(); + const MCRegisterInfo &MCRI = *MF.getMMI().getContext().getRegisterInfo(); + int DwarfSGPR = MCRI.getDwarfRegNum(SGPR, false); + + // CFI for an SGPR spilled to multiple lanes of VGPRs is implemented as an + // expression(E) rule where E is a composite location description + // with multiple parts each referencing + // VGPR register location storage with a bit offset of the lane index + // multiplied by the size of an SGPR (32 bits).
In other words we generate + // the following DWARF: + // + // DW_CFA_expression: <SGPR>, + // (DW_OP_regx <VGPR[0]>) (DW_OP_bit_piece 32, <Lane[0]>*32) + // (DW_OP_regx <VGPR[1]>) (DW_OP_bit_piece 32, <Lane[1]>*32) + // ... + // (DW_OP_regx <VGPR[N]>) (DW_OP_bit_piece 32, <Lane[N]>*32) + // + // The memory location description for the current CFA is pushed on the + // stack before E is evaluated, but we choose not to drop it as it would + // require a longer expression E and DWARF defines the result of the + // evaluation to be the location description on the top of the stack (i.e. the + // implicitly pushed one is just ignored.) + SmallString<20> CFIInst; + raw_svector_ostream OSCFIInst(CFIInst); + SmallString<20> Block; + raw_svector_ostream OSBlock(Block); + + OSCFIInst << uint8_t(dwarf::DW_CFA_expression); + encodeULEB128(DwarfSGPR, OSCFIInst); + + // TODO: Detect when we can merge multiple adjacent pieces, or even reduce + // this to a register location description (when all pieces are adjacent). + for (SIMachineFunctionInfo::SpilledReg Spill : VGPRSpills) { + encodeDwarfRegisterLocation(MCRI.getDwarfRegNum(Spill.VGPR, false), + OSBlock); + OSBlock << uint8_t(dwarf::DW_OP_bit_piece); + // FIXME: Can this be a function of the SGPR?
+ const unsigned SGPRBitSize = 32; + encodeULEB128(SGPRBitSize, OSBlock); + encodeULEB128(SGPRBitSize * Spill.Lane, OSBlock); + } + + encodeULEB128(Block.size(), OSCFIInst); + OSCFIInst << Block; + + buildCFI(MBB, MBBI, DL, + MCCFIInstruction::createEscape(nullptr, OSCFIInst.str())); +} + void SIFrameLowering::buildCFIForVGPRToVMEMSpill( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned VGPR, int64_t Offset) const { diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -496,6 +496,9 @@ Register SGPRForBPSaveRestoreCopy; Optional<int> BasePointerSaveIndex; + int ReturnAddressSaveIndex; + int EXECSaveIndex; + Register VGPRReservedForSGPRSpill; bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg); @@ -541,7 +544,7 @@ bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI); bool reserveVGPRforSGPRSpills(MachineFunction &MF); bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR); - void removeDeadFrameIndices(MachineFrameInfo &MFI); + void removeDeadFrameIndices(MachineFunction &MF); bool hasCalculatedTID() const { return TIDReg != 0; }; Register getTIDReg() const { return TIDReg; }; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -428,10 +428,16 @@ return Spill.FullyAllocated; } -void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) { - // The FP & BP spills haven't been inserted yet, so keep them around.
+// Remove the dead spill locations +void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFunction &MF) { + MachineFrameInfo &MFI = MF.getFrameInfo(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + // RA, EXEC, FP & BP spills haven't been inserted yet, so keep them around. for (auto &R : SGPRToVGPRSpills) { - if (R.first != FramePointerSaveIndex && R.first != BasePointerSaveIndex) + if (R.first != FramePointerSaveIndex && R.first != BasePointerSaveIndex && + (!TRI->isCFISavedRegsSpillEnabled() || + (R.first != ReturnAddressSaveIndex && R.first != EXECSaveIndex))) MFI.RemoveStackObject(R.first); } @@ -439,7 +445,9 @@ // ID. for (int i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); i != e; ++i) - if (i != FramePointerSaveIndex && i != BasePointerSaveIndex) + if (i != FramePointerSaveIndex && i != BasePointerSaveIndex && + (!TRI->isCFISavedRegsSpillEnabled() || + (i != ReturnAddressSaveIndex && i != EXECSaveIndex))) MFI.setStackID(i, TargetStackID::Default); for (auto &R : VGPRToAGPRSpills) { diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -58,6 +58,8 @@ return SpillSGPRToVGPR; } + bool isCFISavedRegsSpillEnabled() const; + /// Return the end register initially reserved for the scratch buffer in case /// spilling is needed.
MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -40,6 +40,11 @@ cl::ReallyHidden, cl::init(true)); +static cl::opt<bool> EnableSpillCFISavedRegs( + "amdgpu-spill-cfi-saved-regs", + cl::desc("Enable spilling the registers required for CFI emission"), + cl::ReallyHidden, cl::init(false)); + std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts; std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable; @@ -198,6 +203,10 @@ return SubRegFromChannelTable[NumRegIndex - 1][Channel]; } +bool SIRegisterInfo::isCFISavedRegsSpillEnabled() const { + return EnableSpillCFISavedRegs; +} + MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg( const MachineFunction &MF) const { unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4; diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll @@ -0,0 +1,170 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-spill-cfi-saved-regs -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,WAVE64 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-spill-cfi-saved-regs -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,WAVE32 %s + +; CHECK-LABEL: kern: +; CHECK: .cfi_startproc +; CHECK-NOT: .cfi_{{.*}} +; CHECK: %bb.0: +; CHECK-NEXT: .cfi_escape 0x0f, 0x03, 0x30, 0x36, 0xe1 +; CHECK-NEXT: .cfi_undefined 16 +; CHECK-NOT: .cfi_{{.*}} +; CHECK: .cfi_endproc +define protected amdgpu_kernel void @kern() #0 { +entry: + ret void +} + +; CHECK-LABEL: func_saved_in_clobbered_vgpr: +; CHECK: .cfi_startproc +; CHECK-NOT: .cfi_{{.*}} +; CHECK: %bb.0: +; SGPR32 = 64 +; CHECK-NEXT:
.cfi_llvm_def_aspace_cfa 64, 0, 6 +; CHECK-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04 + + +; FIXME: ideally this would not care what VGPR we spill to, but since we are +; using .cfi_escape it isn't trivial/possible to make this general yet + +; CHECK: v_writelane_b32 v0, s30, 0 +; CHECK-NEXT: v_writelane_b32 v0, s31, 1 + +; DW_CFA_expression [0x10] +; PC_64 ULEB128(16)=[0x10] +; BLOCK_LENGTH ULEB128(12)=[0x0c] +; DW_OP_regx [0x90] +; VGPR0_wave64 ULEB128(2560)=[0x80, 0x14] +; DW_OP_bit_piece [0x9d] +; PIECE_SIZE [0x20] +; PIECE_OFFSET [0x00] +; DW_OP_regx [0x90] +; VGPR0_wave64 ULEB128(2560)=[0x80, 0x14] +; DW_OP_bit_piece [0x9d] +; PIECE_SIZE [0x20] +; PIECE_OFFSET [0x20] +; WAVE64-NEXT: .cfi_escape 0x10, 0x10, 0x0c, 0x90, 0x80, 0x14, 0x9d, 0x20, 0x00, 0x90, 0x80, 0x14, 0x9d, 0x20, 0x20 + +; DW_CFA_expression [0x10] +; PC_64 ULEB128(16)=[0x10] +; BLOCK_LENGTH ULEB128(12)=[0x0c] +; DW_OP_regx [0x90] +; VGPR0_wave32 ULEB128(1536)=[0x80, 0x0c] +; DW_OP_bit_piece [0x9d] +; PIECE_SIZE [0x20] +; PIECE_OFFSET [0x00] +; DW_OP_regx [0x90] +; VGPR0_wave32 ULEB128(1536)=[0x80, 0x0c] +; DW_OP_bit_piece [0x9d] +; PIECE_SIZE [0x20] +; PIECE_OFFSET [0x20] +; WAVE32-NEXT: .cfi_escape 0x10, 0x10, 0x0c, 0x90, 0x80, 0x0c, 0x9d, 0x20, 0x00, 0x90, 0x80, 0x0c, 0x9d, 0x20, 0x20 + + +; WAVE64: v_writelane_b32 v0, exec_lo, 2 +; WAVE64-NEXT: v_writelane_b32 v0, exec_hi, 3 +; DW_CFA_expression [0x10] +; EXEC_MASK_wave64 ULEB128(17)=[0x11] +; BLOCK_LENGTH ULEB128(12)=[0x0c] +; DW_OP_regx [0x90] +; VGPR0_wave64 ULEB128(2560)=[0x80, 0x14] +; DW_OP_bit_piece [0x9d] +; PIECE_SIZE [0x20] +; PIECE_OFFSET [0x40] +; DW_OP_regx [0x90] +; VGPR0_wave64 ULEB128(2560)=[0x80, 0x14] +; DW_OP_bit_piece [0x9d] +; PIECE_SIZE [0x20] +; PIECE_OFFSET [0x60] +; WAVE64-NEXT: .cfi_escape 0x10, 0x11, 0x0c, 0x90, 0x80, 0x14, 0x9d, 0x20, 0x40, 0x90, 0x80, 0x14, 0x9d, 0x20, 0x60 + +; WAVE32: v_writelane_b32 v0, exec_lo, 2 +; DW_CFA_expression [0x10] +; EXEC_MASK_wave32
ULEB128(1)=[0x01] +; BLOCK_LENGTH ULEB128(6)=[0x06] +; DW_OP_regx [0x90] +; VGPR0_wave32 ULEB128(1536)=[0x80, 0x0c] +; DW_OP_bit_piece [0x9d] +; PIECE_SIZE [0x20] +; PIECE_OFFSET [0x40] +; WAVE32-NEXT: .cfi_escape 0x10, 0x01, 0x06, 0x90, 0x80, 0x0c, 0x9d, 0x20, 0x40 + +; CHECK-NOT: .cfi_{{.*}} +; CHECK: .cfi_endproc +define hidden void @func_saved_in_clobbered_vgpr() #0 { +entry: + ret void +} + +; Check that the option causes a CSR VGPR to spill when needed. + +; CHECK-LABEL: func_saved_in_preserved_vgpr: +; CHECK: %bb.0: + +; CHECK: s_or_saveexec_b{{(32|64)}} +; CHECK: buffer_store_dword [[CSR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK: s_mov_b{{(32|64)}} {{(exec|exec_lo)}}, + +; CHECK: v_writelane_b32 [[CSR]], s30, {{[0-9]+}} +; CHECK-NEXT: v_writelane_b32 [[CSR]], s31, {{[0-9]+}} + +; WAVE64: v_writelane_b32 [[CSR]], exec_lo, {{[0-9]+}} +; WAVE64-NEXT: v_writelane_b32 [[CSR]], exec_hi, {{[0-9]+}} + +; WAVE32: v_writelane_b32 [[CSR]], exec_lo, {{[0-9]+}} + +define hidden void @func_saved_in_preserved_vgpr() #0 { +entry: + call void asm sideeffect "; clobber nonpreserved VGPRs", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} + ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} + ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29} + ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}"() + ret void +} + +; There's no return here, so the return address live in was +; deleted. It needs to be re-added as a live in to the entry block. +; CHECK-LABEL: {{^}}empty_func: +; CHECK: v_writelane_b32 v0, s30, 0 +; CHECK: v_writelane_b32 v0, s31, 1 +define void @empty_func() { + unreachable +} + +; Check that the option causes RA and EXEC to be spilled to memory. 
+ +; CHECK-LABEL: no_vgprs_to_spill_into: +; CHECK: %bb.0: + +; WAVE64: s_or_saveexec_b64 s[4:5], -1 +; WAVE64-NEXT: v_mov_b32_e32 v0, s30 +; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; WAVE64-NEXT: v_mov_b32_e32 v0, s31 +; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_offset 16, 0 +; WAVE64-NEXT: v_mov_b32_e32 v0, s4 +; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; WAVE64-NEXT: v_mov_b32_e32 v0, s5 +; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_offset 17, 512 +; WAVE64-NEXT: s_mov_b64 exec, s[4:5] + +define void @no_vgprs_to_spill_into() #1 { + call void asm sideeffect "", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} + ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} + ,~{v20},~{v21},~{v22},~{v23},~{v24}"() + + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind "amdgpu-waves-per-eu"="10,10" } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, emissionKind: FullDebug) +!1 = !DIFile(filename: "filename", directory: "directory") +!2 = !{i32 7, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3}