diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -10,11 +10,11 @@ #define LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H #include "AMDGPUFrameLowering.h" +#include "SIMachineFunctionInfo.h" namespace llvm { class SIInstrInfo; -class SIMachineFunctionInfo; class SIRegisterInfo; class GCNSubtarget; @@ -82,6 +82,14 @@ bool hasFP(const MachineFunction &MF) const override; bool requiresStackPointerReference(const MachineFunction &MF) const; + + /// If '-amdgpu-spill-cfi-saved-regs' is enabled, emit RA/EXEC spills to + /// a free VGPR (lanes) or memory and corresponding CFI rules. + void emitCFISavedRegSpills(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + LivePhysRegs &LiveRegs, Register &ScratchExecCopy, + bool emitSpillsToMem) const; + /// Create a CFI index for CFIInst and build a MachineInstr around it. void buildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const MCCFIInstruction &CFIInst) const; @@ -91,6 +99,12 @@ MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const Register SGPR, const Register VGPR, const int Lane) const; + /// Create a CFI index describing a spill of an SGPR to multiple lanes of + /// VGPRs and build a MachineInstr around it. + void buildCFIForSGPRToVGPRSpill( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register SGPR, + ArrayRef VGPRSpills) const; /// Create a CFI index describing a spill of a VGPR to VMEM and /// build a MachineInstr around it. void buildCFIForVGPRToVMEMSpill(MachineBasicBlock &MBB, diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -123,11 +123,13 @@ MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const SIInstrInfo *TII, Register SpillReg, - Register ScratchRsrcReg, Register SPReg, int FI) { + Register ScratchRsrcReg, Register SPReg, int FI, + int DwordOff) { MachineFunction *MF = MBB.getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); - int64_t Offset = MFI.getObjectOffset(FI); + // 'DwordOff' is the offset of the lower/upper double word + int64_t Offset = MFI.getObjectOffset(FI) + DwordOff; MachineMemOperand *MMO = MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4, @@ -483,12 +485,25 @@ } // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not -// memory. They should have been removed by now. -static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { +// memory. They should have been removed by now, except CFI Saved Reg spills. 
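+// When '-amdgpu-spill-cfi-saved-regs' is enabled, determineCalleeSaves()
+// creates SGPRSpill stack objects for the return address and EXEC before this
+// point, so those two indices are deliberately tolerated below.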
+static bool allStackObjectsAreDead(const MachineFunction &MF) { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const GCNSubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const SIMachineFunctionInfo *FuncInfo = MF.getInfo(); for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E; ++I) { - if (!MFI.isDeadObjectIndex(I)) + if (!MFI.isDeadObjectIndex(I)) { + // determineCalleeSaves() might have added the SGPRSpill stack IDs for + // CFI saves into scratch VGPR, ignore them + if (MFI.getStackID(I) == TargetStackID::SGPRSpill && + TRI->isCFISavedRegsSpillEnabled() && + (I == FuncInfo->ReturnAddressSaveIndex || + I == FuncInfo->EXECSaveIndex)) { + continue; + } return false; + } } return true; @@ -508,8 +523,8 @@ Register ScratchRsrcReg = MFI->getScratchRSrcReg(); - if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) && - allStackObjectsAreDead(MF.getFrameInfo()))) + if (!ScratchRsrcReg || + (!MRI.isPhysRegUsed(ScratchRsrcReg) && allStackObjectsAreDead(MF))) return Register(); if (ST.hasSGPRInitBug() || @@ -882,6 +897,166 @@ return ScratchExecCopy; } +// A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR. +// Otherwise we are spilling to memory. +static bool spilledToMemory(const MachineFunction &MF, int SaveIndex) { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + return MFI.getStackID(SaveIndex) != TargetStackID::SGPRSpill; +} + +// Emit the RA and EXEC saves for the non-kernel functions in the +// prologue and the corresponding CFI rules. +void SIFrameLowering::emitCFISavedRegSpills(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + LivePhysRegs &LiveRegs, + Register &ScratchExecCopy, + bool emitSpillsToMem) const { + const GCNSubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + const MCRegisterInfo *MCRI = MF.getMMI().getContext().getRegisterInfo(); + + SIMachineFunctionInfo *FuncInfo = MF.getInfo(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); + + Optional RASaveIndex = FuncInfo->ReturnAddressSaveIndex; + Optional EXECSaveIndex = FuncInfo->EXECSaveIndex; + Register RetAddrReg = TRI.getReturnAddressReg(MF); + DebugLoc DL; + + if (emitSpillsToMem) { + // Return address is being spilled into memory at the frame + // index and consumes two double words. And + // build the corresponding CFI rule. 
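+    // Each 32-bit half of the return address is copied through a scratch VGPR
+    // and stored at DwordOff 0 and 4 of the frame index. The CFI offset for
+    // PC_REG is the frame-object offset scaled by the wavefront size, matching
+    // the other memory-spill CFI rules in this file (presumably because
+    // scratch offsets are per lane while the unwind offsets describe the
+    // whole wave).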
+ if (RASaveIndex && spilledToMemory(MF, *RASaveIndex)) { + const int FI = *RASaveIndex; + assert(!MFI.isDeadObjectIndex(FI)); + + if (!ScratchExecCopy) + ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); + + MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( + MRI, LiveRegs, AMDGPU::VGPR_32RegClass); + + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) + .addReg(TRI.getSubReg(RetAddrReg, AMDGPU::sub0)); + + int DwordOff = 0; + buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, + FuncInfo->getScratchRSrcReg(), StackPtrReg, FI, + DwordOff); + + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) + .addReg(TRI.getSubReg(RetAddrReg, AMDGPU::sub1)); + + DwordOff = 4; + buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, + FuncInfo->getScratchRSrcReg(), StackPtrReg, FI, + DwordOff); + + buildCFI(MBB, MBBI, DL, + MCCFIInstruction::createOffset( + nullptr, MCRI->getDwarfRegNum(AMDGPU::PC_REG, false), + MFI.getObjectOffset(FI) * ST.getWavefrontSize())); + } + + // EXEC mask is being spilled into memory at the frame + // index and consumes two double words in + // wave64 mode and one doble word in wave32 mode. And + // build the corresponding CFI rule. + if (EXECSaveIndex && spilledToMemory(MF, *EXECSaveIndex)) { + const int FI = *EXECSaveIndex; + assert(!MFI.isDeadObjectIndex(FI)); + + if (!ScratchExecCopy) + ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); + + MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( + MRI, LiveRegs, AMDGPU::VGPR_32RegClass); + + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) + .addReg(TRI.getSubReg(ScratchExecCopy, AMDGPU::sub0)); + + int DwordOff = 0; + buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, + FuncInfo->getScratchRSrcReg(), StackPtrReg, FI, + DwordOff); + + if (!ST.isWave32()) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) + .addReg(TRI.getSubReg(ScratchExecCopy, AMDGPU::sub1)); + + DwordOff = 4; + buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, + FuncInfo->getScratchRSrcReg(), StackPtrReg, FI, + DwordOff); + } + + buildCFI(MBB, MBBI, DL, + MCCFIInstruction::createOffset( + nullptr, MCRI->getDwarfRegNum(AMDGPU::EXEC, false), + MFI.getObjectOffset(FI) * ST.getWavefrontSize())); + } + } + + if (!emitSpillsToMem) { + // Return address is being spilled into free VGPR lanes + // and consumes two lanes, build the corresponding CFI rule. + if (RASaveIndex && !spilledToMemory(MF, *RASaveIndex)) { + MCRegister RetAddrReg = TRI.getReturnAddressReg(MF); + if (!MBB.isLiveIn(RetAddrReg)) + MBB.addLiveIn(RetAddrReg); + + ArrayRef ReturnAddressSpill = + FuncInfo->getSGPRToVGPRSpills(*RASaveIndex); + assert(ReturnAddressSpill.size() == 2); + BuildMI(MBB, MBBI, DL, + TII->get(AMDGPU::V_WRITELANE_B32), + ReturnAddressSpill[0].VGPR) + .addReg(TRI.getSubReg(RetAddrReg, AMDGPU::sub0)) + .addImm(ReturnAddressSpill[0].Lane) + .addReg(ReturnAddressSpill[0].VGPR, RegState::Undef); + BuildMI(MBB, MBBI, DL, + TII->get(AMDGPU::V_WRITELANE_B32), + ReturnAddressSpill[1].VGPR) + .addReg(TRI.getSubReg(RetAddrReg, AMDGPU::sub1)) + .addImm(ReturnAddressSpill[1].Lane) + .addReg(ReturnAddressSpill[1].VGPR, RegState::Undef); + buildCFIForSGPRToVGPRSpill(MBB, MBBI, DL, AMDGPU::PC_REG, + ReturnAddressSpill); + } + + // EXEC mask is being spilled into free VGPR lanes and consumes + // two lanes in wave64 mode and one lane in wave32 mode, build + // the corresponding CFI rule. 
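+    // EXEC_LO is always written to the first reserved lane; EXEC_HI is only
+    // written to a second lane when the subtarget is not wave32.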
+ if (EXECSaveIndex && !spilledToMemory(MF, *EXECSaveIndex)) { + ArrayRef EXECSpill = + FuncInfo->getSGPRToVGPRSpills(*EXECSaveIndex); + assert(EXECSpill.size()); + BuildMI(MBB, MBBI, DL, + TII->get(AMDGPU::V_WRITELANE_B32), + EXECSpill[0].VGPR) + .addReg(AMDGPU::EXEC_LO) + .addImm(EXECSpill[0].Lane) + .addReg(EXECSpill[0].VGPR, RegState::Undef); + if (!ST.isWave32()) { + assert(EXECSpill.size() == 2); + BuildMI(MBB, MBBI, DL, + TII->get(AMDGPU::V_WRITELANE_B32), + EXECSpill[1].VGPR) + .addReg(AMDGPU::EXEC_HI) + .addImm(EXECSpill[1].Lane) + .addReg(EXECSpill[1].VGPR, RegState::Undef); + } + buildCFIForSGPRToVGPRSpill(MBB, MBBI, DL, AMDGPU::EXEC, EXECSpill); + } + } +} + +// Emit the CFI rules for non-kernel functions in the prologue void SIFrameLowering::emitPrologueEntryCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL) const { @@ -974,23 +1149,8 @@ emitPrologueEntryCFI(MBB, MBBI, DL); - bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue(); - bool SpillFPToMemory = false; - // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR. - // Otherwise we are spilling the FP to memory. - if (HasFPSaveIndex) { - SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) != - TargetStackID::SGPRSpill; - } - - bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue(); - bool SpillBPToMemory = false; - // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR. - // Otherwise we are spilling the BP to memory. - if (HasBPSaveIndex) { - SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) != - TargetStackID::SGPRSpill; - } + Optional FPSaveIndex = FuncInfo->FramePointerSaveIndex; + Optional BPSaveIndex = FuncInfo->BasePointerSaveIndex; for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg : FuncInfo->getSGPRSpillVGPRs()) { @@ -1001,9 +1161,10 @@ ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); int FI = Reg.FI.getValue(); + int DwordOff = 0; buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, Reg.VGPR, - FuncInfo->getScratchRSrcReg(), StackPtrReg, FI); + FuncInfo->getScratchRSrcReg(), StackPtrReg, FI, DwordOff); // We spill the entire VGPR, so we can get away with just cfi_offset buildCFI(MBB, MBBI, DL, @@ -1012,8 +1173,14 @@ MFI.getObjectOffset(FI) * ST.getWavefrontSize())); } - if (HasFPSaveIndex && SpillFPToMemory) { - const int FI = *FuncInfo->FramePointerSaveIndex; + if (TRI.isCFISavedRegsSpillEnabled()) { + bool emitSpillsToMem = true; + emitCFISavedRegSpills(MF, MBB, MBBI, LiveRegs, ScratchExecCopy, + emitSpillsToMem); + } + + if (FPSaveIndex && spilledToMemory(MF, *FPSaveIndex)) { + const int FI = *FPSaveIndex; assert(!MFI.isDeadObjectIndex(FI)); if (!ScratchExecCopy) @@ -1027,16 +1194,17 @@ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) .addReg(FramePtrReg); + int DwordOff = 0; buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, - FuncInfo->getScratchRSrcReg(), StackPtrReg, FI); + FuncInfo->getScratchRSrcReg(), StackPtrReg, FI, DwordOff); buildCFI(MBB, MBBI, DL, MCCFIInstruction::createOffset( nullptr, MCRI->getDwarfRegNum(FramePtrReg, false), MFI.getObjectOffset(FI) * ST.getWavefrontSize())); } - if (HasBPSaveIndex && SpillBPToMemory) { - const int BasePtrFI = *FuncInfo->BasePointerSaveIndex; + if (BPSaveIndex && spilledToMemory(MF, *BPSaveIndex)) { + const int BasePtrFI = *BPSaveIndex; assert(!MFI.isDeadObjectIndex(BasePtrFI)); if (!ScratchExecCopy) @@ -1050,8 +1218,10 @@ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) 
.addReg(BasePtrReg); + int DwordOff = 0; buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, - FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI); + FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI, + DwordOff); buildCFI(MBB, MBBI, DL, MCCFIInstruction::createOffset( nullptr, MCRI->getDwarfRegNum(BasePtrReg, false), @@ -1067,9 +1237,15 @@ LiveRegs.addReg(ScratchExecCopy); } + if (TRI.isCFISavedRegsSpillEnabled()) { + bool emitSpillsToMem = false; + emitCFISavedRegSpills(MF, MBB, MBBI, LiveRegs, ScratchExecCopy, + emitSpillsToMem); + } + // In this case, spill the FP to a reserved VGPR. - if (HasFPSaveIndex && !SpillFPToMemory) { - const int FI = *FuncInfo->FramePointerSaveIndex; + if (FPSaveIndex && !spilledToMemory(MF, *FPSaveIndex)) { + const int FI = *FPSaveIndex; assert(!MFI.isDeadObjectIndex(FI)); assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); @@ -1089,8 +1265,8 @@ } // In this case, spill the BP to a reserved VGPR. - if (HasBPSaveIndex && !SpillBPToMemory) { - const int BasePtrFI = *FuncInfo->BasePointerSaveIndex; + if (BPSaveIndex && !spilledToMemory(MF, *BPSaveIndex)) { + const int BasePtrFI = *BPSaveIndex; assert(!MFI.isDeadObjectIndex(BasePtrFI)); assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill); @@ -1248,19 +1424,8 @@ const Register BasePtrReg = TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register(); - bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue(); - bool SpillFPToMemory = false; - if (HasFPSaveIndex) { - SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) != - TargetStackID::SGPRSpill; - } - - bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue(); - bool SpillBPToMemory = false; - if (HasBPSaveIndex) { - SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) != - TargetStackID::SGPRSpill; - } + Optional FPSaveIndex = FuncInfo->FramePointerSaveIndex; + Optional BPSaveIndex = FuncInfo->BasePointerSaveIndex; if (RoundedSize != 0 && hasFP(MF)) { BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg) @@ -1282,10 +1447,10 @@ } Register ScratchExecCopy; - if (HasFPSaveIndex) { - const int FI = *FuncInfo->FramePointerSaveIndex; + if (FPSaveIndex) { + const int FI = *FPSaveIndex; assert(!MFI.isDeadObjectIndex(FI)); - if (SpillFPToMemory) { + if (spilledToMemory(MF, *FPSaveIndex)) { if (!ScratchExecCopy) ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false); @@ -1314,10 +1479,10 @@ MCCFIInstruction::createDefCfaRegister( nullptr, MCRI->getDwarfRegNum(StackPtrReg, false))); - if (HasBPSaveIndex) { - const int BasePtrFI = *FuncInfo->BasePointerSaveIndex; + if (BPSaveIndex) { + const int BasePtrFI = *BPSaveIndex; assert(!MFI.isDeadObjectIndex(BasePtrFI)); - if (SpillBPToMemory) { + if (spilledToMemory(MF, *BPSaveIndex)) { if (!ScratchExecCopy) ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false); @@ -1366,14 +1531,25 @@ #ifndef NDEBUG static bool allSGPRSpillsAreDead(const MachineFunction &MF) { const MachineFrameInfo &MFI = MF.getFrameInfo(); + const GCNSubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); const SIMachineFunctionInfo *FuncInfo = MF.getInfo(); for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E; ++I) { if (!MFI.isDeadObjectIndex(I) && - MFI.getStackID(I) == TargetStackID::SGPRSpill && - (I != FuncInfo->FramePointerSaveIndex && - I != FuncInfo->BasePointerSaveIndex)) { - return false; + MFI.getStackID(I) == TargetStackID::SGPRSpill) { + // Found a non dead SGPR spill + if (I != 
FuncInfo->FramePointerSaveIndex && + I != FuncInfo->BasePointerSaveIndex && + (!TRI->isCFISavedRegsSpillEnabled() || + (I != FuncInfo->ReturnAddressSaveIndex && + I != FuncInfo->EXECSaveIndex))) { + // This is a hack. Consider the SGPR spill as dead if it is for the + // FP, BP or RA/EXEC if '-amdgpu-spill-cfi-saved-regs' is enabled. + return false; + } else { + return true; + } } } @@ -1399,14 +1575,14 @@ const SIRegisterInfo *TRI = ST.getRegisterInfo(); SIMachineFunctionInfo *FuncInfo = MF.getInfo(); - FuncInfo->removeDeadFrameIndices(MFI); + FuncInfo->removeDeadFrameIndices(MF); assert(allSGPRSpillsAreDead(MF) && "SGPR spill should have been removed in SILowerSGPRSpills"); // FIXME: The other checks should be redundant with allStackObjectsAreDead, // but currently hasNonSpillStackObjects is set only from source // allocas. Stack temps produced from legalization are not counted currently. - if (!allStackObjectsAreDead(MFI)) { + if (!allStackObjectsAreDead(MF)) { assert(RS && "RegScavenger required if spilling"); if (FuncInfo->isEntryFunction()) { @@ -1422,6 +1598,35 @@ } } +// Find a register/memory location for RA and EXEC saves +static void allocateCFISave(MachineFunction &MF, int &FI, Register Reg) { + SIMachineFunctionInfo *MFI = MF.getInfo(); + const GCNSubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + if (MFI->haveFreeLanesForSGPRSpill(MF, TRI->getSpillSize(*RC) / 4)) { + int NewFI = MF.getFrameInfo().CreateStackObject( + TRI->getSpillSize(*RC), TRI->getSpillAlign(*RC), true, nullptr, + TargetStackID::SGPRSpill); + if (TRI->spillSGPRToVGPR() && MFI->allocateSGPRSpillToVGPR(MF, NewFI)) { + FI = NewFI; + } + } else { + int NewFI = MF.getFrameInfo().CreateStackObject( + TRI->getSpillSize(*RC), TRI->getSpillAlign(*RC), true, nullptr, + TargetStackID::SGPRSpill); + if (TRI->spillSGPRToVGPR() && MFI->allocateSGPRSpillToVGPR(MF, NewFI)) { + FI = NewFI; + } else { + // Remove dead index + MF.getFrameInfo().RemoveStackObject(NewFI); + FI = MF.getFrameInfo().CreateSpillStackObject( + TRI->getSpillSize(*RC), Align(TRI->getSpillAlign(*RC))); + } + } + return; +} + // Only report VGPRs to generic code. void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedVGPRs, @@ -1438,6 +1643,13 @@ // Ignore the SGPRs the default implementation found. SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask()); + if (TRI->isCFISavedRegsSpillEnabled()) { + allocateCFISave(MF, MFI->ReturnAddressSaveIndex, + TRI->getReturnAddressReg(MF)); + allocateCFISave(MF, MFI->EXECSaveIndex, + ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC); + } + // hasFP only knows about stack objects that already exist. We're now // determining the stack slots that will be created, so we have to predict // them. Stack objects force FP usage with calls. @@ -1446,9 +1658,8 @@ // don't want to report it here. // // FIXME: Is this really hasReservedCallFrame? - const bool WillHaveFP = - FrameInfo.hasCalls() && - (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo)); + bool WillHaveFP = FrameInfo.hasCalls() && + (SavedVGPRs.any() || !allStackObjectsAreDead(MF)); // VGPRs used for SGPR spilling need to be specially inserted in the prolog, // so don't allow the default insertion to handle them. 
@@ -1587,6 +1798,7 @@ return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint(); } +// Emit the spill instructions for CSRs bool SIFrameLowering::spillCalleeSavedRegisters( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const ArrayRef CSI, const TargetRegisterInfo *TRI) const { @@ -1685,6 +1897,7 @@ } } +// Emit CFI for an SGPR spilled to a single lane of a VGPR void SIFrameLowering::buildCFIForSGPRToVGPRSpill( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const Register SGPR, const Register VGPR, @@ -1729,6 +1942,60 @@ MCCFIInstruction::createEscape(nullptr, OSCFIInst.str())); } +// Emit CFI for an SGPR spilled to multiple lanes of VGPRs +void SIFrameLowering::buildCFIForSGPRToVGPRSpill( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register SGPR, + ArrayRef VGPRSpills) const { + MachineFunction &MF = *MBB.getParent(); + const MCRegisterInfo &MCRI = *MF.getMMI().getContext().getRegisterInfo(); + int DwarfSGPR = MCRI.getDwarfRegNum(SGPR, false); + + // CFI for an SGPR spilled to a multiple lanes of VGPRs is implemented as an + // expression(E) rule where E is a composite location description + // with multiple parts each referencing + // VGPR register location storage with a bit offset of the lane index + // multiplied by the size of an SGPR (32 bits). In other words we generate + // the following DWARF: + // + // DW_CFA_expression: , + // (DW_OP_regx ) (DW_OP_bit_piece 32, *32) + // (DW_OP_regx ) (DW_OP_bit_piece 32, *32) + // ... + // (DW_OP_regx ) (DW_OP_bit_piece 32, *32) + // + // The memory location description for the current CFA is pushed on the + // stack before E is evaluated, but we choose not to drop it as it would + // require a longer expression E and DWARF defines the result of the + // evaulation to be the location description on the top of the stack (i.e. the + // implictly pushed one is just ignored.) + SmallString<20> CFIInst; + raw_svector_ostream OSCFIInst(CFIInst); + SmallString<20> Block; + raw_svector_ostream OSBlock(Block); + + OSCFIInst << uint8_t(dwarf::DW_CFA_expression); + encodeULEB128(DwarfSGPR, OSCFIInst); + + // TODO: Detect when we can merge multiple adjacent pieces, or even reduce + // this to a register location description (when all pieces are adjacent). + for (SIMachineFunctionInfo::SpilledReg Spill : VGPRSpills) { + encodeDwarfRegisterLocation(MCRI.getDwarfRegNum(Spill.VGPR, false), + OSBlock); + OSBlock << uint8_t(dwarf::DW_OP_bit_piece); + // FIXME:Can this be a function of the SGPR? 
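+    // Each piece covers 32 bits (one SGPR worth) at a bit offset of
+    // Spill.Lane * 32 within the holding VGPR.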
+ const unsigned SGPRBitSize = 32; + encodeULEB128(SGPRBitSize, OSBlock); + encodeULEB128(SGPRBitSize * Spill.Lane, OSBlock); + } + + encodeULEB128(Block.size(), OSCFIInst); + OSCFIInst << Block; + + buildCFI(MBB, MBBI, DL, + MCCFIInstruction::createEscape(nullptr, OSCFIInst.str())); +} + void SIFrameLowering::buildCFIForVGPRToVMEMSpill( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned VGPR, int64_t Offset) const { diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -491,6 +491,9 @@ Register SGPRForBPSaveRestoreCopy; Optional BasePointerSaveIndex; + int ReturnAddressSaveIndex; + int EXECSaveIndex; + Register VGPRReservedForSGPRSpill; bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg); @@ -536,7 +539,7 @@ bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI); bool reserveVGPRforSGPRSpills(MachineFunction &MF); bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR); - void removeDeadFrameIndices(MachineFrameInfo &MFI); + void removeDeadFrameIndices(MachineFunction &MF); bool hasCalculatedTID() const { return TIDReg != 0; }; Register getTIDReg() const { return TIDReg; }; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -420,10 +420,16 @@ return Spill.FullyAllocated; } -void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) { - // The FP & BP spills haven't been inserted yet, so keep them around. +// Remove the dead spill locations +void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFunction &MF) { + MachineFrameInfo &MFI = MF.getFrameInfo(); + const GCNSubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + // RA, EXEC, FP & BP spills haven't been inserted yet, so keep them around. for (auto &R : SGPRToVGPRSpills) { - if (R.first != FramePointerSaveIndex && R.first != BasePointerSaveIndex) + if (R.first != FramePointerSaveIndex && R.first != BasePointerSaveIndex && + (!TRI->isCFISavedRegsSpillEnabled() || + (R.first != ReturnAddressSaveIndex && R.first != EXECSaveIndex))) MFI.RemoveStackObject(R.first); } @@ -431,7 +437,9 @@ // ID. for (int i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); i != e; ++i) - if (i != FramePointerSaveIndex && i != BasePointerSaveIndex) + if (i != FramePointerSaveIndex && i != BasePointerSaveIndex && + (!TRI->isCFISavedRegsSpillEnabled() || + (i != ReturnAddressSaveIndex && i != EXECSaveIndex))) MFI.setStackID(i, TargetStackID::Default); for (auto &R : VGPRToAGPRSpills) { diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -56,6 +56,8 @@ return SpillSGPRToVGPR; } + bool isCFISavedRegsSpillEnabled() const; + /// Return the end register initially reserved for the scratch buffer in case /// spilling is needed. 
MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -33,6 +33,11 @@ cl::ReallyHidden, cl::init(true)); +static cl::opt EnableSpillCFISavedRegs( + "amdgpu-spill-cfi-saved-regs", + cl::desc("Enable spilling the registers required for CFI emission"), + cl::ReallyHidden, cl::init(false), cl::ZeroOrMore); + std::array, 16> SIRegisterInfo::RegSplitParts; std::array, 9> SIRegisterInfo::SubRegFromChannelTable; @@ -194,6 +199,10 @@ return SubRegFromChannelTable[NumRegIndex - 1][Channel]; } +bool SIRegisterInfo::isCFISavedRegsSpillEnabled() const { + return EnableSpillCFISavedRegs; +} + MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg( const MachineFunction &MF) const { unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4; diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll @@ -0,0 +1,170 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-spill-cfi-saved-regs -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,WAVE64 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-spill-cfi-saved-regs -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,WAVE32 %s + +; CHECK-LABEL: kern: +; CHECK: .cfi_startproc +; CHECK-NOT: .cfi_{{.*}} +; CHECK: %bb.0: +; CHECK-NEXT: .cfi_escape 0x0f, 0x03, 0x30, 0x36, 0xe1 +; CHECK-NEXT: .cfi_undefined 16 +; CHECK-NOT: .cfi_{{.*}} +; CHECK: .cfi_endproc +define protected amdgpu_kernel void @kern() #0 { +entry: + ret void +} + +; CHECK-LABEL: func_saved_in_clobbered_vgpr: +; CHECK: .cfi_startproc +; CHECK-NOT: .cfi_{{.*}} +; CHECK: %bb.0: +; SGPR32 = 64 +; CHECK-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; CHECK-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04 + + +; FIXME: ideally this would not care what VGPR we spill to, but since we are +; using .cfi_escape it isn't trivial/possible to make this general yet + +; CHECK: v_writelane_b32 v0, s30, 0 +; CHECK-NEXT: v_writelane_b32 v0, s31, 1 + +; DW_CFA_expression [0x10] +; PC_64 ULEB128(17)=[0x10] +; BLOCK_LENGTH ULEB128(12)=[0x0c] +; DW_OP_regx [0x90] +; VGPR0_wave64 ULEB128(2560)=[0x80, 0x14] +; DW_OP_bit_piece [0x9d] +; PIECE_SIZE [0x20] +; PIECE_OFFSET [0x00] +; DW_OP_regx [0x90] +; VGPR0_wave64 ULEB128(2560)=[0x80, 0x14] +; DW_OP_bit_piece [0x9d] +; PIECE_SIZE [0x20] +; PIECE_OFFSET [0x20] +; WAVE64-NEXT: .cfi_escape 0x10, 0x10, 0x0c, 0x90, 0x80, 0x14, 0x9d, 0x20, 0x00, 0x90, 0x80, 0x14, 0x9d, 0x20, 0x20 + +; DW_CFA_expression [0x10] +; PC_64 ULEB128(17)=[0x10] +; BLOCK_LENGTH ULEB128(12)=[0x0c] +; DW_OP_regx [0x90] +; VGPR0_wave32 ULEB128(1536)=[0x80, 0x0c] +; DW_OP_bit_piece [0x9d] +; PIECE_SIZE [0x20] +; PIECE_OFFSET [0x00] +; DW_OP_regx [0x90] +; VGPR0_wave32 ULEB128(1536)=[0x80, 0x0c] +; DW_OP_bit_piece [0x9d] +; PIECE_SIZE [0x20] +; PIECE_OFFSET [0x20] +; WAVE32-NEXT: .cfi_escape 0x10, 0x10, 0x0c, 0x90, 0x80, 0x0c, 0x9d, 0x20, 0x00, 0x90, 0x80, 0x0c, 0x9d, 0x20, 0x20 + + +; WAVE64: v_writelane_b32 v0, exec_lo, 2 +; WAVE64-NEXT: v_writelane_b32 v0, exec_hi, 3 +; DW_CFA_expression [0x10] +; EXEC_MASK_wave64 ULEB128(17)=[0x11] +; BLOCK_LENGTH ULEB128(12)=[0x0c] +; DW_OP_regx [0x90] +; VGPR0_wave64 
ULEB128(2560)=[0x80, 0x14] +; DW_OP_bit_piece [0x9d] +; PIECE_SIZE [0x20] +; PIECE_OFFSET [0x40] +; DW_OP_regx [0x90] +; VGPR0_wave64 ULEB128(2560)=[0x80, 0x14] +; DW_OP_bit_piece [0x9d] +; PIECE_SIZE [0x20] +; PIECE_OFFSET [0x60] +; WAVE64-NEXT: .cfi_escape 0x10, 0x11, 0x0c, 0x90, 0x80, 0x14, 0x9d, 0x20, 0x40, 0x90, 0x80, 0x14, 0x9d, 0x20, 0x60 + +; WAVE32: v_writelane_b32 v0, exec_lo, 2 +; DW_CFA_expression [0x10] +; EXEC_MASK_wave32 ULEB128(1)=[0x01] +; BLOCK_LENGTH ULEB128(6)=[0x06] +; DW_OP_regx [0x90] +; VGPR0_wave32 ULEB128(1536)=[0x80, 0x0c] +; DW_OP_bit_piece [0x9d] +; PIECE_SIZE [0x20] +; PIECE_OFFSET [0x40] +; WAVE32-NEXT: .cfi_escape 0x10, 0x01, 0x06, 0x90, 0x80, 0x0c, 0x9d, 0x20, 0x40 + +; CHECK-NOT: .cfi_{{.*}} +; CHECK: .cfi_endproc +define hidden void @func_saved_in_clobbered_vgpr() #0 { +entry: + ret void +} + +; Check that the option causes a CSR VGPR to spill when needed. + +; CHECK-LABEL: func_saved_in_preserved_vgpr: +; CHECK: %bb.0: + +; CHECK: s_or_saveexec_b{{(32|64)}} +; CHECK: buffer_store_dword [[CSR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK: s_mov_b{{(32|64)}} {{(exec|exec_lo)}}, + +; CHECK: v_writelane_b32 [[CSR]], s30, {{[0-9]+}} +; CHECK-NEXT: v_writelane_b32 [[CSR]], s31, {{[0-9]+}} + +; WAVE64: v_writelane_b32 [[CSR]], exec_lo, {{[0-9]+}} +; WAVE64-NEXT: v_writelane_b32 [[CSR]], exec_hi, {{[0-9]+}} + +; WAVE32: v_writelane_b32 [[CSR]], exec_lo, {{[0-9]+}} + +define hidden void @func_saved_in_preserved_vgpr() #0 { +entry: + call void asm sideeffect "; clobber nonpreserved VGPRs", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} + ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} + ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29} + ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}"() + ret void +} + +; There's no return here, so the return address live in was +; deleted. It needs to be re-added as a live in to the entry block. +; CHECK-LABEL: {{^}}empty_func: +; CHECK: v_writelane_b32 v0, s30, 0 +; CHECK: v_writelane_b32 v0, s31, 1 +define void @empty_func() { + unreachable +} + +; Check that the option causes RA and EXEC to be spilled to memory. 
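+; The "amdgpu-waves-per-eu"="10,10" attribute restricts the VGPR budget and the
+; inline asm clobbers the VGPRs that remain, so no free lanes are available and
+; the RA/EXEC saves have to go to scratch memory, as the buffer_store_dword
+; instructions checked below show.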
+ +; CHECK-LABEL: no_vgprs_to_spill_into: +; CHECK: %bb.0: + +; WAVE64: s_or_saveexec_b64 s[4:5], -1 +; WAVE64-NEXT: v_mov_b32_e32 v0, s30 +; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; WAVE64-NEXT: v_mov_b32_e32 v0, s31 +; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_offset 16, 0 +; WAVE64-NEXT: v_mov_b32_e32 v0, s4 +; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; WAVE64-NEXT: v_mov_b32_e32 v0, s5 +; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_offset 17, 512 +; WAVE64-NEXT: s_mov_b64 exec, s[4:5] + +define void @no_vgprs_to_spill_into() #1 { + call void asm sideeffect "", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} + ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} + ,~{v20},~{v21},~{v22},~{v23},~{v24}"() + + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind "amdgpu-waves-per-eu"="10,10" } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, emissionKind: FullDebug) +!1 = !DIFile(filename: "filename", directory: "directory") +!2 = !{i32 7, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} diff --git a/llvm/test/CodeGen/AMDGPU/debug-frame.ll b/llvm/test/CodeGen/AMDGPU/debug-frame.ll --- a/llvm/test/CodeGen/AMDGPU/debug-frame.ll +++ b/llvm/test/CodeGen/AMDGPU/debug-frame.ll @@ -522,102 +522,6 @@ ret void } -; CHECK-LABEL: func_spill_vgpr_to_vmem: -; CHECK: .cfi_startproc - -; CHECK-NOT: .cfi_{{.*}} - -; CHECK: %bb.0: -; SGPR32 = 64 -; CHECK-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 -; CHECK-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04 - -; CHECK-NOT: .cfi_{{.*}} - -; CHECK: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill - -; DW_CFA_expression [0x10] -; VGPR40_wave64 ULEB128(1576)=[0xa8, 0x14] -; BLOCK_LENGTH ULEB128(14)=[0x0e] -; DW_OP_regx [0x90] -; VGPR40_wave64 ULEB128(1576)=[0xa8, 0x14] -; DW_OP_swap [0x16] -; DW_OP_LLVM_offset_uconst [0xe4] -; OFFSET ULEB128(256)=[0x80, 0x02] -; DW_OP_LLVM_call_frame_entry_reg [0xe6] -; EXEC_MASK_wave64 ULEB128(17)=[0x11] -; DW_OP_deref_size [0x94] -; SIZE [0x08] -; DW_OP_LLVM_select_bit_piece [0xec] -; ELEMENT_SIZE [0x20] -; ELEMENT_COUNT [0x40] -; WAVE64-NEXT: .cfi_escape 0x10, 0xa8, 0x14, 0x0e, 0x90, 0xa8, 0x14, 0x16, 0xe4, 0x80, 0x02, 0xe6, 0x11, 0x94, 0x08, 0xec, 0x20, 0x40 - -; DW_CFA_expression [0x10] -; VGPR40_wave32 ULEB128(1576)=[0xa8, 0x0c] -; BLOCK_LENGTH ULEB128(14)=[0x0e] -; DW_OP_regx [0x90] -; VGPR40_wave32 ULEB128(1576)=[0xa8, 0x0c] -; DW_OP_swap [0x16] -; DW_OP_LLVM_offset_uconst [0xe4] -; OFFSET ULEB128(128)=[0x80, 0x01] -; DW_OP_LLVM_call_frame_entry_reg [0xe6] -; EXEC_MASK_wave32 ULEB128(1)=[0x01] -; DW_OP_deref_size [0x94] -; SIZE [0x04] -; DW_OP_LLVM_select_bit_piece [0xec] -; ELEMENT_SIZE [0x20] -; ELEMENT_COUNT [0x20] -; WAVE32-NEXT: .cfi_escape 0x10, 0xa8, 0x0c, 0x0e, 0x90, 0xa8, 0x0c, 0x16, 0xe4, 0x80, 0x01, 0xe6, 0x01, 0x94, 0x04, 0xec, 0x20, 0x20 - -; CHECK-NOT: .cfi_{{.*}} - -; CHECK: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill - -; DW_CFA_expression [0x10] -; VGPR41_wave64 ULEB128(2601)=[0xa9, 0x14] -; BLOCK_LENGTH ULEB128(13)=[0x0d] -; DW_OP_regx [0x90] -; VGPR41_wave64 ULEB128(2601)=[0xa9, 0x14] -; DW_OP_swap [0x16] -; DW_OP_LLVM_offset_uconst [0xe4] -; OFFSET ULEB128(0)=[0x00] -; 
DW_OP_LLVM_call_frame_entry_reg [0xe6] -; EXEC_MASK_wave64 ULEB128(17)=[0x11] -; DW_OP_deref_size [0x94] -; SIZE [0x08] -; DW_OP_LLVM_select_bit_piece [0xec] -; ELEMENT_SIZE [0x20] -; ELEMENT_COUNT [0x40] -; WAVE64-NEXT: .cfi_escape 0x10, 0xa9, 0x14, 0x0d, 0x90, 0xa9, 0x14, 0x16, 0xe4, 0x00, 0xe6, 0x11, 0x94, 0x08, 0xec, 0x20, 0x40 - -; DW_CFA_expression [0x10] -; VGPR41_wave32 ULEB128(1577)=[0xa9, 0x0c] -; BLOCK_LENGTH ULEB128(13)=[0x0d] -; DW_OP_regx [0x90] -; VGPR41_wave32 ULEB128(1577)=[0xa9, 0x0c] -; DW_OP_swap [0x16] -; DW_OP_LLVM_offset_uconst [0xe4] -; OFFSET ULEB128(0)=[0x00] -; DW_OP_LLVM_call_frame_entry_reg [0xe6] -; EXEC_MASK_wave32 ULEB128(1)=[0x01] -; DW_OP_deref_size [0x94] -; SIZE [0x04] -; DW_OP_LLVM_select_bit_piece [0xec] -; ELEMENT_SIZE [0x20] -; ELEMENT_COUNT [0x20] -; WAVE32-NEXT: .cfi_escape 0x10, 0xa9, 0x0c, 0x0d, 0x90, 0xa9, 0x0c, 0x16, 0xe4, 0x00, 0xe6, 0x01, 0x94, 0x04, 0xec, 0x20, 0x20 - -; CHECK-NOT: .cfi_{{.*}} - -; CHECK: .cfi_endproc -define hidden void @func_spill_vgpr_to_vmem() #0 { -entry: - call void asm sideeffect "; clobber", "~{v40}"() #0 - call void asm sideeffect "; clobber", "~{v41}"() #0 - ret void -} - ; NOTE: Number of VGPRs available to kernel, and in turn number of corresponding CFIs generated, ; is dependent on waves/WG size. Since the intent here is to check whether we generate the correct ; CFIs, doing it for any one set of details is sufficient which also makes the test insensitive to diff --git a/llvm/test/CodeGen/AMDGPU/pei-cfi-saves-bug.ll b/llvm/test/CodeGen/AMDGPU/pei-cfi-saves-bug.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/pei-cfi-saves-bug.ll @@ -0,0 +1,113 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-spill-cfi-saved-regs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-cfi-saved-regs < %s | FileCheck %s + +; Function Attrs: noinline optnone +define fastcc void @tail_callee() #2 { +; CHECK-LABEL: tail_callee: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: v_writelane_b32 v0, s30, 0 +; CHECK-NEXT: v_writelane_b32 v0, s31, 1 +; CHECK-NEXT: v_writelane_b32 v0, exec_lo, 2 +; CHECK-NEXT: v_writelane_b32 v0, exec_hi, 3 +; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + ret void +} + +; Function Attrs: noinline +define fastcc void @callee_no_fp() #0 { +; CHECK-LABEL: callee_no_fp: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: v_writelane_b32 v1, s30, 0 +; CHECK-NEXT: v_writelane_b32 v1, s31, 1 +; CHECK-NEXT: v_writelane_b32 v1, exec_lo, 2 +; CHECK-NEXT: v_writelane_b32 v1, exec_hi, 3 +; CHECK-NEXT: v_writelane_b32 v1, s33, 4 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_add_u32 s32, s32, 0x400 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, tail_callee@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, tail_callee@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 
0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +entry: + tail call fastcc void @tail_callee() #3 + unreachable +} + +define protected amdgpu_kernel void @kernel() #1 { +; CHECK-LABEL: kernel: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s7 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_cbranch_scc0 BB2_2 +; CHECK-NEXT: ; %bb.1: ; %end +; CHECK-NEXT: s_endpgm +; CHECK-NEXT: BB2_2: ; %body +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, callee_no_fp@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, callee_no_fp@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +entry: + br i1 undef, label %end, label %body + +body: ; preds = %entry + tail call fastcc void @callee_no_fp() #3 + unreachable + +end: ; preds = %entry + ret void +} + +; When we have calls, spilling a CSR VGPR for CFI saves should force FP usage +; Function Attrs: noinline +define dso_local fastcc void @func_needs_fp() unnamed_addr #0 { +; CHECK-LABEL: func_needs_fp: +; CHECK: func_needs_fp$local: +; CHECK-NEXT: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 +; CHECK-NEXT: v_writelane_b32 v40, exec_lo, 2 +; CHECK-NEXT: v_writelane_b32 v40, exec_hi, 3 +; CHECK-NEXT: v_writelane_b32 v40, s33, 4 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_add_u32 s32, s32, 0x400 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, tail_callee_fp@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, tail_callee_fp@rel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +entry: + tail call fastcc void @tail_callee_fp() #3 + unreachable +} + +; Function Attrs: noinline optnone +declare dso_local fastcc void @tail_callee_fp() unnamed_addr #2 + +attributes #0 = { noinline } +attributes #1 = { "use-soft-float"="false" } +attributes #2 = { noinline optnone } +attributes #3 = { convergent nounwind } +