diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -10,11 +10,11 @@
 #define LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H
 
 #include "AMDGPUFrameLowering.h"
+#include "SIMachineFunctionInfo.h"
 
 namespace llvm {
 
 class SIInstrInfo;
-class SIMachineFunctionInfo;
 class SIRegisterInfo;
 class GCNSubtarget;
 
@@ -92,6 +92,12 @@
                                  MachineBasicBlock::iterator MBBI,
                                  const DebugLoc &DL, const Register SGPR,
                                  const Register VGPR, const int Lane) const;
+  /// Create a CFI index describing a spill of an SGPR to multiple lanes of
+  /// VGPRs and build a MachineInstr around it.
+  void buildCFIForSGPRToVGPRSpill(
+      MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+      const DebugLoc &DL, Register SGPR,
+      ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills) const;
   /// Create a CFI index describing a spill of a VGPR to VMEM and
   /// build a MachineInstr around it.
   void buildCFIForVGPRToVMEMSpill(MachineBasicBlock &MBB,
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -705,6 +705,44 @@
         MCRI->getDwarfRegNum(FuncInfo->SGPRForFPSaveRestoreCopy, false)));
   }
 
+  if (TRI.isCFISavedRegsSpillEnabled()) {
+    MCRegister ReturnAddressReg = TRI.getReturnAddressReg(MF);
+    ArrayRef<SIMachineFunctionInfo::SpilledReg> ReturnAddressSpill =
+        FuncInfo->getSGPRToVGPRSpills(FuncInfo->ReturnAddressSaveIndex);
+    assert(ReturnAddressSpill.size() == 2);
+    BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+            ReturnAddressSpill[0].VGPR)
+        .addReg(TRI.getSubReg(ReturnAddressReg, TRI.getSubRegFromChannel(0)))
+        .addImm(ReturnAddressSpill[0].Lane)
+        .addReg(ReturnAddressSpill[0].VGPR, RegState::Undef);
+    BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+            ReturnAddressSpill[1].VGPR)
+        .addReg(TRI.getSubReg(ReturnAddressReg, TRI.getSubRegFromChannel(1)))
+        .addImm(ReturnAddressSpill[1].Lane)
+        .addReg(ReturnAddressSpill[1].VGPR, RegState::Undef);
+    buildCFIForSGPRToVGPRSpill(MBB, MBBI, DL, AMDGPU::PC_REG,
+                               ReturnAddressSpill);
+
+    ArrayRef<SIMachineFunctionInfo::SpilledReg> EXECSpill =
+        FuncInfo->getSGPRToVGPRSpills(FuncInfo->EXECSaveIndex);
+    assert(EXECSpill.size());
+    BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+            EXECSpill[0].VGPR)
+        .addReg(AMDGPU::EXEC_LO)
+        .addImm(EXECSpill[0].Lane)
+        .addReg(EXECSpill[0].VGPR, RegState::Undef);
+    if (!ST.isWave32()) {
+      assert(EXECSpill.size() == 2);
+      BuildMI(MBB, MBBI, DL,
+              TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+              EXECSpill[1].VGPR)
+          .addReg(AMDGPU::EXEC_HI)
+          .addImm(EXECSpill[1].Lane)
+          .addReg(EXECSpill[1].VGPR, RegState::Undef);
+    }
+    buildCFIForSGPRToVGPRSpill(MBB, MBBI, DL, AMDGPU::EXEC, EXECSpill);
+  }
+
   for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg :
        FuncInfo->getSGPRSpillVGPRs()) {
     if (!Reg.FI.hasValue())
@@ -997,6 +1035,18 @@
   }
 }
 
+static void allocateCFISave(MachineFunction &MF, int &FI, Register Reg) {
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+  int NewFI = MF.getFrameInfo().CreateStackObject(
+      TRI->getSpillSize(*RC), TRI->getSpillAlignment(*RC), true);
+  if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
+    llvm_unreachable("allocate SGPR spill should have worked");
+  FI = NewFI;
+}
+
 // Only report VGPRs to generic code.
 void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                            BitVector &SavedVGPRs,
@@ -1030,6 +1080,13 @@
   for (auto SSpill : MFI->getSGPRSpillVGPRs())
     SavedVGPRs.reset(SSpill.VGPR);
 
+  if (TRI->isCFISavedRegsSpillEnabled()) {
+    allocateCFISave(MF, MFI->ReturnAddressSaveIndex,
+                    TRI->getReturnAddressReg(MF));
+    allocateCFISave(MF, MFI->EXECSaveIndex,
+                    ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
+  }
+
   const bool HasFP = WillHaveFP || hasFP(MF);
   if (!HasFP)
     return;
@@ -1251,6 +1308,59 @@
                MCCFIInstruction::createEscape(nullptr, OSCFIInst.str()));
 }
 
+void SIFrameLowering::buildCFIForSGPRToVGPRSpill(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    const DebugLoc &DL, Register SGPR,
+    ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills) const {
+  MachineFunction &MF = *MBB.getParent();
+  const MCRegisterInfo &MCRI = *MF.getMMI().getContext().getRegisterInfo();
+  int DwarfSGPR = MCRI.getDwarfRegNum(SGPR, false);
+
+  // CFI for an SGPR spilled to multiple lanes of VGPRs is implemented as an
+  // expression(E) rule where E is a composite location description
+  // with multiple parts each referencing
+  // VGPR register location storage with a bit offset of the lane index
+  // multiplied by the size of an SGPR (32 bits). In other words we generate
+  // the following DWARF:
+  //
+  // DW_CFA_expression: <SGPR>,
+  //   (DW_OP_regx <VGPR[0]>) (DW_OP_bit_piece 32, <Lane[0]>*32)
+  //   (DW_OP_regx <VGPR[1]>) (DW_OP_bit_piece 32, <Lane[1]>*32)
+  //   ...
+  //   (DW_OP_regx <VGPR[N]>) (DW_OP_bit_piece 32, <Lane[N]>*32)
+  //
+  // The memory location description for the current CFA is pushed on the
+  // stack before E is evaluated, but we choose not to drop it as it would
+  // require a longer expression E and DWARF defines the result of the
+  // evaluation to be the location description on the top of the stack (i.e.
+  // the implicitly pushed one is just ignored.)
+  SmallString<20> CFIInst;
+  raw_svector_ostream OSCFIInst(CFIInst);
+  SmallString<20> Block;
+  raw_svector_ostream OSBlock(Block);
+
+  OSCFIInst << uint8_t(dwarf::DW_CFA_expression);
+  encodeULEB128(DwarfSGPR, OSCFIInst);
+
+  // TODO: Detect when we can merge multiple adjacent pieces, or even reduce
+  // this to a register location description (when all pieces are adjacent).
+  for (SIMachineFunctionInfo::SpilledReg Spill : VGPRSpills) {
+    encodeDwarfRegisterLocation(MCRI.getDwarfRegNum(Spill.VGPR, false),
+                                OSBlock);
+    OSBlock << uint8_t(dwarf::DW_OP_bit_piece);
+    // FIXME:
+    const unsigned SGPRBitSize = 32;
+    encodeULEB128(SGPRBitSize, OSBlock);
+    encodeULEB128(SGPRBitSize * Spill.Lane, OSBlock);
+  }
+
+  encodeULEB128(Block.size(), OSCFIInst);
+  OSCFIInst << Block;
+
+  buildCFI(MBB, MBBI, DL,
+           MCCFIInstruction::createEscape(nullptr, OSCFIInst.str()));
+}
+
 void SIFrameLowering::buildCFIForVGPRToVMEMSpill(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
     const DebugLoc &DL, unsigned VGPR, int64_t Offset) const {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -485,6 +485,9 @@
   Register SGPRForFPSaveRestoreCopy;
   Optional<int> FramePointerSaveIndex;
 
+  int ReturnAddressSaveIndex;
+  int EXECSaveIndex;
+
 public:
 
   SIMachineFunctionInfo(const MachineFunction &MF);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -47,6 +47,8 @@
     return SpillSGPRToVGPR;
   }
 
+  bool isCFISavedRegsSpillEnabled() const;
+
   /// Return the end register initially reserved for the scratch buffer in case
   /// spilling is needed.
   MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -39,6 +39,11 @@
   cl::ReallyHidden,
   cl::init(true));
 
+static cl::opt<bool> EnableSpillCFISavedRegs(
+    "amdgpu-spill-cfi-saved-regs",
+    cl::desc("Enable spilling the registers required for CFI emission"),
+    cl::ReallyHidden, cl::init(false));
+
 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
     : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST),
       SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
@@ -188,6 +193,10 @@
   return SubRegFromChannelTable[NumRegIndex][Channel];
 }
 
+bool SIRegisterInfo::isCFISavedRegsSpillEnabled() const {
+  return EnableSpillCFISavedRegs;
+}
+
 MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
     const MachineFunction &MF) const {
   unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll
@@ -0,0 +1,106 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=asm -amdgpu-spill-cfi-saved-regs -o - %s | FileCheck --check-prefixes=CHECK,WAVE64 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -filetype=asm -amdgpu-spill-cfi-saved-regs -o - %s | FileCheck --check-prefixes=CHECK,WAVE32 %s
+
+; CHECK-LABEL: kern:
+; CHECK: .cfi_startproc
+; CHECK-NOT: .cfi_{{.*}}
+; CHECK: %bb.0:
+; CHECK-NEXT: .cfi_escape 0x0f, 0x03, 0x30, 0x36, 0xe1
+; CHECK-NEXT: .cfi_undefined 16
+; CHECK-NOT: .cfi_{{.*}}
+; CHECK: .cfi_endproc
+define protected amdgpu_kernel void @kern() #0 {
+entry:
+  ret void
+}
+
+; CHECK-LABEL: func:
+; CHECK: .cfi_startproc
+; CHECK-NOT: .cfi_{{.*}}
+; CHECK: %bb.0:
+; SGPR32 = 64
+; CHECK-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6
+; CHECK-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04
+
+
+; FIXME: ideally this would not care what VGPR we spill to, but since we are
+; using .cfi_escape it isn't trivial/possible to make this general yet
+
+; CHECK: v_writelane_b32 v0, s30, 0
+; CHECK-NEXT: v_writelane_b32 v0, s31, 1
+
+; DW_CFA_expression [0x10]
+;   PC_64 ULEB128(16)=[0x10]
+;   BLOCK_LENGTH ULEB128(12)=[0x0c]
+;     DW_OP_regx [0x90]
+;       VGPR0_wave64 ULEB128(2560)=[0x80, 0x14]
+;     DW_OP_bit_piece [0x9d]
+;       PIECE_SIZE [0x20]
+;       PIECE_OFFSET [0x00]
+;     DW_OP_regx [0x90]
+;       VGPR0_wave64 ULEB128(2560)=[0x80, 0x14]
+;     DW_OP_bit_piece [0x9d]
+;       PIECE_SIZE [0x20]
+;       PIECE_OFFSET [0x20]
+; WAVE64-NEXT: .cfi_escape 0x10, 0x10, 0x0c, 0x90, 0x80, 0x14, 0x9d, 0x20, 0x00, 0x90, 0x80, 0x14, 0x9d, 0x20, 0x20
+
+; DW_CFA_expression [0x10]
+;   PC_64 ULEB128(16)=[0x10]
+;   BLOCK_LENGTH ULEB128(12)=[0x0c]
+;     DW_OP_regx [0x90]
+;       VGPR0_wave32 ULEB128(1536)=[0x80, 0x0c]
+;     DW_OP_bit_piece [0x9d]
+;       PIECE_SIZE [0x20]
+;       PIECE_OFFSET [0x00]
+;     DW_OP_regx [0x90]
+;       VGPR0_wave32 ULEB128(1536)=[0x80, 0x0c]
+;     DW_OP_bit_piece [0x9d]
+;       PIECE_SIZE [0x20]
+;       PIECE_OFFSET [0x20]
+; WAVE32-NEXT: .cfi_escape 0x10, 0x10, 0x0c, 0x90, 0x80, 0x0c, 0x9d, 0x20, 0x00, 0x90, 0x80, 0x0c, 0x9d, 0x20, 0x20
+
+
+; WAVE64: v_writelane_b32 v0, exec_lo, 2
+; WAVE64-NEXT: v_writelane_b32 v0, exec_hi, 3
+; DW_CFA_expression [0x10]
+;   EXEC_MASK_wave64 ULEB128(17)=[0x11]
+;   BLOCK_LENGTH ULEB128(12)=[0x0c]
+;     DW_OP_regx [0x90]
+;       VGPR0_wave64 ULEB128(2560)=[0x80, 0x14]
+;     DW_OP_bit_piece [0x9d]
+;       PIECE_SIZE [0x20]
+;       PIECE_OFFSET [0x40]
+;     DW_OP_regx [0x90]
+;       VGPR0_wave64 ULEB128(2560)=[0x80, 0x14]
+;     DW_OP_bit_piece [0x9d]
+;       PIECE_SIZE [0x20]
+;       PIECE_OFFSET [0x60]
+; WAVE64-NEXT: .cfi_escape 0x10, 0x11, 0x0c, 0x90, 0x80, 0x14, 0x9d, 0x20, 0x40, 0x90, 0x80, 0x14, 0x9d, 0x20, 0x60
+
+; WAVE32: v_writelane_b32 v0, exec_lo, 2
+; DW_CFA_expression [0x10]
+;   EXEC_MASK_wave32 ULEB128(1)=[0x01]
+;   BLOCK_LENGTH ULEB128(6)=[0x06]
+;     DW_OP_regx [0x90]
+;       VGPR0_wave32 ULEB128(1536)=[0x80, 0x0c]
+;     DW_OP_bit_piece [0x9d]
+;       PIECE_SIZE [0x20]
+;       PIECE_OFFSET [0x40]
+; WAVE32-NEXT: .cfi_escape 0x10, 0x01, 0x06, 0x90, 0x80, 0x0c, 0x9d, 0x20, 0x40
+
+; CHECK-NOT: .cfi_{{.*}}
+; CHECK: .cfi_endproc
+define hidden void @func() #0 {
+entry:
+  ret void
+}
+
+attributes #0 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, emissionKind: FullDebug)
+!1 = !DIFile(filename: "filename", directory: "directory")
+!2 = !{i32 7, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}