diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -54,6 +54,11 @@
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI) const override;
 
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MBBI,
+                                 const ArrayRef<CalleeSavedInfo> CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+
 private:
   void emitEntryFunctionFlatScratchInit(MachineFunction &MF,
                                         MachineBasicBlock &MBB,
@@ -85,6 +90,12 @@
                                   MachineBasicBlock::iterator MBBI,
                                   const DebugLoc &DL, const Register SGPR,
                                   const Register VGPR, const int Lane) const;
+  /// Create a CFI index describing a spill of a VGPR to VMEM and
+  /// build a MachineInstr around it.
+  void buildCFIForVGPRToVMEMSpill(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MBBI,
+                                  const DebugLoc &DL, unsigned VGPR,
+                                  int64_t Offset) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1377,6 +1377,30 @@
   return MBB.erase(I);
 }
 
+bool SIFrameLowering::spillCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    const ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
+  MachineFunction &MF = *MBB.getParent();
+  const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
+
+  for (const CalleeSavedInfo &CS : CSI) {
+    // Insert the spill to the stack frame.
+    unsigned Reg = CS.getReg();
+
+    if (CS.isSpilledToReg()) {
+      BuildMI(MBB, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY),
+              CS.getDstReg())
+          .addReg(Reg, getKillRegState(true));
+    } else {
+      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+      TII->storeRegToStackSlotCFI(MBB, MBBI, Reg, true, CS.getFrameIdx(), RC,
+                                  TRI);
+    }
+  }
+
+  return true;
+}
+
 bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
 
@@ -1461,3 +1485,42 @@
   buildCFI(MBB, MBBI, DL,
            MCCFIInstruction::createEscape(nullptr, OSCFIInst.str()));
 }
+
+void SIFrameLowering::buildCFIForVGPRToVMEMSpill(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    const DebugLoc &DL, unsigned VGPR, int64_t Offset) const {
+  MachineFunction &MF = *MBB.getParent();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const MCRegisterInfo &MCRI = *MF.getMMI().getContext().getRegisterInfo();
+  int DwarfVGPR = MCRI.getDwarfRegNum(VGPR, false);
+
+  SmallString<20> CFIInst;
+  raw_svector_ostream OSCFIInst(CFIInst);
+  SmallString<20> Block;
+  raw_svector_ostream OSBlock(Block);
+
+  OSCFIInst << uint8_t(dwarf::DW_CFA_expression);
+  encodeULEB128(DwarfVGPR, OSCFIInst);
+
+  encodeDwarfRegisterLocation(DwarfVGPR, OSBlock);
+  OSBlock << uint8_t(dwarf::DW_OP_swap);
+  OSBlock << uint8_t(dwarf::DW_OP_LLVM_offset_uconst);
+  encodeULEB128(Offset, OSBlock);
+  OSBlock << uint8_t(dwarf::DW_OP_LLVM_call_frame_entry_reg);
+  encodeULEB128(MCRI.getDwarfRegNum(
+                    ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC, false),
+                OSBlock);
+  OSBlock << uint8_t(dwarf::DW_OP_deref_size);
+  OSBlock << uint8_t(ST.getWavefrontSize() / CHAR_BIT);
+  OSBlock << uint8_t(dwarf::DW_OP_LLVM_select_bit_piece);
+  // FIXME: Can this be a function of the VGPR?
+  const unsigned VGPRLaneBitSize = 32;
+  encodeULEB128(VGPRLaneBitSize, OSBlock);
+  encodeULEB128(ST.getWavefrontSize(), OSBlock);
+
+  encodeULEB128(Block.size(), OSCFIInst);
+  OSCFIInst << Block;
+
+  buildCFI(MBB, MBBI, DL,
+           MCCFIInstruction::createEscape(nullptr, OSCFIInst.str()));
+}
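For reference, the escape sequence built above can be reproduced outside of LLVM. Below is a minimal standalone sketch (not part of the patch; the helper and the hard-coded numbers are illustrative) that assembles the same DW_CFA_expression byte string for v40 spilled at a byte offset of 256 in a wave64 function. The AMD vendor opcode values (0xe4, 0xe6, 0xec) and the DWARF register numbers (v40 = 2600 and EXEC = 17 in the wave64 mapping) are taken from the test expectations in debug-frame.ll further down; encodeULEB128 here is a local stand-in for llvm/Support/LEB128.h.

#include <cstdint>
#include <cstdio>
#include <string>

// Local stand-in for llvm/Support/LEB128.h's encodeULEB128.
static void encodeULEB128(uint64_t Value, std::string &Out) {
  do {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7;
    if (Value != 0)
      Byte |= 0x80; // more bytes follow
    Out.push_back((char)Byte);
  } while (Value != 0);
}

int main() {
  const uint64_t DwarfVGPR = 2600; // v40 in the wave64 DWARF mapping
  const uint64_t DwarfEXEC = 17;   // EXEC in the wave64 DWARF mapping
  const uint64_t Offset = 256;     // 4-byte slot scaled by 64 lanes

  // The expression block: regx VGPR; swap; offset_uconst; EXEC as the
  // call-frame-entry register; deref 8 bytes; select 32-bit lanes x 64.
  std::string Block;
  Block.push_back((char)0x90); // DW_OP_regx
  encodeULEB128(DwarfVGPR, Block);
  Block.push_back((char)0x16); // DW_OP_swap
  Block.push_back((char)0xe4); // DW_OP_LLVM_offset_uconst (vendor)
  encodeULEB128(Offset, Block);
  Block.push_back((char)0xe6); // DW_OP_LLVM_call_frame_entry_reg (vendor)
  encodeULEB128(DwarfEXEC, Block);
  Block.push_back((char)0x94); // DW_OP_deref_size
  Block.push_back((char)8);    // wavefront size in bytes
  Block.push_back((char)0xec); // DW_OP_LLVM_select_bit_piece (vendor)
  encodeULEB128(32, Block);    // lane (element) size in bits
  encodeULEB128(64, Block);    // element count == wavefront size

  // Wrap the block in DW_CFA_expression <reg> <length>.
  std::string CFI;
  CFI.push_back((char)0x10); // DW_CFA_expression
  encodeULEB128(DwarfVGPR, CFI);
  encodeULEB128(Block.size(), CFI);
  CFI += Block;

  for (unsigned char C : CFI)
    printf("0x%02x, ", C);
  printf("\n");
}

Compiled and run, this prints 0x10, 0xa8, 0x14, 0x0e, 0x90, 0xa8, 0x14, 0x16, 0xe4, 0x80, 0x02, 0xe6, 0x11, 0x94, 0x08, 0xec, 0x20, 0x40, which is exactly the WAVE64 .cfi_escape sequence the test checks for v40.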
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -222,12 +222,27 @@
                              MachineBasicBlock::iterator I, const DebugLoc &DL,
                              Register SrcReg, int Value) const;
 
+private:
+  void storeRegToStackSlotImpl(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MI, Register SrcReg,
+                               bool isKill, int FrameIndex,
+                               const TargetRegisterClass *RC,
+                               const TargetRegisterInfo *TRI,
+                               bool NeedsCFI) const;
+
+public:
   void storeRegToStackSlot(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MI, Register SrcReg,
                            bool isKill, int FrameIndex,
                            const TargetRegisterClass *RC,
                            const TargetRegisterInfo *TRI) const override;
 
+  void storeRegToStackSlotCFI(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MI, Register SrcReg,
+                              bool isKill, int FrameIndex,
+                              const TargetRegisterClass *RC,
+                              const TargetRegisterInfo *TRI) const;
+
   void loadRegFromStackSlot(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MI, Register DestReg,
                             int FrameIndex, const TargetRegisterClass *RC,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1161,51 +1161,63 @@
   return get(getIndirectVGPRWritePseudoOpc(VecSize));
 }
 
-static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
+static unsigned getSGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
   switch (Size) {
   case 4:
-    return AMDGPU::SI_SPILL_S32_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_S32_CFI_SAVE : AMDGPU::SI_SPILL_S32_SAVE;
   case 8:
-    return AMDGPU::SI_SPILL_S64_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_S64_CFI_SAVE : AMDGPU::SI_SPILL_S64_SAVE;
   case 12:
-    return AMDGPU::SI_SPILL_S96_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_S96_CFI_SAVE : AMDGPU::SI_SPILL_S96_SAVE;
   case 16:
-    return AMDGPU::SI_SPILL_S128_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_S128_CFI_SAVE
+                    : AMDGPU::SI_SPILL_S128_SAVE;
   case 20:
-    return AMDGPU::SI_SPILL_S160_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_S160_CFI_SAVE
+                    : AMDGPU::SI_SPILL_S160_SAVE;
   case 24:
-    return AMDGPU::SI_SPILL_S192_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_S192_CFI_SAVE
+                    : AMDGPU::SI_SPILL_S192_SAVE;
   case 32:
-    return AMDGPU::SI_SPILL_S256_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_S256_CFI_SAVE
+                    : AMDGPU::SI_SPILL_S256_SAVE;
   case 64:
-    return AMDGPU::SI_SPILL_S512_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_S512_CFI_SAVE
+                    : AMDGPU::SI_SPILL_S512_SAVE;
   case 128:
-    return AMDGPU::SI_SPILL_S1024_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_S1024_CFI_SAVE
+                    : AMDGPU::SI_SPILL_S1024_SAVE;
   default:
     llvm_unreachable("unknown register size");
   }
 }
 
-static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
+static unsigned getVGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
   switch (Size) {
   case 4:
-    return AMDGPU::SI_SPILL_V32_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_V32_CFI_SAVE : AMDGPU::SI_SPILL_V32_SAVE;
   case 8:
-    return AMDGPU::SI_SPILL_V64_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_V64_CFI_SAVE : AMDGPU::SI_SPILL_V64_SAVE;
   case 12:
-    return AMDGPU::SI_SPILL_V96_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_V96_CFI_SAVE : AMDGPU::SI_SPILL_V96_SAVE;
   case 16:
-    return AMDGPU::SI_SPILL_V128_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_V128_CFI_SAVE
+                    : AMDGPU::SI_SPILL_V128_SAVE;
   case 20:
-    return AMDGPU::SI_SPILL_V160_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_V160_CFI_SAVE
+                    : AMDGPU::SI_SPILL_V160_SAVE;
   case 24:
-    return AMDGPU::SI_SPILL_V192_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_V192_CFI_SAVE
+                    : AMDGPU::SI_SPILL_V192_SAVE;
   case 32:
-    return AMDGPU::SI_SPILL_V256_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_V256_CFI_SAVE
+                    : AMDGPU::SI_SPILL_V256_SAVE;
   case 64:
-    return AMDGPU::SI_SPILL_V512_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_V512_CFI_SAVE
+                    : AMDGPU::SI_SPILL_V512_SAVE;
   case 128:
-    return AMDGPU::SI_SPILL_V1024_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_V1024_CFI_SAVE
+                    : AMDGPU::SI_SPILL_V1024_SAVE;
   default:
     llvm_unreachable("unknown register size");
   }
@@ -1228,12 +1240,10 @@
   }
 }
 
-void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
-                                      MachineBasicBlock::iterator MI,
-                                      Register SrcReg, bool isKill,
-                                      int FrameIndex,
-                                      const TargetRegisterClass *RC,
-                                      const TargetRegisterInfo *TRI) const {
+void SIInstrInfo::storeRegToStackSlotImpl(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
+    bool isKill, int FrameIndex, const TargetRegisterClass *RC,
+    const TargetRegisterInfo *TRI, bool NeedsCFI) const {
   MachineFunction *MF = MBB.getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
@@ -1254,7 +1264,8 @@
 
   // We are only allowed to create one new instruction when spilling
   // registers, so we need to use pseudo instruction for spilling SGPRs.
-  const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
+  const MCInstrDesc &OpDesc =
+      get(getSGPRSpillSaveOpcode(SpillSize, NeedsCFI));
 
   // The SGPR spill/restore instructions only work on number sgprs, so we need
   // to make sure we are using the correct register class.
@@ -1277,8 +1288,9 @@
     return;
   }
 
-  unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillSaveOpcode(SpillSize)
-                                    : getVGPRSpillSaveOpcode(SpillSize);
+  unsigned Opcode = RI.hasAGPRs(RC)
+                        ? getAGPRSpillSaveOpcode(SpillSize)
+                        : getVGPRSpillSaveOpcode(SpillSize, NeedsCFI);
   MFI->setHasSpilledVGPRs();
 
   auto MIB = BuildMI(MBB, MI, DL, get(Opcode));
@@ -1295,6 +1307,24 @@
       .addMemOperand(MMO);
 }
 
+void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator MI,
+                                      Register SrcReg, bool isKill,
+                                      int FrameIndex,
+                                      const TargetRegisterClass *RC,
+                                      const TargetRegisterInfo *TRI) const {
+  storeRegToStackSlotImpl(MBB, MI, SrcReg, isKill, FrameIndex, RC, TRI, false);
+}
+
+void SIInstrInfo::storeRegToStackSlotCFI(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator MI,
+                                         Register SrcReg, bool isKill,
+                                         int FrameIndex,
+                                         const TargetRegisterClass *RC,
+                                         const TargetRegisterInfo *TRI) const {
+  storeRegToStackSlotImpl(MBB, MI, SrcReg, isKill, FrameIndex, RC, TRI, true);
+}
+
 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
   switch (Size) {
   case 4:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -645,6 +645,13 @@
     let mayLoad = 0;
   }
 
+  def _CFI_SAVE : PseudoInstSI <
+    (outs),
+    (ins sgpr_class:$data, i32imm:$addr)> {
+    let mayStore = 1;
+    let mayLoad = 0;
+  }
+
   def _RESTORE : PseudoInstSI <
     (outs sgpr_class:$data),
     (ins i32imm:$addr)> {
@@ -683,6 +690,18 @@
     let Size = !if(!le(MaxSize, 256), MaxSize, 252);
   }
 
+  def _CFI_SAVE : VPseudoInstSI <
+    (outs),
+    (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc,
+         SReg_32:$soffset, i32imm:$offset)> {
+    let mayStore = 1;
+    let mayLoad = 0;
+    // (2 * 4) + (8 * num_subregs) bytes maximum
+    int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+    // Size field is unsigned char and cannot fit more.
+    let Size = !if(!le(MaxSize, 256), MaxSize, 252);
+  }
+
   def _RESTORE : VPseudoInstSI <
     (outs vgpr_class:$vdata),
     (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset,
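As a quick cross-check of the MaxSize arithmetic in the VGPR _CFI_SAVE pseudo above, assuming vgpr_class.Size is the register width in bits (so !srl(Size, 5) is the number of 32-bit subregisters), here is a small C++ mirror of the TableGen formula with a few spot checks; the helper name is ours, not the patch's:

// (2 * 4) + (8 * num_subregs) bytes, clamped to 252 because the
// pseudo's Size field is an unsigned char.
constexpr unsigned cfiSaveMaxSize(unsigned RegBits) {
  return ((RegBits >> 5) << 3) + 8 <= 256 ? ((RegBits >> 5) << 3) + 8 : 252;
}

static_assert(cfiSaveMaxSize(32) == 16, "V32: one 32-bit subregister");
static_assert(cfiSaveMaxSize(256) == 72, "V256: eight subregisters");
static_assert(cfiSaveMaxSize(1024) == 252, "V1024: 264 is clamped to 252");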
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -89,7 +89,7 @@
                            ArrayRef<CalleeSavedInfo> CSI, LiveIntervals *LIS) {
   MachineFunction &MF = *SaveBlock.getParent();
 
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const SIInstrInfo &TII = *MF.getSubtarget<GCNSubtarget>().getInstrInfo();
   const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
 
@@ -103,8 +103,8 @@
       const TargetRegisterClass *RC =
           TRI->getMinimalPhysRegClass(Reg, MVT::i32);
 
-      TII.storeRegToStackSlot(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC,
-                              TRI);
+      TII.storeRegToStackSlotCFI(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC,
+                                 TRI);
 
       if (LIS) {
         assert(std::distance(MIS.begin(), I) == 1);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -109,9 +109,8 @@
                        bool IsLoad) const;
 
   /// If \p OnlyToVGPR is true, this will only succeed if this
-  bool spillSGPR(MachineBasicBlock::iterator MI,
-                 int FI, RegScavenger *RS,
-                 bool OnlyToVGPR = false) const;
+  bool spillSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
+                 bool OnlyToVGPR = false, bool NeedsCFI = false) const;
 
   bool restoreSGPR(MachineBasicBlock::iterator MI,
                    int FI, RegScavenger *RS,
@@ -326,16 +325,12 @@
   ArrayRef<MCPhysReg> getAllVGPR32(const MachineFunction &MF) const;
 
 private:
-  void buildSpillLoadStore(MachineBasicBlock::iterator MI,
-                           unsigned LoadStoreOp,
-                           int Index,
-                           Register ValueReg,
-                           bool ValueIsKill,
+  void buildSpillLoadStore(MachineBasicBlock::iterator MI, unsigned LoadStoreOp,
+                           int Index, Register ValueReg, bool ValueIsKill,
                            MCRegister ScratchRsrcReg,
-                           MCRegister ScratchOffsetReg,
-                           int64_t InstrOffset,
-                           MachineMemOperand *MMO,
-                           RegScavenger *RS) const;
+                           MCRegister ScratchOffsetReg, int64_t InstrOffset,
+                           MachineMemOperand *MMO, RegScavenger *RS,
+                           bool NeedsCFI = false) const;
 };
 
 } // End namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -14,14 +14,15 @@
 #include "SIRegisterInfo.h"
 #include "AMDGPURegisterBankInfo.h"
 #include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
 #include "MCTargetDesc/AMDGPUInstPrinter.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/IR/Function.h"
@@ -538,22 +539,28 @@
   switch (Op) {
   case AMDGPU::SI_SPILL_S1024_SAVE:
+  case AMDGPU::SI_SPILL_S1024_CFI_SAVE:
   case AMDGPU::SI_SPILL_S1024_RESTORE:
   case AMDGPU::SI_SPILL_V1024_SAVE:
+  case AMDGPU::SI_SPILL_V1024_CFI_SAVE:
   case AMDGPU::SI_SPILL_V1024_RESTORE:
   case AMDGPU::SI_SPILL_A1024_SAVE:
   case AMDGPU::SI_SPILL_A1024_RESTORE:
     return 32;
   case AMDGPU::SI_SPILL_S512_SAVE:
+  case AMDGPU::SI_SPILL_S512_CFI_SAVE:
   case AMDGPU::SI_SPILL_S512_RESTORE:
   case AMDGPU::SI_SPILL_V512_SAVE:
+  case AMDGPU::SI_SPILL_V512_CFI_SAVE:
   case AMDGPU::SI_SPILL_V512_RESTORE:
   case AMDGPU::SI_SPILL_A512_SAVE:
   case AMDGPU::SI_SPILL_A512_RESTORE:
     return 16;
   case AMDGPU::SI_SPILL_S256_SAVE:
+  case AMDGPU::SI_SPILL_S256_CFI_SAVE:
   case AMDGPU::SI_SPILL_S256_RESTORE:
   case AMDGPU::SI_SPILL_V256_SAVE:
+  case AMDGPU::SI_SPILL_V256_CFI_SAVE:
   case AMDGPU::SI_SPILL_V256_RESTORE:
     return 8;
   case AMDGPU::SI_SPILL_S192_SAVE:
@@ -562,32 +569,42 @@
   case AMDGPU::SI_SPILL_V192_RESTORE:
     return 6;
   case AMDGPU::SI_SPILL_S160_SAVE:
+  case AMDGPU::SI_SPILL_S160_CFI_SAVE:
   case AMDGPU::SI_SPILL_S160_RESTORE:
   case AMDGPU::SI_SPILL_V160_SAVE:
+  case AMDGPU::SI_SPILL_V160_CFI_SAVE:
   case AMDGPU::SI_SPILL_V160_RESTORE:
     return 5;
   case AMDGPU::SI_SPILL_S128_SAVE:
+  case AMDGPU::SI_SPILL_S128_CFI_SAVE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
   case AMDGPU::SI_SPILL_V128_SAVE:
+  case AMDGPU::SI_SPILL_V128_CFI_SAVE:
   case AMDGPU::SI_SPILL_V128_RESTORE:
   case AMDGPU::SI_SPILL_A128_SAVE:
   case AMDGPU::SI_SPILL_A128_RESTORE:
     return 4;
   case AMDGPU::SI_SPILL_S96_SAVE:
+  case AMDGPU::SI_SPILL_S96_CFI_SAVE:
   case AMDGPU::SI_SPILL_S96_RESTORE:
   case AMDGPU::SI_SPILL_V96_SAVE:
+  case AMDGPU::SI_SPILL_V96_CFI_SAVE:
   case AMDGPU::SI_SPILL_V96_RESTORE:
     return 3;
   case AMDGPU::SI_SPILL_S64_SAVE:
+  case AMDGPU::SI_SPILL_S64_CFI_SAVE:
   case AMDGPU::SI_SPILL_S64_RESTORE:
   case AMDGPU::SI_SPILL_V64_SAVE:
+  case AMDGPU::SI_SPILL_V64_CFI_SAVE:
   case AMDGPU::SI_SPILL_V64_RESTORE:
   case AMDGPU::SI_SPILL_A64_SAVE:
   case AMDGPU::SI_SPILL_A64_RESTORE:
     return 2;
   case AMDGPU::SI_SPILL_S32_SAVE:
+  case AMDGPU::SI_SPILL_S32_CFI_SAVE:
   case AMDGPU::SI_SPILL_S32_RESTORE:
   case AMDGPU::SI_SPILL_V32_SAVE:
+  case AMDGPU::SI_SPILL_V32_CFI_SAVE:
   case AMDGPU::SI_SPILL_V32_RESTORE:
   case AMDGPU::SI_SPILL_A32_SAVE:
   case AMDGPU::SI_SPILL_A32_RESTORE:
@@ -721,20 +738,16 @@
   return true;
 }
 
-void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
-                                         unsigned LoadStoreOp,
-                                         int Index,
-                                         Register ValueReg,
-                                         bool IsKill,
-                                         MCRegister ScratchRsrcReg,
-                                         MCRegister ScratchOffsetReg,
-                                         int64_t InstOffset,
-                                         MachineMemOperand *MMO,
-                                         RegScavenger *RS) const {
+void SIRegisterInfo::buildSpillLoadStore(
+    MachineBasicBlock::iterator MI, unsigned LoadStoreOp, int Index,
+    Register ValueReg, bool IsKill, MCRegister ScratchRsrcReg,
+    MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
+    RegScavenger *RS, bool NeedsCFI) const {
   MachineBasicBlock *MBB = MI->getParent();
   MachineFunction *MF = MI->getParent()->getParent();
   const SIInstrInfo *TII = ST.getInstrInfo();
   const MachineFrameInfo &MFI = MF->getFrameInfo();
+  const SIFrameLowering *TFL = ST.getFrameLowering();
   const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
 
   const MCInstrDesc &Desc = TII->get(LoadStoreOp);
@@ -750,6 +763,7 @@
   unsigned Size = NumSubRegs * EltSize;
   int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
   int64_t ScratchOffsetRegDelta = 0;
+  int64_t AdditionalCFIOffset = 0;
 
   Align Alignment = MFI.getObjectAlign(Index);
   const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
@@ -792,6 +806,8 @@
     if (!SOffset)
       report_fatal_error("could not scavenge SGPR to spill in entry function");
 
+    AdditionalCFIOffset = Offset;
+
     if (ScratchOffsetReg == AMDGPU::NoRegister) {
       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset)
           .addImm(Offset);
@@ -850,6 +866,11 @@
         .addImm(0) // swz
         .addMemOperand(NewMMO);
 
+    if (IsStore && NeedsCFI)
+      TFL->buildCFIForVGPRToVMEMSpill(*MBB, MI, DL, SubReg,
+                                      Offset * ST.getWavefrontSize() +
+                                          AdditionalCFIOffset);
+
     if (!IsStore && TmpReg != AMDGPU::NoRegister)
       MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32),
                     FinalReg)
@@ -980,14 +1001,14 @@
   }
 }
 
-bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
-                               int Index,
-                               RegScavenger *RS,
-                               bool OnlyToVGPR) const {
+bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
+                               RegScavenger *RS, bool OnlyToVGPR,
+                               bool NeedsCFI) const {
   MachineBasicBlock *MBB = MI->getParent();
   MachineFunction *MF = MBB->getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   DenseSet<unsigned> SGPRSpillVGPRDefinedSet;
+  const SIFrameLowering *TFL = ST.getFrameLowering();
 
   ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills =
       MFI->getSGPRToVGPRSpills(Index);
@@ -1036,6 +1057,10 @@
               .addImm(Spill.Lane)
               .addReg(Spill.VGPR, VGPRDefined ? 0 : RegState::Undef);
 
+      if (NeedsCFI)
+        TFL->buildCFIForSGPRToVGPRSpill(*MBB, MI, DL, SubReg, Spill.VGPR,
+                                        Spill.Lane);
+
       // FIXME: Since this spills to another register instead of an actual
       // frame index, we should delete the frame index when all references to
       // it are fixed.
@@ -1086,6 +1111,8 @@
 
       // Write out VGPR
       buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
                               RS, false);
+
+      // TODO: Implement CFI for SpillToVMEM if/when it is fully supported.
     }
   }
@@ -1181,7 +1208,18 @@
   MachineBasicBlock::iterator MI,
   int FI,
   RegScavenger *RS) const {
+  bool NeedsCFI = false;
   switch (MI->getOpcode()) {
+  case AMDGPU::SI_SPILL_S1024_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S512_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S256_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S160_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S128_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S96_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S64_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S32_CFI_SAVE:
+    NeedsCFI = true;
+    LLVM_FALLTHROUGH;
   case AMDGPU::SI_SPILL_S1024_SAVE:
   case AMDGPU::SI_SPILL_S512_SAVE:
   case AMDGPU::SI_SPILL_S256_SAVE:
@@ -1191,7 +1229,7 @@
   case AMDGPU::SI_SPILL_S96_SAVE:
   case AMDGPU::SI_SPILL_S64_SAVE:
   case AMDGPU::SI_SPILL_S32_SAVE:
-    return spillSGPR(MI, FI, RS, true);
+    return spillSGPR(MI, FI, RS, true, NeedsCFI);
   case AMDGPU::SI_SPILL_S1024_RESTORE:
   case AMDGPU::SI_SPILL_S512_RESTORE:
   case AMDGPU::SI_SPILL_S256_RESTORE:
@@ -1226,8 +1264,21 @@
                         ? getBaseRegister()
                         : getFrameRegister(*MF);
 
+  bool NeedsCFI = false;
+
   switch (MI->getOpcode()) {
     // SGPR register spill
+  case AMDGPU::SI_SPILL_S1024_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S512_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S256_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S160_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S128_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S96_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S64_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S32_CFI_SAVE: {
+    NeedsCFI = true;
+    LLVM_FALLTHROUGH;
+  }
   case AMDGPU::SI_SPILL_S1024_SAVE:
   case AMDGPU::SI_SPILL_S512_SAVE:
   case AMDGPU::SI_SPILL_S256_SAVE:
@@ -1256,6 +1307,16 @@
   }
 
     // VGPR register spill
+  case AMDGPU::SI_SPILL_V1024_CFI_SAVE:
+  case AMDGPU::SI_SPILL_V512_CFI_SAVE:
+  case AMDGPU::SI_SPILL_V256_CFI_SAVE:
+  case AMDGPU::SI_SPILL_V160_CFI_SAVE:
+  case AMDGPU::SI_SPILL_V128_CFI_SAVE:
+  case AMDGPU::SI_SPILL_V96_CFI_SAVE:
+  case AMDGPU::SI_SPILL_V64_CFI_SAVE:
+  case AMDGPU::SI_SPILL_V32_CFI_SAVE:
+    NeedsCFI = true;
+    LLVM_FALLTHROUGH;
   case AMDGPU::SI_SPILL_V1024_SAVE:
   case AMDGPU::SI_SPILL_V512_SAVE:
   case AMDGPU::SI_SPILL_V256_SAVE:
@@ -1274,14 +1335,12 @@
       assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
              MFI->getStackPtrOffsetReg());
 
-      buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
-                          Index,
-                          VData->getReg(), VData->isKill(),
-                          TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
-                          FrameReg,
-                          TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
-                          *MI->memoperands_begin(),
-                          RS);
+      buildSpillLoadStore(
+          MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, Index, VData->getReg(),
+          VData->isKill(),
+          TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), FrameReg,
+          TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
+          *MI->memoperands_begin(), RS, NeedsCFI);
       MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
       MI->eraseFromParent();
       break;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
@@ -294,8 +294,6 @@
 ; GFX10-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_mov_b32 s33, s6
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_mov_b32 s33, s6
diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -205,18 +205,18 @@
 ; TODO: Can the SP inc/dec be removed?
 ; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_csr_vgpr:
 ; GCN: s_waitcnt
-; GCN-NEXT:s_mov_b32 [[FP_COPY:s[0-9]+]], s33
+; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
 ; GCN-NEXT: s_mov_b32 s33, s32
+; GCN-NEXT: s_add_u32 s32, s32, 0x300
+; GCN: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
-; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:8
+; GCN: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:8
 
 ; GCN: ;;#ASMSTART
 ; GCN-NEXT: ; clobber v41
 ; GCN-NEXT: ;;#ASMEND
 
 ; GCN: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN: s_add_u32 s32, s32, 0x300
 ; GCN-NEXT: s_sub_u32 s32, s32, 0x300
 ; GCN-NEXT: s_mov_b32 s33, s4
 ; GCN-NEXT: s_waitcnt vmcnt(0)
@@ -233,14 +233,14 @@
 ; GCN: s_waitcnt
 ; GCN-NEXT: v_writelane_b32 v1, s33, 63
 ; GCN-NEXT: s_mov_b32 s33, s32
+; GCN-NEXT: s_add_u32 s32, s32, 0x300
 ; GCN: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-COUNT-63: v_writelane_b32 v1
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:8
 
 ; GCN: ;;#ASMSTART
 ; GCN-COUNT-63: v_readlane_b32 s{{[0-9]+}}, v1
-; GCN: s_add_u32 s32, s32, 0x300
-; GCN-NEXT: s_sub_u32 s32, s32, 0x300
+; GCN: s_sub_u32 s32, s32, 0x300
 ; GCN-NEXT: v_readlane_b32 s33, v1, 63
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
@@ -265,6 +265,7 @@
 ; GCN: s_waitcnt
 ; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
 ; GCN-NEXT: s_mov_b32 s33, s32
+; GCN-NEXT: s_add_u32 s32, s32, 0x300
 ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 
 ; GCN-COUNT-64: v_writelane_b32 v1,
@@ -273,8 +274,7 @@
 ; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1
 
 ; GCN: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN: s_add_u32 s32, s32, 0x300
-; GCN-NEXT: s_sub_u32 s32, s32, 0x300
+; GCN: s_sub_u32 s32, s32, 0x300
 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
diff --git a/llvm/test/CodeGen/AMDGPU/debug-frame.ll b/llvm/test/CodeGen/AMDGPU/debug-frame.ll
--- a/llvm/test/CodeGen/AMDGPU/debug-frame.ll
+++ b/llvm/test/CodeGen/AMDGPU/debug-frame.ll
@@ -522,6 +522,103 @@
   ret void
 }
 
+
+; CHECK-LABEL: func_spill_vgpr_to_vmem:
+; CHECK: .cfi_startproc
+
+; CHECK-NOT: .cfi_{{.*}}
+
+; CHECK: %bb.0:
+; SGPR32 = 64
+; CHECK-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6
+; CHECK-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04
+
+; CHECK-NOT: .cfi_{{.*}}
+
+; CHECK: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+
+; DW_CFA_expression [0x10]
+; VGPR40_wave64 ULEB128(2600)=[0xa8, 0x14]
+; BLOCK_LENGTH ULEB128(14)=[0x0e]
+; DW_OP_regx [0x90]
+; VGPR40_wave64 ULEB128(2600)=[0xa8, 0x14]
+; DW_OP_swap [0x16]
+; DW_OP_LLVM_offset_uconst [0xe4]
+; OFFSET ULEB128(256)=[0x80, 0x02]
+; DW_OP_LLVM_call_frame_entry_reg [0xe6]
+; EXEC_MASK_wave64 ULEB128(17)=[0x11]
+; DW_OP_deref_size [0x94]
+; SIZE [0x08]
+; DW_OP_LLVM_select_bit_piece [0xec]
+; ELEMENT_SIZE [0x20]
+; ELEMENT_COUNT [0x40]
+; WAVE64-NEXT: .cfi_escape 0x10, 0xa8, 0x14, 0x0e, 0x90, 0xa8, 0x14, 0x16, 0xe4, 0x80, 0x02, 0xe6, 0x11, 0x94, 0x08, 0xec, 0x20, 0x40
+
+; DW_CFA_expression [0x10]
+; VGPR40_wave32 ULEB128(1576)=[0xa8, 0x0c]
+; BLOCK_LENGTH ULEB128(14)=[0x0e]
+; DW_OP_regx [0x90]
+; VGPR40_wave32 ULEB128(1576)=[0xa8, 0x0c]
+; DW_OP_swap [0x16]
+; DW_OP_LLVM_offset_uconst [0xe4]
+; OFFSET ULEB128(128)=[0x80, 0x01]
+; DW_OP_LLVM_call_frame_entry_reg [0xe6]
+; EXEC_MASK_wave32 ULEB128(1)=[0x01]
+; DW_OP_deref_size [0x94]
+; SIZE [0x04]
+; DW_OP_LLVM_select_bit_piece [0xec]
+; ELEMENT_SIZE [0x20]
+; ELEMENT_COUNT [0x20]
+; WAVE32-NEXT: .cfi_escape 0x10, 0xa8, 0x0c, 0x0e, 0x90, 0xa8, 0x0c, 0x16, 0xe4, 0x80, 0x01, 0xe6, 0x01, 0x94, 0x04, 0xec, 0x20, 0x20
+
+; CHECK-NOT: .cfi_{{.*}}
+
+; CHECK: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill
+
+; DW_CFA_expression [0x10]
+; VGPR41_wave64 ULEB128(2601)=[0xa9, 0x14]
+; BLOCK_LENGTH ULEB128(13)=[0x0d]
+; DW_OP_regx [0x90]
+; VGPR41_wave64 ULEB128(2601)=[0xa9, 0x14]
+; DW_OP_swap [0x16]
+; DW_OP_LLVM_offset_uconst [0xe4]
+; OFFSET ULEB128(0)=[0x00]
+; DW_OP_LLVM_call_frame_entry_reg [0xe6]
+; EXEC_MASK_wave64 ULEB128(17)=[0x11]
+; DW_OP_deref_size [0x94]
+; SIZE [0x08]
+; DW_OP_LLVM_select_bit_piece [0xec]
+; ELEMENT_SIZE [0x20]
+; ELEMENT_COUNT [0x40]
+; WAVE64-NEXT: .cfi_escape 0x10, 0xa9, 0x14, 0x0d, 0x90, 0xa9, 0x14, 0x16, 0xe4, 0x00, 0xe6, 0x11, 0x94, 0x08, 0xec, 0x20, 0x40
+
+; DW_CFA_expression [0x10]
+; VGPR41_wave32 ULEB128(1577)=[0xa9, 0x0c]
+; BLOCK_LENGTH ULEB128(13)=[0x0d]
+; DW_OP_regx [0x90]
+; VGPR41_wave32 ULEB128(1577)=[0xa9, 0x0c]
+; DW_OP_swap [0x16]
+; DW_OP_LLVM_offset_uconst [0xe4]
+; OFFSET ULEB128(0)=[0x00]
+; DW_OP_LLVM_call_frame_entry_reg [0xe6]
+; EXEC_MASK_wave32 ULEB128(1)=[0x01]
+; DW_OP_deref_size [0x94]
+; SIZE [0x04]
+; DW_OP_LLVM_select_bit_piece [0xec]
+; ELEMENT_SIZE [0x20]
+; ELEMENT_COUNT [0x20]
+; WAVE32-NEXT: .cfi_escape 0x10, 0xa9, 0x0c, 0x0d, 0x90, 0xa9, 0x0c, 0x16, 0xe4, 0x00, 0xe6, 0x01, 0x94, 0x04, 0xec, 0x20, 0x20
+
+; CHECK-NOT: .cfi_{{.*}}
+
+; CHECK: .cfi_endproc
+define hidden void @func_spill_vgpr_to_vmem() #0 {
+entry:
+  call void asm sideeffect "; clobber", "~{v40}"() #0
+  call void asm sideeffect "; clobber", "~{v41}"() #0
+  ret void
+}
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind "frame-pointer"="all" }
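The DWARF register numbers in the byte-by-byte comments above are easy to verify by hand. Here is a short, self-contained sketch (illustrative only, not part of the patch) that decodes the ULEB128 operands appearing in the .cfi_escape strings:

#include <cstdint>
#include <cstdio>
#include <initializer_list>

// ULEB128: 7 data bits per byte, least-significant group first,
// high bit set on every byte except the last.
static uint64_t decodeULEB128(std::initializer_list<uint8_t> Bytes) {
  uint64_t Value = 0;
  unsigned Shift = 0;
  for (uint8_t B : Bytes) {
    Value |= (uint64_t)(B & 0x7f) << Shift;
    Shift += 7;
    if ((B & 0x80) == 0)
      break;
  }
  return Value;
}

int main() {
  // 2600: v40 in the wave64 DWARF register mapping.
  printf("%llu\n", (unsigned long long)decodeULEB128({0xa8, 0x14}));
  // 1576: v40 in the wave32 DWARF register mapping.
  printf("%llu\n", (unsigned long long)decodeULEB128({0xa8, 0x0c}));
  // 256: the wave64 spill offset for v40 (a 4-byte slot times 64 lanes).
  printf("%llu\n", (unsigned long long)decodeULEB128({0x80, 0x02}));
}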
diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -192,15 +192,14 @@
 ; GFX9-NEXT:    v_writelane_b32 v43, s33, 4
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_add_u32 s32, s32, 0x800
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_writelane_b32 v43, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v43, s35, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, foo@gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, foo@gotpcrel32@hi+4
-; GFX9-NEXT:    v_writelane_b32 v43, s35, 1
 ; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v40, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v41, v0
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills.mir b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills.mir
--- a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills.mir
@@ -2,9 +2,13 @@
 
 # CHECK-LABEL: name: empty_entry_block
 # CHECK: V_WRITELANE
+# CHECK-NEXT: CFI_INSTRUCTION
 # CHECK-NEXT: V_WRITELANE
+# CHECK-NEXT: CFI_INSTRUCTION
 # CHECK-NEXT: V_WRITELANE
+# CHECK-NEXT: CFI_INSTRUCTION
 # CHECK-NEXT: V_WRITELANE
+# CHECK-NEXT: CFI_INSTRUCTION
 # CHECK: V_READLANE
 # CHECK-NEXT: V_READLANE
 # CHECK-NEXT: V_READLANE