diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -71,12 +71,22 @@ Register PreloadedPrivateBufferReg, Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const; + void emitPrologueEntryCFI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL) const; + public: bool hasFP(const MachineFunction &MF) const override; /// Create a CFI index for CFIInst and build a MachineInstr around it. void buildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const MCCFIInstruction &CFIInst) const; + /// Create a CFI index describing a spill of an SGPR to a single lane of + /// a VGPR and build a MachineInstr around it. + void buildCFIForSGPRToVGPRSpill(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, const Register SGPR, + const Register VGPR, const int Lane) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -13,6 +13,7 @@ #include "SIRegisterInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -20,6 +21,7 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/MC/MCDwarf.h" +#include "llvm/Support/LEB128.h" using namespace llvm; @@ -602,6 +604,64 @@ llvm_unreachable("Invalid TargetStackID::Value"); } +void SIFrameLowering::emitPrologueEntryCFI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL) const { + const MachineFunction &MF = *MBB.getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const MCRegisterInfo *MCRI = MF.getMMI().getContext().getRegisterInfo(); + Register StackPtrReg = + MF.getInfo<SIMachineFunctionInfo>()->getStackPtrOffsetReg(); + + // DW_ASPACE_AMDGPU_private_wave FIXME: should be defined elsewhere + buildCFI(MBB, MBBI, DL, + MCCFIInstruction::createLLVMDefAspaceCfa( + nullptr, MCRI->getDwarfRegNum(StackPtrReg, false), 0, 6)); + + static const char PCEncodedInst[] = { + dwarf::DW_CFA_expression, + 16, // PC 64 + 8, // length + static_cast<char>(dwarf::DW_OP_regx), + 62, // SGPR30 + static_cast<char>(dwarf::DW_OP_piece), + 4, // 32 bits + static_cast<char>(dwarf::DW_OP_regx), + 63, // SGPR31 + static_cast<char>(dwarf::DW_OP_piece), + 4 // 32 bits + }; + buildCFI(MBB, MBBI, DL, + MCCFIInstruction::createEscape( + nullptr, StringRef(PCEncodedInst, sizeof(PCEncodedInst)))); + + static const MCPhysReg CallerSavedRegs[] = { + AMDGPU::VGPR0, AMDGPU::VGPR1, AMDGPU::VGPR2, AMDGPU::VGPR3, + AMDGPU::VGPR4, AMDGPU::VGPR5, AMDGPU::VGPR6, AMDGPU::VGPR7, + AMDGPU::VGPR8, AMDGPU::VGPR9, AMDGPU::VGPR10, AMDGPU::VGPR11, + AMDGPU::VGPR12, AMDGPU::VGPR13, AMDGPU::VGPR14, AMDGPU::VGPR15, + AMDGPU::VGPR16, AMDGPU::VGPR17, AMDGPU::VGPR18, AMDGPU::VGPR19, + AMDGPU::VGPR20, AMDGPU::VGPR21, AMDGPU::VGPR22, AMDGPU::VGPR23, + AMDGPU::VGPR24, AMDGPU::VGPR25, AMDGPU::VGPR26, AMDGPU::VGPR27, + AMDGPU::VGPR28, AMDGPU::VGPR29, AMDGPU::VGPR30, AMDGPU::VGPR31, + AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3, + AMDGPU::SGPR4, AMDGPU::SGPR5, AMDGPU::SGPR6, AMDGPU::SGPR7, + AMDGPU::SGPR8, AMDGPU::SGPR9, AMDGPU::SGPR10, AMDGPU::SGPR11, + AMDGPU::SGPR12, AMDGPU::SGPR13, AMDGPU::SGPR14,
AMDGPU::SGPR15, + AMDGPU::SGPR16, AMDGPU::SGPR17, AMDGPU::SGPR18, AMDGPU::SGPR19, + AMDGPU::SGPR20, AMDGPU::SGPR21, AMDGPU::SGPR22, AMDGPU::SGPR23, + AMDGPU::SGPR24, AMDGPU::SGPR25, AMDGPU::SGPR26, AMDGPU::SGPR27, + AMDGPU::SGPR28, AMDGPU::SGPR29, AMDGPU::SGPR30, AMDGPU::SGPR31, + AMDGPU::NoRegister}; + for (int I = 0; CallerSavedRegs[I]; ++I) { + if (!MRI.isPhysRegModified(CallerSavedRegs[I])) + continue; + MCRegister DwarfReg = MCRI->getDwarfRegNum(CallerSavedRegs[I], false); + buildCFI(MBB, MBBI, DL, + MCCFIInstruction::createUndefined(nullptr, DwarfReg)); + } +}; + void SIFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); @@ -615,6 +675,7 @@ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); + const MCRegisterInfo *MCRI = MF.getMMI().getContext().getRegisterInfo(); Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); Register FramePtrReg = FuncInfo->getFrameOffsetReg(); @@ -630,11 +691,18 @@ // turn on all lanes before doing the spill to memory. Register ScratchExecCopy; + emitPrologueEntryCFI(MBB, MBBI, DL); + // Emit the copy if we need an FP, and are using a free SGPR to save it. if (FuncInfo->SGPRForFPSaveRestoreCopy) { BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->SGPRForFPSaveRestoreCopy) .addReg(FramePtrReg) .setMIFlag(MachineInstr::FrameSetup); + buildCFI( + MBB, MBBI, DL, + MCCFIInstruction::createRegister( + nullptr, MCRI->getDwarfRegNum(FramePtrReg, false), + MCRI->getDwarfRegNum(FuncInfo->SGPRForFPSaveRestoreCopy, false))); } for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg @@ -662,10 +730,16 @@ .addImm(-1); } + int FI = Reg.FI.getValue(); + buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR, - FuncInfo->getScratchRSrcReg(), - StackPtrReg, - Reg.FI.getValue()); + FuncInfo->getScratchRSrcReg(), StackPtrReg, FI); + + // We spill the entire VGPR, so we can get away with just cfi_offset + buildCFI(MBB, MBBI, DL, + MCCFIInstruction::createOffset( + nullptr, MCRI->getDwarfRegNum(Reg.VGPR, false), + MFI.getObjectOffset(FI) * ST.getWavefrontSize())); } if (ScratchExecCopy != AMDGPU::NoRegister) { @@ -693,6 +767,9 @@ .addReg(FramePtrReg) .addImm(Spill[0].Lane) .addReg(Spill[0].VGPR, RegState::Undef); + + buildCFIForSGPRToVGPRSpill(MBB, MBBI, DL, FramePtrReg, Spill[0].VGPR, + Spill[0].Lane); } if (TRI.needsStackRealignment(MF)) { @@ -732,6 +809,12 @@ .setMIFlag(MachineInstr::FrameSetup); } + if (HasFP) { + buildCFI(MBB, MBBI, DL, + MCCFIInstruction::createDefCfaRegister( + nullptr, MCRI->getDwarfRegNum(FramePtrReg, false))); + } + if (HasFP && RoundedSize != 0) { BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg) .addReg(StackPtrReg) @@ -757,6 +840,7 @@ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); + const MCRegisterInfo *MCRI = MF.getMMI().getContext().getRegisterInfo(); MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); LivePhysRegs LiveRegs; DebugLoc DL; @@ -766,6 +850,8 @@ uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
NumBytes + MFI.getMaxAlign().value() : NumBytes; + const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); + const Register FramePtrReg = FuncInfo->getFrameOffsetReg(); if (RoundedSize != 0 && hasFP(MF)) { const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); @@ -776,9 +862,9 @@ } if (FuncInfo->SGPRForFPSaveRestoreCopy) { - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->getFrameOffsetReg()) - .addReg(FuncInfo->SGPRForFPSaveRestoreCopy) - .setMIFlag(MachineInstr::FrameSetup); + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) + .addReg(FuncInfo->SGPRForFPSaveRestoreCopy) + .setMIFlag(MachineInstr::FrameSetup); } if (FuncInfo->FramePointerSaveIndex) { @@ -791,9 +877,15 @@ = FuncInfo->getSGPRToVGPRSpills(FI); assert(Spill.size() == 1); BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), - FuncInfo->getFrameOffsetReg()) - .addReg(Spill[0].VGPR) - .addImm(Spill[0].Lane); + FramePtrReg) + .addReg(Spill[0].VGPR) + .addImm(Spill[0].Lane); + } + + if (hasFP(MF)) { + buildCFI(MBB, MBBI, DL, + MCCFIInstruction::createDefCfaRegister( + nullptr, MCRI->getDwarfRegNum(StackPtrReg, false))); } Register ScratchExecCopy; @@ -1081,3 +1173,56 @@ .addCFIIndex(MF.addFrameInst(CFIInst)) .setMIFlag(MachineInstr::FrameSetup); } + +static void encodeDwarfRegisterLocation(int DwarfReg, raw_ostream &OS) { + if (DwarfReg < 32) { + OS << uint8_t(dwarf::DW_OP_reg0 + DwarfReg); + } else { + OS << uint8_t(dwarf::DW_OP_regx); + encodeULEB128(DwarfReg, OS); + } +} + +void SIFrameLowering::buildCFIForSGPRToVGPRSpill( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, const Register SGPR, const Register VGPR, + const int Lane) const { + MachineFunction &MF = *MBB.getParent(); + const MCRegisterInfo &MCRI = *MF.getMMI().getContext().getRegisterInfo(); + int DwarfSGPR = MCRI.getDwarfRegNum(SGPR, false); + int DwarfVGPR = MCRI.getDwarfRegNum(VGPR, false); + + // CFI for an SGPR spilled to a single lane of a VGPR is implemented as an + // expression(E) rule where E is a register location description referencing + // a VGPR register location storage at a byte offset of the lane index + // multiplied by the size of an SGPR (4 bytes). In other words we generate + // the following DWARF: + // + // DW_CFA_expression: <SGPR>, + // (DW_OP_regx <VGPR>) (DW_OP_LLVM_offset_uconst <Lane>*4) + // + // The memory location description for the current CFA is pushed on the + // stack before E is evaluated, but we choose not to drop it as it would + // require a longer expression E and DWARF defines the result of the + // evaluation to be the location description on the top of the stack (i.e. the + // implicitly pushed one is just ignored.)
+ SmallString<20> CFIInst; + raw_svector_ostream OSCFIInst(CFIInst); + SmallString<20> Block; + raw_svector_ostream OSBlock(Block); + + OSCFIInst << uint8_t(dwarf::DW_CFA_expression); + encodeULEB128(DwarfSGPR, OSCFIInst); + + encodeDwarfRegisterLocation(DwarfVGPR, OSBlock); + OSBlock << uint8_t(dwarf::DW_OP_LLVM_offset_uconst); + // FIXME: + const unsigned SGPRByteSize = 4; + encodeULEB128(Lane * SGPRByteSize, OSBlock); + + encodeULEB128(Block.size(), OSCFIInst); + OSCFIInst << Block; + + buildCFI(MBB, MBBI, DL, + MCCFIInstruction::createEscape(nullptr, OSCFIInst.str())); +} diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -196,7 +196,7 @@ ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: v_readlane_b32 s42, v0, 0 ; GCN-NEXT: s_setpc_b64 -define void @spill_only_csr_sgpr() { +define void @spill_only_csr_sgpr() #0 { call void asm sideeffect "; clobber s42", "~{s42}"() ret void } @@ -295,8 +295,8 @@ ; GCN-LABEL: {{^}}realign_stack_no_fp_elim: ; GCN: s_waitcnt -; GCN-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x7ffc0 ; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x7ffc0 ; GCN-NEXT: s_and_b32 s33, [[SCRATCH]], 0xfff80000 ; GCN-NEXT: s_add_u32 s32, s32, 0x100000 ; GCN-NEXT: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 @@ -314,14 +314,14 @@ ; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp: ; GCN: s_waitcnt ; GCN-NEXT: v_writelane_b32 v1, s33, 2 -; GCN-NEXT: v_writelane_b32 v1, s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: v_writelane_b32 v1, s30, 0 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; GCN: v_writelane_b32 v1, s31, 1 ; GCN: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4 ; GCN: ;;#ASMSTART -; GCN: v_readlane_b32 s4, v1, 0 -; GCN-NEXT: s_add_u32 s32, s32, 0x200 +; GCN: s_add_u32 s32, s32, 0x200 +; GCN-NEXT: v_readlane_b32 s4, v1, 0 ; GCN-NEXT: v_readlane_b32 s5, v1, 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x200 ; GCN-NEXT: v_readlane_b32 s33, v1, 2 @@ -348,8 +348,8 @@ ; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN-NEXT: v_writelane_b32 v32, s33, 2 -; GCN-NEXT: v_writelane_b32 v32, s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: v_writelane_b32 v32, s30, 0 ; GCN-DAG: v_writelane_b32 v32, s31, 1 ; GCN-DAG: buffer_store_dword @@ -395,8 +395,8 @@ ; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN-NEXT: v_writelane_b32 v32, s33, 2 -; GCN-NEXT: v_writelane_b32 v32, s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: v_writelane_b32 v32, s30, 0 ; GCN-DAG: v_writelane_b32 v32, s31, 1 ; GCN-DAG: s_add_u32 s32, s32, 0x40300{{$}} ; GCN-DAG: buffer_store_dword diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -31,8 +31,8 @@ ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_writelane_b32 v32, s33, 2 -; GCN-NEXT: v_writelane_b32 v32, s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: v_writelane_b32 v32, s30, 0 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_v2f32@rel32@lo+4 
@@ -65,8 +65,8 @@ ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_writelane_b32 v32, s33, 2 -; GCN-NEXT: v_writelane_b32 v32, s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: v_writelane_b32 v32, s30, 0 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_v3f32@rel32@lo+4 @@ -99,8 +99,8 @@ ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_writelane_b32 v32, s33, 2 -; GCN-NEXT: v_writelane_b32 v32, s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: v_writelane_b32 v32, s30, 0 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_v4f16@rel32@lo+4 @@ -133,8 +133,8 @@ ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_writelane_b32 v32, s33, 2 -; GCN-NEXT: v_writelane_b32 v32, s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: v_writelane_b32 v32, s30, 0 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_struct@rel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/debug-frame.ll b/llvm/test/CodeGen/AMDGPU/debug-frame.ll --- a/llvm/test/CodeGen/AMDGPU/debug-frame.ll +++ b/llvm/test/CodeGen/AMDGPU/debug-frame.ll @@ -1,4 +1,5 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=asm -o - %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=asm -o - %s | FileCheck --check-prefixes=CHECK,WAVE64 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -filetype=asm -o - %s | FileCheck --check-prefixes=CHECK,WAVE32 %s ; CHECK-LABEL: kern1: ; CHECK: .cfi_startproc @@ -23,6 +24,200 @@ ret void } +; CHECK-LABEL: func1: +; CHECK: .cfi_startproc + +; CHECK-NOT: .cfi_{{.*}} + +; CHECK: %bb.0: +; SGPR32 = 64 +; CHECK-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; DW_CFA_expression [0x10] +; PC_64 ULEB128(16)=[0x10] +; BLOCK_LENGTH ULEB128(8)=[0x08] +; DW_OP_regx [0x90] +; SGPR30 ULEB128(62)=[0x3e] +; DW_OP_piece [0x93] +; PIECE_SIZE [0x04] +; DW_OP_regx [0x90] +; SGPR31 ULEB128(63)=[0x3f] +; DW_OP_piece [0x93] +; PIECE_SIZE [0x04] +; CHECK-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04 + +; CHECK-NOT: .cfi_{{.*}} + +; CHECK: .cfi_endproc +define hidden void @func1() #0 { +entry: + ret void +} + +declare hidden void @ex() #0 + +; CHECK-LABEL: func2: +; CHECK: .cfi_startproc + +; CHECK-NOT: .cfi_{{.*}} + +; CHECK: %bb.0: +; CHECK-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; CHECK-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04 +; VGPR0_wave64 = 2560 +; WAVE64-NEXT: .cfi_undefined 2560 +; WAVE64-NEXT: .cfi_undefined 2561 +; WAVE64-NEXT: .cfi_undefined 2562 +; WAVE64-NEXT: .cfi_undefined 2563 +; WAVE64-NEXT: .cfi_undefined 2564 +; WAVE64-NEXT: .cfi_undefined 2565 +; WAVE64-NEXT: .cfi_undefined 2566 +; WAVE64-NEXT: .cfi_undefined 2567 +; WAVE64-NEXT: .cfi_undefined 2568 +; WAVE64-NEXT: .cfi_undefined 2569 +; WAVE64-NEXT: .cfi_undefined 2570 +; WAVE64-NEXT: .cfi_undefined 2571 +; WAVE64-NEXT: .cfi_undefined 2572 +; WAVE64-NEXT: .cfi_undefined 2573 +; WAVE64-NEXT: .cfi_undefined 2574 +; WAVE64-NEXT: .cfi_undefined 2575 +; WAVE64-NEXT: .cfi_undefined 2576 +; WAVE64-NEXT: .cfi_undefined 2577 +; WAVE64-NEXT: .cfi_undefined 2578 +; WAVE64-NEXT: .cfi_undefined 2579 +; WAVE64-NEXT: .cfi_undefined 2580 +;
WAVE64-NEXT: .cfi_undefined 2581 +; WAVE64-NEXT: .cfi_undefined 2582 +; WAVE64-NEXT: .cfi_undefined 2583 +; WAVE64-NEXT: .cfi_undefined 2584 +; WAVE64-NEXT: .cfi_undefined 2585 +; WAVE64-NEXT: .cfi_undefined 2586 +; WAVE64-NEXT: .cfi_undefined 2587 +; WAVE64-NEXT: .cfi_undefined 2588 +; WAVE64-NEXT: .cfi_undefined 2589 +; WAVE64-NEXT: .cfi_undefined 2590 +; WAVE64-NEXT: .cfi_undefined 2591 +; VGPR0_wave32 = 1536 +; WAVE32-NEXT: .cfi_undefined 1536 +; WAVE32-NEXT: .cfi_undefined 1537 +; WAVE32-NEXT: .cfi_undefined 1538 +; WAVE32-NEXT: .cfi_undefined 1539 +; WAVE32-NEXT: .cfi_undefined 1540 +; WAVE32-NEXT: .cfi_undefined 1541 +; WAVE32-NEXT: .cfi_undefined 1542 +; WAVE32-NEXT: .cfi_undefined 1543 +; WAVE32-NEXT: .cfi_undefined 1544 +; WAVE32-NEXT: .cfi_undefined 1545 +; WAVE32-NEXT: .cfi_undefined 1546 +; WAVE32-NEXT: .cfi_undefined 1547 +; WAVE32-NEXT: .cfi_undefined 1548 +; WAVE32-NEXT: .cfi_undefined 1549 +; WAVE32-NEXT: .cfi_undefined 1550 +; WAVE32-NEXT: .cfi_undefined 1551 +; WAVE32-NEXT: .cfi_undefined 1552 +; WAVE32-NEXT: .cfi_undefined 1553 +; WAVE32-NEXT: .cfi_undefined 1554 +; WAVE32-NEXT: .cfi_undefined 1555 +; WAVE32-NEXT: .cfi_undefined 1556 +; WAVE32-NEXT: .cfi_undefined 1557 +; WAVE32-NEXT: .cfi_undefined 1558 +; WAVE32-NEXT: .cfi_undefined 1559 +; WAVE32-NEXT: .cfi_undefined 1560 +; WAVE32-NEXT: .cfi_undefined 1561 +; WAVE32-NEXT: .cfi_undefined 1562 +; WAVE32-NEXT: .cfi_undefined 1563 +; WAVE32-NEXT: .cfi_undefined 1564 +; WAVE32-NEXT: .cfi_undefined 1565 +; WAVE32-NEXT: .cfi_undefined 1566 +; WAVE32-NEXT: .cfi_undefined 1567 +; SGPR0 = 32 +; CHECK-NEXT: .cfi_undefined 32 +; CHECK-NEXT: .cfi_undefined 33 +; CHECK-NEXT: .cfi_undefined 34 +; CHECK-NEXT: .cfi_undefined 35 +; CHECK-NEXT: .cfi_undefined 36 +; CHECK-NEXT: .cfi_undefined 37 +; CHECK-NEXT: .cfi_undefined 38 +; CHECK-NEXT: .cfi_undefined 39 +; CHECK-NEXT: .cfi_undefined 40 +; CHECK-NEXT: .cfi_undefined 41 +; CHECK-NEXT: .cfi_undefined 42 +; CHECK-NEXT: .cfi_undefined 43 +; CHECK-NEXT: .cfi_undefined 44 +; CHECK-NEXT: .cfi_undefined 45 +; CHECK-NEXT: .cfi_undefined 46 +; CHECK-NEXT: .cfi_undefined 47 +; CHECK-NEXT: .cfi_undefined 48 +; CHECK-NEXT: .cfi_undefined 49 +; CHECK-NEXT: .cfi_undefined 50 +; CHECK-NEXT: .cfi_undefined 51 +; CHECK-NEXT: .cfi_undefined 52 +; CHECK-NEXT: .cfi_undefined 53 +; CHECK-NEXT: .cfi_undefined 54 +; CHECK-NEXT: .cfi_undefined 55 +; CHECK-NEXT: .cfi_undefined 56 +; CHECK-NEXT: .cfi_undefined 57 +; CHECK-NEXT: .cfi_undefined 58 +; CHECK-NEXT: .cfi_undefined 59 +; CHECK-NEXT: .cfi_undefined 60 +; CHECK-NEXT: .cfi_undefined 61 +; CHECK-NEXT: .cfi_undefined 62 +; CHECK-NEXT: .cfi_undefined 63 + +; CHECK-NOT: .cfi_{{.*}} + +; WAVE64: s_or_saveexec_b64 s[4:5], -1 +; WAVE32: s_or_saveexec_b32 s4, -1 +; CHECK-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; VGPR32_wave64 = 2592 +; WAVE64-NEXT: .cfi_offset 2592, 0 +; VGPR32_wave32 = 1568 +; WAVE32-NEXT: .cfi_offset 1568, 0 +; CHECK-NOT: .cfi_{{.*}} +; WAVE64: s_mov_b64 exec, s[4:5] +; WAVE32: s_mov_b32 exec_lo, s4 + +; CHECK-NOT: .cfi_{{.*}} + +; CHECK: v_writelane_b32 v32, s33, 2 + +; DW_CFA_expression [0x10] SGPR33 ULEB128(65)=[0x41] +; BLOCK_LENGTH ULEB128(5)=[0x05] +; DW_OP_regx [0x90] +; VGPR32_wave64 ULEB128(2592)=[0xa0, 0x14] +; DW_OP_LLVM_offset_uconst [0xe4] +; OFFSET ULEB128(0x08) [0x08] +; WAVE64-NEXT: .cfi_escape 0x10, 0x41, 0x05, 0x90, 0xa0, 0x14, 0xe4, 0x08 + +; DW_CFA_expression [0x10] SGPR33 ULEB128(65)=[0x41] +; BLOCK_LENGTH ULEB128(5)=[0x05] +; DW_OP_regx [0x90] +; VGPR32_wave32 ULEB128(1568)=[0xa0, 0x0c] 
+; DW_OP_LLVM_offset_uconst [0xe4] +; OFFSET ULEB128(0x08) [0x08] +; WAVE32-NEXT: .cfi_escape 0x10, 0x41, 0x05, 0x90, 0xa0, 0x0c, 0xe4, 0x08 + +; CHECK-NOT: .cfi_{{.*}} + +; CHECK: s_mov_b32 s33, s32 +; SGPR33 = 65 +; CHECK-NEXT: .cfi_def_cfa_register 65 + +; CHECK-NOT: .cfi_{{.*}} + +; CHECK: s_sub_u32 s32, s32, +; CHECK-NEXT: v_readlane_b32 s33, v32, 2 +; SGPR32 = 64 +; CHECK-NEXT: .cfi_def_cfa_register 64 + +; CHECK-NOT: .cfi_{{.*}} + +; CHECK: .cfi_endproc +define hidden void @func2() #0 { +entry: + call void @ex() #0 + ret void +} + attributes #0 = { nounwind } !llvm.dbg.cu = !{!0} diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -192,6 +192,7 @@ ; GFX9-NEXT: v_writelane_b32 v35, s33, 4 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x800 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s33 ; 4-byte Folded Spill diff --git a/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll b/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll --- a/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll +++ b/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll @@ -13,6 +13,8 @@ ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 64 32] $vgpr2 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0 +; GCN-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; GCN-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: .Ltmp0: ; GCN-NEXT: .loc 0 4 5 prologue_end ; /tmp/dbg.cl:4:5 @@ -35,6 +37,12 @@ ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_multi_arg:arg0 <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 64 32] $vgpr2 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_multi_arg:arg0 <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_multi_arg:arg0 <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0 +; GCN-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; GCN-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04 +; GCN-NEXT: .cfi_undefined 2560 +; GCN-NEXT: .cfi_undefined 2561 +; GCN-NEXT: .cfi_undefined 2562 +; GCN-NEXT: .cfi_undefined 2563 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: .Ltmp2: ; GCN-NEXT: .loc 0 8 17 prologue_end ; /tmp/dbg.cl:8:17 @@ -65,6 +73,8 @@ ; GCN-NEXT: ; %bb.0: ; GCN-NEXT: ;DEBUG_VALUE: split_v4f16_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f16_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0 +; GCN-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; GCN-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: .Ltmp8: ; GCN-NEXT: .loc 0 12 5 prologue_end ; /tmp/dbg.cl:12:5 @@ -82,6 +92,8 @@ ; GCN-NEXT: ; %bb.0: ; GCN-NEXT: ;DEBUG_VALUE: split_f64_arg:arg <- 
[DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1 ; GCN-NEXT: ;DEBUG_VALUE: split_f64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0 +; GCN-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; GCN-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: .Ltmp10: ; GCN-NEXT: .loc 0 16 5 prologue_end ; /tmp/dbg.cl:16:5 @@ -102,6 +114,8 @@ ; GCN-NEXT: ;DEBUG_VALUE: split_v2f64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 64 32] $vgpr2 ; GCN-NEXT: ;DEBUG_VALUE: split_v2f64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1 ; GCN-NEXT: ;DEBUG_VALUE: split_v2f64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0 +; GCN-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; GCN-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: .Ltmp12: ; GCN-NEXT: .loc 0 20 5 prologue_end ; /tmp/dbg.cl:20:5 @@ -119,6 +133,8 @@ ; GCN-NEXT: ; %bb.0: ; GCN-NEXT: ;DEBUG_VALUE: split_i64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1 ; GCN-NEXT: ;DEBUG_VALUE: split_i64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0 +; GCN-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; GCN-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: .Ltmp14: ; GCN-NEXT: .loc 0 24 5 prologue_end ; /tmp/dbg.cl:24:5 @@ -137,6 +153,8 @@ ; GCN-NEXT: ; %bb.0: ; GCN-NEXT: ;DEBUG_VALUE: split_ptr_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1 ; GCN-NEXT: ;DEBUG_VALUE: split_ptr_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0 +; GCN-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; GCN-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: .Ltmp16: ; GCN-NEXT: .loc 0 28 5 prologue_end ; /tmp/dbg.cl:28:5 diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -124,8 +124,8 @@ } ; GCN-LABEL: {{^}}default_realign_align128: -; GCN: s_add_u32 [[TMP:s[0-9]+]], s32, 0x1fc0 -; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 +; GCN-DAG: s_add_u32 [[TMP:s[0-9]+]], s32, 0x1fc0 +; GCN-DAG: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 ; GCN-NEXT: s_and_b32 s33, [[TMP]], 0xffffe000 ; GCN-NEXT: s_add_u32 s32, s32, 0x4000 ; GCN-NOT: s33
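
A standalone sketch (not part of the patch itself): the escape blobs checked in debug-frame.ll follow mechanically from the encoding in buildCFIForSGPRToVGPRSpill. The C++ snippet below re-derives the expected wave64 blob for SGPR33 spilled to lane 2 of VGPR32, i.e. ".cfi_escape 0x10, 0x41, 0x05, 0x90, 0xa0, 0x14, 0xe4, 0x08", using only values quoted in this diff (DW_CFA_expression = 0x10, DW_OP_regx = 0x90, DW_OP_LLVM_offset_uconst = 0xe4, SGPR33 = DWARF register 65, VGPR32 = 2592 in wave64 numbering). It deliberately avoids the LLVM APIs, so it illustrates only the byte layout, not the actual implementation.

// Sketch: encode DW_CFA_expression <SGPR>, (DW_OP_regx <VGPR>)
// (DW_OP_LLVM_offset_uconst <Lane>*4), mirroring buildCFIForSGPRToVGPRSpill.
// Opcode values and DWARF register numbers are taken from the test comments
// above; DW_OP_LLVM_offset_uconst (0xe4) is an LLVM/AMDGPU vendor extension.
#include <cstdint>
#include <cstdio>
#include <vector>

static void encodeULEB128(uint64_t Value, std::vector<uint8_t> &Out) {
  do {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7;
    if (Value != 0)
      Byte |= 0x80; // continuation bit: more bytes follow
    Out.push_back(Byte);
  } while (Value != 0);
}

int main() {
  const uint8_t DW_CFA_expression = 0x10;
  const uint8_t DW_OP_regx = 0x90;
  const uint8_t DW_OP_LLVM_offset_uconst = 0xe4;
  const unsigned DwarfSGPR = 65;   // SGPR33
  const unsigned DwarfVGPR = 2592; // VGPR32, wave64 numbering (1568 for wave32)
  const unsigned Lane = 2;
  const unsigned SGPRByteSize = 4;

  // Expression block: the VGPR register location, offset by Lane * 4 bytes.
  // (The patch's encodeDwarfRegisterLocation would use DW_OP_reg0+N for DWARF
  // register numbers below 32; VGPR numbers are always >= 32, hence DW_OP_regx.)
  std::vector<uint8_t> Block;
  Block.push_back(DW_OP_regx);
  encodeULEB128(DwarfVGPR, Block);
  Block.push_back(DW_OP_LLVM_offset_uconst);
  encodeULEB128(Lane * SGPRByteSize, Block);

  // CFI instruction: DW_CFA_expression, target register, block length, block.
  std::vector<uint8_t> CFI;
  CFI.push_back(DW_CFA_expression);
  encodeULEB128(DwarfSGPR, CFI);
  encodeULEB128(Block.size(), CFI);
  CFI.insert(CFI.end(), Block.begin(), Block.end());

  // Prints: 0x10 0x41 0x05 0x90 0xa0 0x14 0xe4 0x08
  for (uint8_t B : CFI)
    std::printf("0x%02x ", B);
  std::printf("\n");
  return 0;
}

Running this and comparing against the WAVE64/WAVE32 .cfi_escape lines above is a quick way to sanity-check the expected bytes whenever the lane or VGPR assignment in the tests changes.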