diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -335,12 +335,23 @@ } // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not -// memory. They should have been removed by now. -static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { +// memory. They should have been removed by now, except CFI Saved Reg spills. +static bool allStackObjectsAreDead(const MachineFunction &MF) { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const GCNSubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const SIMachineFunctionInfo *FuncInfo = MF.getInfo(); for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E; ++I) { - if (!MFI.isDeadObjectIndex(I)) + if (!MFI.isDeadObjectIndex(I)) { + // determineCalleeSaves() might have added the SGPRSpill stack IDs for + // CFI saves into scratch VGPR, ignore them + if (MFI.getStackID(I) == TargetStackID::SGPRSpill && + TRI->isCFISavedRegsSpillEnabled() && I == FuncInfo->EXECSaveIndex) { + continue; + } return false; + } } return true; @@ -360,8 +371,8 @@ Register ScratchRsrcReg = MFI->getScratchRSrcReg(); - if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) && - allStackObjectsAreDead(MF.getFrameInfo()))) + if (!ScratchRsrcReg || + (!MRI.isPhysRegUsed(ScratchRsrcReg) && allStackObjectsAreDead(MF))) return Register(); if (ST.hasSGPRInitBug() || @@ -536,7 +547,7 @@ bool NeedsFlatScratchInit = MFI->hasFlatScratchInit() && (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() || - (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch())); + (!allStackObjectsAreDead(MF) && ST.enableFlatScratch())); if ((NeedsFlatScratchInit || ScratchRsrcReg) && PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) { @@ -964,6 +975,7 @@ Optional FPSaveIndex = FuncInfo->FramePointerSaveIndex; Optional BPSaveIndex = FuncInfo->BasePointerSaveIndex; + Optional EXECSaveIndex = FuncInfo->EXECSaveIndex; // VGPRs used for SGPR->VGPR spills for (const SIMachineFunctionInfo::SGPRSpillVGPR &Reg : @@ -1005,6 +1017,15 @@ LiveRegs.addReg(ScratchExecCopy); } + if (TRI.isCFISavedRegsSpillEnabled() && NeedsFrameMoves && EXECSaveIndex) { + Register Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + unsigned NumSubRegs = ST.isWave32() ? 1 : 2; + if (spilledToMemory(MF, *EXECSaveIndex)) + saveSGPRToMemory(MBB, MBBI, Exec, *EXECSaveIndex, LiveRegs, NumSubRegs); + else + saveSGPRToVGPRLane(MBB, MBBI, Exec, *EXECSaveIndex, NumSubRegs); + } + if (FPSaveIndex) { if (spilledToMemory(MF, *FPSaveIndex)) saveSGPRToMemory(MBB, MBBI, FramePtrReg, *FPSaveIndex, LiveRegs); @@ -1266,13 +1287,17 @@ #ifndef NDEBUG static bool allSGPRSpillsAreDead(const MachineFunction &MF) { const MachineFrameInfo &MFI = MF.getFrameInfo(); + const GCNSubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); const SIMachineFunctionInfo *FuncInfo = MF.getInfo(); for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E; ++I) { if (!MFI.isDeadObjectIndex(I) && MFI.getStackID(I) == TargetStackID::SGPRSpill && (I != FuncInfo->FramePointerSaveIndex && - I != FuncInfo->BasePointerSaveIndex)) { + I != FuncInfo->BasePointerSaveIndex && + (!TRI->isCFISavedRegsSpillEnabled() || + I != FuncInfo->EXECSaveIndex))) { return false; } } @@ -1378,14 +1403,14 @@ // can. Any remaining SGPR spills will go to memory, so move them back to the // default stack. bool HaveSGPRToVMemSpill = - FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true); + FuncInfo->removeDeadFrameIndices(MF, /*ResetSGPRSpillStackIDs*/ true); assert(allSGPRSpillsAreDead(MF) && "SGPR spill should have been removed in SILowerSGPRSpills"); // FIXME: The other checks should be redundant with allStackObjectsAreDead, // but currently hasNonSpillStackObjects is set only from source // allocas. Stack temps produced from legalization are not counted currently. - if (!allStackObjectsAreDead(MFI)) { + if (!allStackObjectsAreDead(MF)) { assert(RS && "RegScavenger required if spilling"); // Add an emergency spill slot @@ -1400,6 +1425,35 @@ } } +static void allocateCFISave(MachineFunction &MF, Optional &FI, + Register Reg) { + SIMachineFunctionInfo *MFI = MF.getInfo(); + const GCNSubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + if (MFI->haveFreeLanesForSGPRSpill(MF, TRI->getSpillSize(*RC) / 4)) { + int NewFI = MF.getFrameInfo().CreateStackObject( + TRI->getSpillSize(*RC), TRI->getSpillAlign(*RC), true, nullptr, + TargetStackID::SGPRSpill); + if (MFI->allocateSGPRSpillToVGPR(MF, NewFI)) { + FI = NewFI; + } + } else { + int NewFI = MF.getFrameInfo().CreateStackObject( + TRI->getSpillSize(*RC), TRI->getSpillAlign(*RC), true, nullptr, + TargetStackID::SGPRSpill); + if (TRI->spillSGPRToVGPR() && MFI->allocateSGPRSpillToVGPR(MF, NewFI)) { + FI = NewFI; + } else { + // Remove dead index + MF.getFrameInfo().RemoveStackObject(NewFI); + FI = MF.getFrameInfo().CreateSpillStackObject( + TRI->getSpillSize(*RC), Align(TRI->getSpillAlign(*RC))); + } + } + return; +} + void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced( MachineFunction &MF, RegScavenger *RS) const { const GCNSubtarget &ST = MF.getSubtarget(); @@ -1448,6 +1502,13 @@ if (!ST.hasGFX90AInsts()) SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask()); + // FIXME: We would need CFI save for EXEC only when we need frame moves. + const bool NeedsFrameMoves = true; + if (TRI->isCFISavedRegsSpillEnabled() && NeedsFrameMoves) { + allocateCFISave(MF, MFI->EXECSaveIndex, + ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC); + } + // hasFP only knows about stack objects that already exist. We're now // determining the stack slots that will be created, so we have to predict // them. Stack objects force FP usage with calls. @@ -1457,8 +1518,7 @@ // // FIXME: Is this really hasReservedCallFrame? const bool WillHaveFP = - FrameInfo.hasCalls() && - (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo)); + FrameInfo.hasCalls() && (SavedVGPRs.any() || !allStackObjectsAreDead(MF)); // VGPRs used for SGPR spilling need to be specially inserted in the prolog, // so don't allow the default insertion to handle them. diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -319,7 +319,7 @@ // free frame index ids by the later pass(es) like "stack slot coloring" // which in turn could mess-up with the book keeping of "frame index to VGPR // lane". - FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ false); + FuncInfo->removeDeadFrameIndices(MF, /*ResetSGPRSpillStackIDs*/ false); MadeChange = true; } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -532,6 +532,8 @@ Register SGPRForBPSaveRestoreCopy; Optional BasePointerSaveIndex; + Optional EXECSaveIndex; + bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg); public: @@ -589,7 +591,7 @@ /// If \p ResetSGPRSpillStackIDs is true, reset the stack ID from sgpr-spill /// to the default stack. - bool removeDeadFrameIndices(MachineFrameInfo &MFI, + bool removeDeadFrameIndices(MachineFunction &MF, bool ResetSGPRSpillStackIDs); int getScavengeFI(MachineFrameInfo &MFI, const SIRegisterInfo &TRI); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -433,14 +433,18 @@ } bool SIMachineFunctionInfo::removeDeadFrameIndices( - MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) { + MachineFunction &MF, bool ResetSGPRSpillStackIDs) { + MachineFrameInfo &MFI = MF.getFrameInfo(); + const GCNSubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); // Remove dead frame indices from function frame, however keep FP & BP since // spills for them haven't been inserted yet. And also make sure to remove the // frame indices from `SGPRToVGPRSpills` data structure, otherwise, it could // result in an unexpected side effect and bug, in case of any re-mapping of // freed frame indices by later pass(es) like "stack slot coloring". for (auto &R : make_early_inc_range(SGPRToVGPRSpills)) { - if (R.first != FramePointerSaveIndex && R.first != BasePointerSaveIndex) { + if (R.first != FramePointerSaveIndex && R.first != BasePointerSaveIndex && + (!TRI->isCFISavedRegsSpillEnabled() || R.first != EXECSaveIndex)) { MFI.RemoveStackObject(R.first); SGPRToVGPRSpills.erase(R.first); } @@ -453,12 +457,12 @@ // stack ID. for (int i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); i != e; ++i) { - if (i != FramePointerSaveIndex && i != BasePointerSaveIndex) { + if (i != FramePointerSaveIndex && i != BasePointerSaveIndex && + (!TRI->isCFISavedRegsSpillEnabled() || i != EXECSaveIndex)) if (MFI.getStackID(i) == TargetStackID::SGPRSpill) { MFI.setStackID(i, TargetStackID::Default); HaveSGPRToMemory = true; } - } } } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -70,6 +70,8 @@ return SpillSGPRToVGPR; } + bool isCFISavedRegsSpillEnabled() const; + /// Return the end register initially reserved for the scratch buffer in case /// spilling is needed. MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -35,6 +35,11 @@ cl::ReallyHidden, cl::init(true)); +static cl::opt EnableSpillCFISavedRegs( + "amdgpu-spill-cfi-saved-regs", + cl::desc("Enable spilling the registers required for CFI emission"), + cl::ReallyHidden, cl::init(false), cl::ZeroOrMore); + std::array, 16> SIRegisterInfo::RegSplitParts; std::array, 9> SIRegisterInfo::SubRegFromChannelTable; @@ -535,6 +540,10 @@ return SubRegFromChannelTable[NumRegIndex - 1][Channel]; } +bool SIRegisterInfo::isCFISavedRegsSpillEnabled() const { + return EnableSpillCFISavedRegs; +} + MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg( const MachineFunction &MF) const { unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4; diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll @@ -0,0 +1,175 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-spill-cfi-saved-regs -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,WAVE64 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-spill-cfi-saved-regs -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,WAVE32 %s + +; CHECK-LABEL: kern: +; CHECK: .cfi_startproc +; CHECK-NOT: .cfi_{{.*}} +; CHECK: %bb.0: +; CHECK-NEXT: .cfi_escape 0x0f, 0x03, 0x30, 0x36, 0xe9 +; CHECK-NEXT: .cfi_undefined 16 +; CHECK-NOT: .cfi_{{.*}} +; CHECK: .cfi_endproc +define protected amdgpu_kernel void @kern() #0 { +entry: + ret void +} + +; CHECK-LABEL: func_saved_in_clobbered_vgpr: +; CHECK: .cfi_startproc +; CHECK-NOT: .cfi_{{.*}} +; CHECK: %bb.0: +; SGPR32 = 64 +; CHECK-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; CHECK-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04 + + +; FIXME: ideally this would not care what VGPR we spill to, but since we are +; using .cfi_escape it isn't trivial/possible to make this general yet + +; WAVE64: v_writelane_b32 v0, exec_lo, 0 +; WAVE64-NEXT: v_writelane_b32 v0, exec_hi, 1 +; DW_CFA_expression [0x10] +; EXEC_MASK_wave64 ULEB128(17)=[0x11] +; BLOCK_LENGTH ULEB128(12)=[0x0c] +; DW_OP_regx [0x90] +; VGPR0_wave64 ULEB128(2560)=[0x80, 0x14] +; DW_OP_bit_piece [0x9d] +; PIECE_SIZE [0x20] +; PIECE_OFFSET [0x00] +; DW_OP_regx [0x90] +; VGPR0_wave64 ULEB128(2560)=[0x80, 0x14] +; DW_OP_bit_piece [0x9d] +; PIECE_SIZE [0x20] +; PIECE_OFFSET [0x20] +; WAVE64-NEXT: .cfi_escape 0x10, 0x11, 0x0c, 0x90, 0x80, 0x14, 0x9d, 0x20, 0x00, 0x90, 0x80, 0x14, 0x9d, 0x20, 0x20 + +; WAVE32: v_writelane_b32 v0, exec_lo, 0 +; DW_CFA_expression [0x10] +; EXEC_MASK_wave32 ULEB128(1)=[0x01] +; BLOCK_LENGTH ULEB128(6)=[0x06] +; DW_OP_regx [0x90] +; VGPR0_wave32 ULEB128(1536)=[0x80, 0x0c] +; DW_OP_bit_piece [0x9d] +; PIECE_SIZE [0x20] +; PIECE_OFFSET [0x00] +; WAVE32-NEXT: .cfi_escape 0x10, 0x01, 0x06, 0x90, 0x80, 0x0c, 0x9d, 0x20, 0x00 + +; CHECK-NOT: .cfi_{{.*}} +; CHECK: .cfi_endproc +define hidden void @func_saved_in_clobbered_vgpr() #0 { +entry: + ret void +} + +; Check that the option causes a CSR VGPR to spill when needed. + +; CHECK-LABEL: func_saved_in_preserved_vgpr: +; CHECK: %bb.0: + +; CHECK: s_or_saveexec_b{{(32|64)}} +; CHECK: buffer_store_dword [[CSR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK: s_mov_b{{(32|64)}} {{(exec|exec_lo)}}, + +; WAVE64: v_writelane_b32 [[CSR]], exec_lo, {{[0-9]+}} +; WAVE64-NEXT: v_writelane_b32 [[CSR]], exec_hi, {{[0-9]+}} + +; WAVE32: v_writelane_b32 [[CSR]], exec_lo, {{[0-9]+}} + +define hidden void @func_saved_in_preserved_vgpr() #0 { +entry: + call void asm sideeffect "; clobber nonpreserved VGPRs", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} + ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} + ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29} + ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}"() + ret void +} + +; There's no return here, so the return address live in was deleted. +; CHECK-LABEL: {{^}}empty_func: +; CHECK-NOT: v_writelane_b32 v0, s30, 0 +; CHECK-NOT: v_writelane_b32 v0, s31, 1 +define void @empty_func() { + unreachable +} + +; Check that the option causes EXEC to be spilled to memory. + +; CHECK-LABEL: no_vgprs_to_spill_into: +; CHECK: %bb.0: + +; WAVE64: v_mov_b32_e32 v0, exec_lo +; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; WAVE64-NEXT: v_mov_b32_e32 v0, exec_hi +; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_offset 17, 0 + +define void @no_vgprs_to_spill_into() #1 { + call void asm sideeffect "", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} + ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} + ,~{v20},~{v21},~{v22},~{v23},~{v24}"() + + ret void +} + +; Check that the FP and EXEC needs to be spilled to memory, even though +; we have reserved VGPR but there are no available free lanes. + +; CHECK-LABEL: callee_need_to_spill_fp_exec_to_memory: +; CHECK: %bb.0: + +; WAVE32: s_or_saveexec_b32 [[EXEC_COPY:s[0-9]+]], -1 +; WAVE32-NEXT: buffer_store_dword [[RES_VGPR:v[0-9]+]], off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; WAVE32: s_mov_b32 exec_lo, [[EXEC_COPY]] +; WAVE32-NEXT: v_mov_b32_e32 [[TEMP_VGPR:v[0-9]+]], exec_lo +; WAVE32-NEXT: buffer_store_dword [[TEMP_VGPR]], off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_offset 1, 6272 +; WAVE32-NEXT: v_mov_b32_e32 [[TEMP_VGPR]], s33 +; WAVE32-NEXT: buffer_store_dword [[TEMP_VGPR]], off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; WAVE32: buffer_store_dword v40, off, s[0:3], s33 offset +; WAVE32-COUNT-47: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 +; WAVE32: v_writelane_b32 [[RES_VGPR]], s34, 0 +; WAVE32-COUNT-31: v_writelane_b32 [[RES_VGPR]], s{{[0-9]+}}, {{[0-9]+}} + + +define void @callee_need_to_spill_fp_exec_to_memory() #2 { + call void asm sideeffect "; clobber nonpreserved and 32 CSR SGPRs", + "~{s4},~{s5},~{s6},~{s7},~{s8},~{s9} + ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19} + ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29} + ,~{s34},~{s35},~{s36},~{s37},~{s38},~{s39} + ,~{s40},~{s41},~{s42},~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49} + ,~{s50},~{s51},~{s52},~{s53},~{s54},~{s55},~{s56},~{s57},~{s58},~{s59} + ,~{s60},~{s61},~{s62},~{s63},~{s64},~{s65} + ,~{vcc}"() + + call void asm sideeffect "; clobber all VGPRs except v39", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} + ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} + ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29} + ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38} + ,~{v40},~{v41},~{v42},~{v43},~{v44},~{v45},~{v46},~{v47},~{v48},~{v49} + ,~{v50},~{v51},~{v52},~{v53},~{v54},~{v55},~{v56},~{v57},~{v58},~{v59} + ,~{v60},~{v61},~{v62},~{v63},~{v64},~{v65},~{v66},~{v67},~{v68},~{v69} + ,~{v70},~{v71},~{v72},~{v73},~{v74},~{v75},~{v76},~{v77},~{v78},~{v79} + ,~{v80},~{v81},~{v82},~{v83},~{v84},~{v85},~{v86},~{v87},~{v88},~{v89} + ,~{v90},~{v91},~{v92},~{v93},~{v94},~{v95},~{v96},~{v97},~{v98},~{v99} + ,~{v100},~{v101},~{v102},~{v103},~{v104},~{v105},~{v106},~{v107},~{v108},~{v109} + ,~{v110},~{v111},~{v112},~{v113},~{v114},~{v115},~{v116},~{v117},~{v118},~{v119} + ,~{v120},~{v121},~{v122},~{v123},~{v124},~{v125},~{v126},~{v127},~{v128},~{v129}"() + ret void +} + + +attributes #0 = { nounwind } +attributes #1 = { nounwind "amdgpu-waves-per-eu"="10,10" } +attributes #2 = { nounwind "frame-pointer"="all" "amdgpu-waves-per-eu"="12,12" } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, emissionKind: FullDebug) +!1 = !DIFile(filename: "filename", directory: "directory") +!2 = !{i32 7, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll @@ -1,78 +1,153 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-spill-sgpr-to-vgpr=true < %s | FileCheck -check-prefix=SPILL-TO-VGPR %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-spill-sgpr-to-vgpr=false < %s | FileCheck -check-prefix=NO-SPILL-TO-VGPR %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-spill-sgpr-to-vgpr=true -amdgpu-spill-cfi-saved-regs=false < %s | FileCheck -check-prefixes=NO-CFI-SAVES-SPILL-TO-VGPR %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-spill-sgpr-to-vgpr=true -amdgpu-spill-cfi-saved-regs=true < %s | FileCheck -check-prefixes=CFI-SAVES-SPILL-TO-VGPR %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-spill-sgpr-to-vgpr=false -amdgpu-spill-cfi-saved-regs=false < %s | FileCheck -check-prefixes=NO-CFI-SAVES-NO-SPILL-TO-VGPR %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-spill-sgpr-to-vgpr=false -amdgpu-spill-cfi-saved-regs=true < %s | FileCheck -check-prefixes=CFI-SAVES-NO-SPILL-TO-VGPR %s ; Check frame setup where SGPR spills to VGPRs are disabled or enabled. declare hidden void @external_void_func_void() #0 define void @callee_with_stack_and_call() #0 { -; SPILL-TO-VGPR-LABEL: callee_with_stack_and_call: -; SPILL-TO-VGPR: ; %bb.0: -; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SPILL-TO-VGPR-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] -; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s33, 2 -; SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s32 -; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s30, 0 -; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x400 -; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s31, 1 -; SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0 -; SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 -; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) -; SPILL-TO-VGPR-NEXT: s_getpc_b64 s[4:5] -; SPILL-TO-VGPR-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; SPILL-TO-VGPR-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5] -; SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v40, 0 -; SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v40, 1 -; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0xfc00 -; SPILL-TO-VGPR-NEXT: v_readlane_b32 s33, v40, 2 -; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SPILL-TO-VGPR-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] -; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) -; SPILL-TO-VGPR-NEXT: s_setpc_b64 s[30:31] +; NO-CFI-SAVES-SPILL-TO-VGPR-LABEL: callee_with_stack_and_call: +; NO-CFI-SAVES-SPILL-TO-VGPR: ; %bb.0: +; NO-CFI-SAVES-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; NO-CFI-SAVES-SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[4:5], -1 +; NO-CFI-SAVES-SPILL-TO-VGPR-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; NO-CFI-SAVES-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] +; NO-CFI-SAVES-SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s33, 2 +; NO-CFI-SAVES-SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s32 +; NO-CFI-SAVES-SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s30, 0 +; NO-CFI-SAVES-SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x400 +; NO-CFI-SAVES-SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s31, 1 +; NO-CFI-SAVES-SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0 +; NO-CFI-SAVES-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; NO-CFI-SAVES-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) +; NO-CFI-SAVES-SPILL-TO-VGPR-NEXT: s_getpc_b64 s[4:5] +; NO-CFI-SAVES-SPILL-TO-VGPR-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; NO-CFI-SAVES-SPILL-TO-VGPR-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; NO-CFI-SAVES-SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5] +; NO-CFI-SAVES-SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v40, 0 +; NO-CFI-SAVES-SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v40, 1 +; NO-CFI-SAVES-SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0xfc00 +; NO-CFI-SAVES-SPILL-TO-VGPR-NEXT: v_readlane_b32 s33, v40, 2 +; NO-CFI-SAVES-SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[4:5], -1 +; NO-CFI-SAVES-SPILL-TO-VGPR-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; NO-CFI-SAVES-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] +; NO-CFI-SAVES-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) +; NO-CFI-SAVES-SPILL-TO-VGPR-NEXT: s_setpc_b64 s[30:31] ; -; NO-SPILL-TO-VGPR-LABEL: callee_with_stack_and_call: -; NO-SPILL-TO-VGPR: ; %bb.0: -; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; NO-SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, s33 -; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; NO-SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s32 -; NO-SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x800 -; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[8:9], exec -; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 3 -; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:16 -; NO-SPILL-TO-VGPR-NEXT: v_writelane_b32 v1, s30, 0 -; NO-SPILL-TO-VGPR-NEXT: v_writelane_b32 v1, s31, 1 -; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:16 -; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) -; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[8:9] -; NO-SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0 -; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 -; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) -; NO-SPILL-TO-VGPR-NEXT: s_getpc_b64 s[4:5] -; NO-SPILL-TO-VGPR-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; NO-SPILL-TO-VGPR-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; NO-SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5] -; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[4:5], exec -; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 3 -; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:16 -; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) -; NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v1, 0 -; NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v1, 1 -; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:16 -; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) -; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] -; NO-SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0xf800 -; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) -; NO-SPILL-TO-VGPR-NEXT: v_readfirstlane_b32 s33, v0 -; NO-SPILL-TO-VGPR-NEXT: s_setpc_b64 s[30:31] +; CFI-SAVES-SPILL-TO-VGPR-LABEL: callee_with_stack_and_call: +; CFI-SAVES-SPILL-TO-VGPR: ; %bb.0: +; CFI-SAVES-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CFI-SAVES-SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[4:5], -1 +; CFI-SAVES-SPILL-TO-VGPR-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CFI-SAVES-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] +; CFI-SAVES-SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, exec_lo, 2 +; CFI-SAVES-SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, exec_hi, 3 +; CFI-SAVES-SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s33, 4 +; CFI-SAVES-SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s32 +; CFI-SAVES-SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s30, 0 +; CFI-SAVES-SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x400 +; CFI-SAVES-SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s31, 1 +; CFI-SAVES-SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0 +; CFI-SAVES-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; CFI-SAVES-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) +; CFI-SAVES-SPILL-TO-VGPR-NEXT: s_getpc_b64 s[4:5] +; CFI-SAVES-SPILL-TO-VGPR-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; CFI-SAVES-SPILL-TO-VGPR-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; CFI-SAVES-SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CFI-SAVES-SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v40, 0 +; CFI-SAVES-SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v40, 1 +; CFI-SAVES-SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0xfc00 +; CFI-SAVES-SPILL-TO-VGPR-NEXT: v_readlane_b32 s33, v40, 4 +; CFI-SAVES-SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[4:5], -1 +; CFI-SAVES-SPILL-TO-VGPR-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CFI-SAVES-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] +; CFI-SAVES-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) +; CFI-SAVES-SPILL-TO-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-LABEL: callee_with_stack_and_call: +; NO-CFI-SAVES-NO-SPILL-TO-VGPR: ; %bb.0: +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, s33 +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s32 +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x800 +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[8:9], exec +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 3 +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:16 +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: v_writelane_b32 v1, s30, 0 +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: v_writelane_b32 v1, s31, 1 +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:16 +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[8:9] +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0 +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_getpc_b64 s[4:5] +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5] +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[4:5], exec +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 3 +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:16 +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v1, 0 +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v1, 1 +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:16 +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0xf800 +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: v_readfirstlane_b32 s33, v0 +; NO-CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; CFI-SAVES-NO-SPILL-TO-VGPR-LABEL: callee_with_stack_and_call: +; CFI-SAVES-NO-SPILL-TO-VGPR: ; %bb.0: +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, exec_lo +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, exec_hi +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, s33 +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s32 +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x800 +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[8:9], exec +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 3 +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:24 +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: v_writelane_b32 v1, s30, 0 +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: v_writelane_b32 v1, s31, 1 +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:24 +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[8:9] +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0 +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_getpc_b64 s[4:5] +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[4:5], exec +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 3 +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:24 +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v1, 0 +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v1, 1 +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:24 +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0xf800 +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: v_readfirstlane_b32 s33, v0 +; CFI-SAVES-NO-SPILL-TO-VGPR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca call void @external_void_func_void() diff --git a/llvm/test/CodeGen/AMDGPU/pei-cfi-saves-bug.ll b/llvm/test/CodeGen/AMDGPU/pei-cfi-saves-bug.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/pei-cfi-saves-bug.ll @@ -0,0 +1,117 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-spill-cfi-saved-regs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-cfi-saved-regs < %s | FileCheck %s + +; Function Attrs: noinline optnone +define fastcc void @tail_callee() #2 { +; CHECK-LABEL: tail_callee: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: v_writelane_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_writelane_b32 v0, exec_hi, 1 +; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + ret void +} + +; Function Attrs: noinline +define fastcc void @callee_no_fp() #0 { +; CHECK-LABEL: callee_no_fp: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_or_saveexec_b64 s[16:17], -1 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[16:17] +; CHECK-NEXT: v_writelane_b32 v1, exec_lo, 2 +; CHECK-NEXT: v_writelane_b32 v1, exec_hi, 3 +; CHECK-NEXT: v_writelane_b32 v1, s33, 4 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: v_writelane_b32 v1, s30, 0 +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v1, s31, 1 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, tail_callee@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, tail_callee@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +entry: + tail call fastcc void @tail_callee() #3 + unreachable +} + +define protected amdgpu_kernel void @kernel() #1 { +; CHECK-LABEL: kernel: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s17 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB2_2 +; CHECK-NEXT: ; %bb.1: ; %end +; CHECK-NEXT: s_endpgm +; CHECK-NEXT: .LBB2_2: ; %body +; CHECK-NEXT: s_getpc_b64 s[12:13] +; CHECK-NEXT: s_add_u32 s12, s12, callee_no_fp@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s13, s13, callee_no_fp@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[12:13], 0x0 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_mov_b32 s12, s14 +; CHECK-NEXT: s_mov_b32 s13, s15 +; CHECK-NEXT: s_mov_b32 s14, s16 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] +entry: + br i1 undef, label %end, label %body + +body: ; preds = %entry + tail call fastcc void @callee_no_fp() #3 + unreachable + +end: ; preds = %entry + ret void +} + +; When we have calls, spilling a CSR VGPR for CFI saves should force FP usage +; Function Attrs: noinline +define dso_local fastcc void @func_needs_fp() unnamed_addr #0 { +; CHECK-LABEL: func_needs_fp: +; CHECK: func_needs_fp$local: +; CHECK-NEXT: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_or_saveexec_b64 s[16:17], -1 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[16:17] +; CHECK-NEXT: v_writelane_b32 v40, exec_lo, 2 +; CHECK-NEXT: v_writelane_b32 v40, exec_hi, 3 +; CHECK-NEXT: v_writelane_b32 v40, s33, 4 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, tail_callee_fp@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, tail_callee_fp@rel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +entry: + tail call fastcc void @tail_callee_fp() #3 + unreachable +} + +; Function Attrs: noinline optnone +declare dso_local fastcc void @tail_callee_fp() unnamed_addr #2 + +attributes #0 = { noinline } +attributes #1 = { "use-soft-float"="false" } +attributes #2 = { noinline optnone } +attributes #3 = { convergent nounwind } +