diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td --- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -176,6 +176,10 @@ (add CSR_AMDGPU_SI_Gfx, CSR_AMDGPU_AGPRs) >; +def CSR_AMDGPU_CS_ChainPreserve : CalleeSavedRegs< + (sequence "VGPR%u", 8, 255) +>; + def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>; // Calling convention for leaf functions diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1318,7 +1318,11 @@ SIMachineFunctionInfo *FuncInfo = MF.getInfo(); // Allocate spill slots for WWM reserved VGPRs. - if (!FuncInfo->isEntryFunction()) { + // For chain functions, we only need to do this if we have calls to + // llvm.amdgcn.cs.chain. + bool IsChainWithoutCalls = + FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall(); + if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) { for (Register Reg : FuncInfo->getWWMReservedRegs()) { const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg); FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC), @@ -1531,8 +1535,15 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedVGPRs, RegScavenger *RS) const { - TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS); SIMachineFunctionInfo *MFI = MF.getInfo(); + + // If this is a function with the amdgpu_cs_chain[_preserve] calling + // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then + // we don't need to save and restore anything. + if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall()) + return; + + TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS); if (MFI->isEntryFunction()) return; @@ -1561,7 +1572,9 @@ else if (TII->isWWMRegSpillOpcode(MI.getOpcode())) NeedExecCopyReservedReg = true; else if (MI.getOpcode() == AMDGPU::SI_RETURN || - MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) { + MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || + (MFI->isChainFunction() && + MI.getOpcode() == AMDGPU::SI_TCRETURN)) { // We expect all return to be the same size. assert(!ReturnMI || (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) == diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -302,6 +302,14 @@ if (isEntryFunction() || WWMSpills.count(VGPR)) return; + // Skip if this is a function with the amdgpu_cs_chain or + // amdgpu_cs_chain_preserve calling convention and the register is in the + // range v0-v7. We never need to allocate a spill for these because we don't + // even need to restore the inactive lanes for them (they're scratchier than + // the usual scratch registers). + if (isChainFunction() && VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8) + return; + WWMSpills.insert(std::make_pair( VGPR, MF.getFrameInfo().CreateSpillStackObject(Size, Alignment))); } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -397,6 +397,8 @@ case CallingConv::AMDGPU_Gfx: return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList : CSR_AMDGPU_SI_Gfx_SaveList; + case CallingConv::AMDGPU_CS_ChainPreserve: + return CSR_AMDGPU_CS_ChainPreserve_SaveList; default: { // Dummy to not crash RegisterClassInfo. static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister; diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll @@ -549,6 +549,104 @@ unreachable } +define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b) { +; GISEL-GFX11-LABEL: chain_to_chain_wwm: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3 +; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4 +; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GISEL-GFX11-NEXT: ;;#ASMSTART +; GISEL-GFX11-NEXT: s_nop +; GISEL-GFX11-NEXT: ;;#ASMEND +; GISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v2 +; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX11-NEXT: s_getpc_b64 s[4:5] +; GISEL-GFX11-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; GISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; GISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; GISEL-GFX10-LABEL: chain_to_chain_wwm: +; GISEL-GFX10: ; %bb.0: +; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3 +; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4 +; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL-GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GISEL-GFX10-NEXT: ;;#ASMSTART +; GISEL-GFX10-NEXT: s_nop +; GISEL-GFX10-NEXT: ;;#ASMEND +; GISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX10-NEXT: s_getpc_b64 s[4:5] +; GISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; GISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX11-LABEL: chain_to_chain_wwm: +; DAGISEL-GFX11: ; %bb.0: +; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_getpc_b64 s[4:5] +; DAGISEL-GFX11-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; DAGISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3 +; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4 +; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v2, v1 +; DAGISEL-GFX11-NEXT: ;;#ASMSTART +; DAGISEL-GFX11-NEXT: s_nop +; DAGISEL-GFX11-NEXT: ;;#ASMEND +; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v2 +; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX10-LABEL: chain_to_chain_wwm: +; DAGISEL-GFX10: ; %bb.0: +; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_getpc_b64 s[4:5] +; DAGISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; DAGISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3 +; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4 +; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, v1 +; DAGISEL-GFX10-NEXT: ;;#ASMSTART +; DAGISEL-GFX10-NEXT: s_nop +; DAGISEL-GFX10-NEXT: ;;#ASMEND +; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v2 +; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] + %i = call i32 @llvm.amdgcn.set.inactive(i32 3, i32 4) + call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"() + %w = call i32 @llvm.amdgcn.wwm(i32 %i) + %c = insertelement <3 x i32> %b, i32 %w, i32 0 + call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v3i32(ptr @chain_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %c, i32 0) + unreachable +} + define amdgpu_cs_chain void @chain_to_chain_use_all_v0_v7(<3 x i32> inreg %a, <3 x i32> %b) { ; GISEL-GFX11-LABEL: chain_to_chain_use_all_v0_v7: ; GISEL-GFX11: ; %bb.0: @@ -798,3 +896,5 @@ declare amdgpu_cs_chain void @chain_callee_2(<2 x i32> inreg, <2 x i32>) declare amdgpu_cs_chain void @chain_callee(<3 x i32> inreg, <3 x i32>) declare amdgpu_cs_chain void @chain_callee_4(<4 x i32> inreg, <4 x i32>) +declare i32 @llvm.amdgcn.set.inactive(i32, i32) +declare i32 @llvm.amdgcn.wwm(i32) diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll @@ -201,7 +201,6 @@ unreachable } -; FIXME: Preserve things (i.e. v16)! ; FIXME: Setup s32. define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) { @@ -210,13 +209,14 @@ ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 ; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill ; GISEL-GFX11-NEXT: ;;#ASMSTART ; GISEL-GFX11-NEXT: s_nop ; GISEL-GFX11-NEXT: ;;#ASMEND ; GISEL-GFX11-NEXT: s_mov_b32 s0, s3 -; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 ; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, s32 ; 4-byte Folded Reload ; GISEL-GFX11-NEXT: s_getpc_b64 s[4:5] ; GISEL-GFX11-NEXT: s_add_u32 s4, s4, chain_preserve_callee@gotpcrel32@lo+4 ; GISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_preserve_callee@gotpcrel32@hi+12 @@ -229,12 +229,14 @@ ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 ; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill ; GISEL-GFX10-NEXT: ;;#ASMSTART ; GISEL-GFX10-NEXT: s_nop ; GISEL-GFX10-NEXT: ;;#ASMEND ; GISEL-GFX10-NEXT: s_mov_b32 s0, s3 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 ; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s32 ; 4-byte Folded Reload ; GISEL-GFX10-NEXT: s_getpc_b64 s[4:5] ; GISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_preserve_callee@gotpcrel32@lo+4 ; GISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_preserve_callee@gotpcrel32@hi+12 @@ -251,12 +253,14 @@ ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 ; DAGISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill ; DAGISEL-GFX11-NEXT: ;;#ASMSTART ; DAGISEL-GFX11-NEXT: s_nop ; DAGISEL-GFX11-NEXT: ;;#ASMEND ; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3 ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 ; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, s32 ; 4-byte Folded Reload ; DAGISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5] ; @@ -269,12 +273,14 @@ ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 ; DAGISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop ; DAGISEL-GFX10-NEXT: ;;#ASMEND ; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 ; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s32 ; 4-byte Folded Reload ; DAGISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"() @@ -288,13 +294,14 @@ ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 ; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill ; GISEL-GFX11-NEXT: ;;#ASMSTART ; GISEL-GFX11-NEXT: s_nop ; GISEL-GFX11-NEXT: ;;#ASMEND ; GISEL-GFX11-NEXT: s_mov_b32 s0, s3 -; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 ; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, s32 ; 4-byte Folded Reload ; GISEL-GFX11-NEXT: s_getpc_b64 s[4:5] ; GISEL-GFX11-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 ; GISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 @@ -307,12 +314,14 @@ ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 ; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill ; GISEL-GFX10-NEXT: ;;#ASMSTART ; GISEL-GFX10-NEXT: s_nop ; GISEL-GFX10-NEXT: ;;#ASMEND ; GISEL-GFX10-NEXT: s_mov_b32 s0, s3 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 ; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s32 ; 4-byte Folded Reload ; GISEL-GFX10-NEXT: s_getpc_b64 s[4:5] ; GISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 ; GISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 @@ -329,12 +338,14 @@ ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 ; DAGISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill ; DAGISEL-GFX11-NEXT: ;;#ASMSTART ; DAGISEL-GFX11-NEXT: s_nop ; DAGISEL-GFX11-NEXT: ;;#ASMEND ; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3 ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 ; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, s32 ; 4-byte Folded Reload ; DAGISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5] ; @@ -347,12 +358,14 @@ ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 ; DAGISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop ; DAGISEL-GFX10-NEXT: ;;#ASMEND ; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 ; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s32 ; 4-byte Folded Reload ; DAGISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"() @@ -360,10 +373,119 @@ unreachable } +define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b) { +; GISEL-GFX11-LABEL: chain_preserve_to_chain_wwm: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill +; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3 +; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4 +; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GISEL-GFX11-NEXT: ;;#ASMSTART +; GISEL-GFX11-NEXT: s_nop +; GISEL-GFX11-NEXT: ;;#ASMEND +; GISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v2 +; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, s32 ; 4-byte Folded Reload +; GISEL-GFX11-NEXT: s_getpc_b64 s[4:5] +; GISEL-GFX11-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; GISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; GISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; GISEL-GFX10-LABEL: chain_preserve_to_chain_wwm: +; GISEL-GFX10: ; %bb.0: +; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill +; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3 +; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4 +; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL-GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GISEL-GFX10-NEXT: ;;#ASMSTART +; GISEL-GFX10-NEXT: s_nop +; GISEL-GFX10-NEXT: ;;#ASMEND +; GISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s32 ; 4-byte Folded Reload +; GISEL-GFX10-NEXT: s_getpc_b64 s[4:5] +; GISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; GISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_wwm: +; DAGISEL-GFX11: ; %bb.0: +; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_getpc_b64 s[4:5] +; DAGISEL-GFX11-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; DAGISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill +; DAGISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3 +; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4 +; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v2, v1 +; DAGISEL-GFX11-NEXT: ;;#ASMSTART +; DAGISEL-GFX11-NEXT: s_nop +; DAGISEL-GFX11-NEXT: ;;#ASMEND +; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v2 +; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, s32 ; 4-byte Folded Reload +; DAGISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_wwm: +; DAGISEL-GFX10: ; %bb.0: +; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_getpc_b64 s[4:5] +; DAGISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; DAGISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill +; DAGISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3 +; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4 +; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, v1 +; DAGISEL-GFX10-NEXT: ;;#ASMSTART +; DAGISEL-GFX10-NEXT: s_nop +; DAGISEL-GFX10-NEXT: ;;#ASMEND +; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v2 +; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s32 ; 4-byte Folded Reload +; DAGISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] + %i = call i32 @llvm.amdgcn.set.inactive(i32 3, i32 4) + call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"() + %w = call i32 @llvm.amdgcn.wwm(i32 %i) + %c = insertelement <3 x i32> %b, i32 %w, i32 0 + call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v3i32(ptr @chain_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %c, i32 0) + unreachable +} + define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x i32> inreg %a, <3 x i32> %b) { ; GISEL-GFX11-LABEL: chain_preserve_to_chain_use_all_v0_v7: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: s_clause 0x1 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v11, s32 offset:4 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 ; GISEL-GFX11-NEXT: ;;#ASMSTART @@ -373,6 +495,9 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v11 ; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX11-NEXT: s_clause 0x1 +; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, s32 +; GISEL-GFX11-NEXT: scratch_load_b32 v11, off, s32 offset:4 ; GISEL-GFX11-NEXT: s_getpc_b64 s[4:5] ; GISEL-GFX11-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 ; GISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 @@ -383,6 +508,8 @@ ; GISEL-GFX10-LABEL: chain_preserve_to_chain_use_all_v0_v7: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX10-NEXT: buffer_store_dword v11, off, s[48:51], s32 offset:4 ; 4-byte Folded Spill +; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill ; GISEL-GFX10-NEXT: v_mov_b32_e32 v11, v8 ; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 ; GISEL-GFX10-NEXT: ;;#ASMSTART @@ -391,6 +518,9 @@ ; GISEL-GFX10-NEXT: s_mov_b32 s0, s3 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v11 ; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX10-NEXT: s_clause 0x1 +; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s32 +; GISEL-GFX10-NEXT: buffer_load_dword v11, off, s[48:51], s32 offset:4 ; GISEL-GFX10-NEXT: s_getpc_b64 s[4:5] ; GISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 ; GISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 @@ -404,15 +534,22 @@ ; DAGISEL-GFX11-NEXT: s_getpc_b64 s[4:5] ; DAGISEL-GFX11-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 ; DAGISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 -; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v11, v8 +; DAGISEL-GFX11-NEXT: s_clause 0x1 +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v11, s32 offset:4 +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; DAGISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v11, v8 ; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX11-NEXT: ;;#ASMSTART ; DAGISEL-GFX11-NEXT: s_nop ; DAGISEL-GFX11-NEXT: ;;#ASMEND ; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; DAGISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v11 ; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX11-NEXT: s_clause 0x1 +; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, s32 +; DAGISEL-GFX11-NEXT: scratch_load_b32 v11, off, s32 offset:4 ; DAGISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5] ; @@ -422,8 +559,10 @@ ; DAGISEL-GFX10-NEXT: s_getpc_b64 s[4:5] ; DAGISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 ; DAGISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 -; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v11, v8 +; DAGISEL-GFX10-NEXT: buffer_store_dword v11, off, s[48:51], s32 offset:4 ; 4-byte Folded Spill +; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill ; DAGISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v11, v8 ; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop @@ -431,6 +570,9 @@ ; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v11 ; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX10-NEXT: s_clause 0x1 +; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s32 +; DAGISEL-GFX10-NEXT: buffer_load_dword v11, off, s[48:51], s32 offset:4 ; DAGISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] call void asm "s_nop", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v16},~{s0}"() @@ -444,13 +586,14 @@ ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 ; GISEL-GFX11-NEXT: s_mov_b32 s2, s0 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill ; GISEL-GFX11-NEXT: ;;#ASMSTART ; GISEL-GFX11-NEXT: s_nop ; GISEL-GFX11-NEXT: ;;#ASMEND ; GISEL-GFX11-NEXT: s_mov_b32 s0, s2 -; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 ; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, s32 ; 4-byte Folded Reload ; GISEL-GFX11-NEXT: s_getpc_b64 s[2:3] ; GISEL-GFX11-NEXT: s_add_u32 s2, s2, chain_preserve_callee_2@gotpcrel32@lo+4 ; GISEL-GFX11-NEXT: s_addc_u32 s3, s3, chain_preserve_callee_2@gotpcrel32@hi+12 @@ -463,12 +606,14 @@ ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 ; GISEL-GFX10-NEXT: s_mov_b32 s2, s0 +; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill ; GISEL-GFX10-NEXT: ;;#ASMSTART ; GISEL-GFX10-NEXT: s_nop ; GISEL-GFX10-NEXT: ;;#ASMEND ; GISEL-GFX10-NEXT: s_mov_b32 s0, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 ; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s32 ; 4-byte Folded Reload ; GISEL-GFX10-NEXT: s_getpc_b64 s[2:3] ; GISEL-GFX10-NEXT: s_add_u32 s2, s2, chain_preserve_callee_2@gotpcrel32@lo+4 ; GISEL-GFX10-NEXT: s_addc_u32 s3, s3, chain_preserve_callee_2@gotpcrel32@hi+12 @@ -485,12 +630,14 @@ ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 ; DAGISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; DAGISEL-GFX11-NEXT: s_mov_b32 s2, s0 +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill ; DAGISEL-GFX11-NEXT: ;;#ASMSTART ; DAGISEL-GFX11-NEXT: s_nop ; DAGISEL-GFX11-NEXT: ;;#ASMEND ; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s2 ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 ; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, s32 ; 4-byte Folded Reload ; DAGISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5] ; @@ -503,12 +650,14 @@ ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 ; DAGISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; DAGISEL-GFX10-NEXT: s_mov_b32 s2, s0 +; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop ; DAGISEL-GFX10-NEXT: ;;#ASMEND ; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s2 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 ; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s32 ; 4-byte Folded Reload ; DAGISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] %s = shufflevector <3 x i32> %a, <3 x i32> zeroinitializer, <2 x i32> @@ -527,3 +676,6 @@ declare void @llvm.amdgcn.cs.chain.v2i32(ptr, i32, <2 x i32>, <2 x i32>, i32, ...) declare amdgpu_cs_chain_preserve void @chain_preserve_callee_2(<2 x i32> inreg, <2 x i32>) + +declare i32 @llvm.amdgcn.set.inactive(i32, i32) +declare i32 @llvm.amdgcn.wwm(i32) diff --git a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir @@ -0,0 +1,282 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass=prologepilog -o - %s | FileCheck -check-prefix=GCN %s + +--- | + declare amdgpu_cs_chain void @callee() + declare amdgpu_gfx void @gfx_callee() + + define amdgpu_cs_chain_preserve void @preserve_active_lanes_above_args() {ret void} + define amdgpu_cs_chain_preserve void @preserve_all_lanes_wwm_above_args() {ret void} + define amdgpu_cs_chain_preserve void @dont_preserve_args() {ret void} + define amdgpu_cs_chain_preserve void @preserve_inactive_lanes_wwm_args() {ret void} + define amdgpu_cs_chain_preserve void @dont_preserve_if_no_chain_calls() {ret void} + define amdgpu_cs_chain_preserve void @dont_preserve_v0_v7() {ret void} + define amdgpu_cs_chain_preserve void @dont_preserve_sgpr() {ret void} +... +--- + +# NOTE: Since we don't know what the args are, we rely on the fact that we can't +# call llvm.amdgcn.cs.chain with more parameters than we received - so anything +# that is used by the SI_TCRETURN is assumed to have been an arg and therefore +# not preserved. + +--- +name: preserve_active_lanes_above_args +tracksRegLiveness: true +frameInfo: + hasTailCall: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + isChainFunction: true + returnsVoid: true +body: | + bb.0: + liveins: $sgpr0, $vgpr8, $vgpr9 + + ; GCN-LABEL: name: preserve_active_lanes_above_args + ; GCN: liveins: $sgpr0, $vgpr8, $vgpr9, $vgpr10 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr10, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: renamable $vgpr10 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: $vgpr8 = COPY killed renamable $vgpr10 + ; GCN-NEXT: $exec_lo = S_MOV_B32 -1 + ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GCN-NEXT: $vgpr10 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: SI_TCRETURN killed renamable $sgpr4_sgpr5, @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 + renamable $vgpr10 = V_MOV_B32_e32 10, implicit $exec + $vgpr8 = COPY renamable killed $vgpr10 + $exec_lo = S_MOV_B32 -1 + renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + SI_TCRETURN killed renamable $sgpr4_sgpr5, @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 + +... + +--- +name: preserve_all_lanes_wwm_above_args +tracksRegLiveness: true +frameInfo: + hasTailCall: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + isChainFunction: true + returnsVoid: true + wwmReservedRegs: + - '$vgpr11' +body: | + bb.0: + liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10 + + ; GCN-LABEL: name: preserve_all_lanes_wwm_above_args + ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr10, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr11, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 + ; GCN-NEXT: renamable $vgpr10 = V_WRITELANE_B32 $sgpr35, 0, killed $vgpr10 + ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 + ; GCN-NEXT: $sgpr35 = V_READLANE_B32 $vgpr10, 0 + ; GCN-NEXT: renamable $vgpr10 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: $vgpr8 = COPY killed renamable $vgpr10 + ; GCN-NEXT: $exec_lo = S_MOV_B32 -1 + ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $vgpr10 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $vgpr11 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 + ; GCN-NEXT: SI_TCRETURN killed renamable $sgpr4_sgpr5, @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 + renamable $vgpr10 = V_WRITELANE_B32 $sgpr35, 0, killed $vgpr10 + $sgpr35 = S_MOV_B32 5 + $sgpr35 = V_READLANE_B32 $vgpr10, 0 + renamable $vgpr10 = V_MOV_B32_e32 10, implicit $exec + $vgpr8 = COPY renamable killed $vgpr10 + $exec_lo = S_MOV_B32 -1 + renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + SI_TCRETURN killed renamable $sgpr4_sgpr5, @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 + +... + +--- +name: dont_preserve_args +tracksRegLiveness: true +frameInfo: + hasTailCall: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + isChainFunction: true + returnsVoid: true +body: | + bb.0: + liveins: $sgpr0, $vgpr8, $vgpr9 + + ; GCN-LABEL: name: dont_preserve_args + ; GCN: liveins: $sgpr0, $vgpr8, $vgpr9 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: $exec_lo = S_MOV_B32 -1 + ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GCN-NEXT: SI_TCRETURN killed renamable $sgpr4_sgpr5, @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 + renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec + $exec_lo = S_MOV_B32 -1 + renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + SI_TCRETURN killed renamable $sgpr4_sgpr5, @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 + +... + +--- +name: preserve_inactive_lanes_wwm_args +tracksRegLiveness: true +frameInfo: + hasTailCall: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + isChainFunction: true + returnsVoid: true + wwmReservedRegs: + - '$vgpr9' +body: | + bb.0: + liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10 + + ; GCN-LABEL: name: preserve_inactive_lanes_wwm_args + ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr8, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr9, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 + ; GCN-NEXT: renamable $vgpr8 = V_WRITELANE_B32 $sgpr35, 0, killed $vgpr8 + ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 + ; GCN-NEXT: $sgpr35 = V_READLANE_B32 $vgpr8, 0 + ; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: $exec_lo = S_MOV_B32 -1 + ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr9(tied-def 0) :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 + ; GCN-NEXT: SI_TCRETURN killed renamable $sgpr4_sgpr5, @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 + renamable $vgpr8 = V_WRITELANE_B32 $sgpr35, 0, killed $vgpr8 + $sgpr35 = S_MOV_B32 5 + $sgpr35 = V_READLANE_B32 $vgpr8, 0 + renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec + $exec_lo = S_MOV_B32 -1 + renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + SI_TCRETURN killed renamable $sgpr4_sgpr5, @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 + +... + +--- +name: dont_preserve_if_no_chain_calls +tracksRegLiveness: true +frameInfo: + hasTailCall: false +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + isChainFunction: true + returnsVoid: true + wwmReservedRegs: + - '$vgpr9' +body: | + bb.0: + liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr8, $vgpr9 + + ; GCN-LABEL: name: dont_preserve_if_no_chain_calls + ; GCN: liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr8, $vgpr9 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr8 = V_WRITELANE_B32 $sgpr35, 0, killed $vgpr8 + ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 + ; GCN-NEXT: $sgpr35 = V_READLANE_B32 $vgpr8, 0 + ; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: $vgpr9 = V_MOV_B32_e32 20, implicit $exec + ; GCN-NEXT: $vgpr10 = V_MOV_B32_e32 30, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + renamable $vgpr8 = V_WRITELANE_B32 $sgpr35, 0, killed $vgpr8 + $sgpr35 = S_MOV_B32 5 + $sgpr35 = V_READLANE_B32 $vgpr8, 0 + renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec + $vgpr9 = V_MOV_B32_e32 20, implicit $exec + $vgpr10 = V_MOV_B32_e32 30, implicit $exec + S_ENDPGM 0 +... + +--- +name: dont_preserve_v0_v7 +tracksRegLiveness: true +frameInfo: + hasTailCall: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + isChainFunction: true + returnsVoid: true + wwmReservedRegs: + - '$vgpr1' +body: | + bb.0: + liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr8 + + ; GCN-LABEL: name: dont_preserve_v0_v7 + ; GCN: liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr8 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr35, 0, killed $vgpr0 + ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 + ; GCN-NEXT: $sgpr35 = V_READLANE_B32 $vgpr0, 0 + ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: renamable $vgpr7 = V_MOV_B32_e32 16, implicit $exec + ; GCN-NEXT: renamable $vgpr8 = COPY killed renamable $vgpr0 + ; GCN-NEXT: renamable $vgpr9 = COPY killed renamable $vgpr7 + ; GCN-NEXT: $exec_lo = S_MOV_B32 -1 + ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GCN-NEXT: SI_TCRETURN killed renamable $sgpr4_sgpr5, @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 + renamable $vgpr0 = V_WRITELANE_B32 $sgpr35, 0, killed $vgpr0 + $sgpr35 = S_MOV_B32 5 + $sgpr35 = V_READLANE_B32 $vgpr0, 0 + renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec + renamable $vgpr7 = V_MOV_B32_e32 16, implicit $exec + renamable $vgpr8 = COPY killed renamable $vgpr0 + renamable $vgpr9 = COPY killed renamable $vgpr7 + $exec_lo = S_MOV_B32 -1 + renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + SI_TCRETURN killed renamable $sgpr4_sgpr5, @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 +... + +--- +name: dont_preserve_sgpr +tracksRegLiveness: true +frameInfo: + hasTailCall: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + isChainFunction: true + returnsVoid: true +body: | + bb.0: + liveins: $sgpr0 + + ; GCN-LABEL: name: dont_preserve_sgpr + ; GCN: liveins: $sgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $sgpr1 = S_ADD_I32 killed renamable $sgpr0, renamable $sgpr0, implicit-def dead $scc + ; GCN-NEXT: $sgpr0 = COPY killed renamable $sgpr1 + ; GCN-NEXT: $exec_lo = S_MOV_B32 -1 + ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GCN-NEXT: SI_TCRETURN killed renamable $sgpr4_sgpr5, @callee, 0, amdgpu_allvgprs, implicit $sgpr0 + renamable $sgpr1 = S_ADD_I32 killed renamable $sgpr0, renamable $sgpr0, implicit-def dead $scc + $sgpr0 = COPY killed renamable $sgpr1 + $exec_lo = S_MOV_B32 -1 + renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + SI_TCRETURN killed renamable $sgpr4_sgpr5, @callee, 0, amdgpu_allvgprs, implicit $sgpr0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir @@ -0,0 +1,242 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass=prologepilog -o - %s | FileCheck -check-prefix=GCN %s + +# We're keeping the IR around for the callees and the CCs + +--- | + declare amdgpu_cs_chain void @callee() + declare amdgpu_gfx void @gfx_callee() + + define amdgpu_cs_chain void @preserve_inactive_wwm() {ret void} + define amdgpu_cs_chain void @preserve_inactive_detected_wwm() {ret void} + define amdgpu_cs_chain void @dont_preserve_wwm_if_no_chain_calls() {ret void} + define amdgpu_cs_chain void @dont_preserve_non_wwm() {ret void} + define amdgpu_cs_chain void @dont_preserve_v0_v7() {ret void} + define amdgpu_cs_chain void @dont_preserve_sgpr() {ret void} +... +--- + +# Check that we preserve the inactive lanes of registers v8+ received in the +# MachineFunctionInfo as wwmReservedRegs. + +--- +name: preserve_inactive_wwm +tracksRegLiveness: true +frameInfo: + hasTailCall: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + returnsVoid: true + wwmReservedRegs: + - '$vgpr8' + - '$vgpr9' +body: | + bb.0: + liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9 + + ; GCN-LABEL: name: preserve_inactive_wwm + ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr8, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr9, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 + ; GCN-NEXT: $exec_lo = S_MOV_B32 -1 + ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 + ; GCN-NEXT: SI_TCRETURN killed renamable $sgpr4_sgpr5, @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 + $exec_lo = S_MOV_B32 -1 + renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + SI_TCRETURN killed renamable $sgpr4_sgpr5, @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 + +... + +# Check that it also works for WWM registers detected as operands of V_WRITELANE_B32 and V_READLANE_B32. + +--- +name: preserve_inactive_detected_wwm +tracksRegLiveness: true +frameInfo: + hasTailCall: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + returnsVoid: true +body: | + bb.0: + liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9 + + ; GCN-LABEL: name: preserve_inactive_detected_wwm + ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr8, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr9, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 + ; GCN-NEXT: renamable $vgpr8 = V_WRITELANE_B32 $sgpr35, 0, killed $vgpr8 + ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 + ; GCN-NEXT: $sgpr35 = V_READLANE_B32 $vgpr8, 0 + ; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: renamable $vgpr9 = V_WRITELANE_B32 $sgpr35, 0, killed $vgpr9 + ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 + ; GCN-NEXT: $sgpr35 = V_READLANE_B32 $vgpr9, 0 + ; GCN-NEXT: renamable $vgpr9 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: $exec_lo = S_MOV_B32 -1 + ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 + ; GCN-NEXT: SI_TCRETURN killed renamable $sgpr4_sgpr5, @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 + renamable $vgpr8 = V_WRITELANE_B32 $sgpr35, 0, killed $vgpr8 + $sgpr35 = S_MOV_B32 5 + $sgpr35 = V_READLANE_B32 $vgpr8, 0 + renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec + renamable $vgpr9 = V_WRITELANE_B32 $sgpr35, 0, killed $vgpr9 + $sgpr35 = S_MOV_B32 5 + $sgpr35 = V_READLANE_B32 $vgpr9, 0 + renamable $vgpr9 = V_MOV_B32_e32 10, implicit $exec + $exec_lo = S_MOV_B32 -1 + renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + SI_TCRETURN killed renamable $sgpr4_sgpr5, @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 + +... + +--- +name: dont_preserve_wwm_if_no_chain_calls +tracksRegLiveness: true +frameInfo: + hasTailCall: false +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + returnsVoid: true + wwmReservedRegs: + - '$vgpr9' +body: | + bb.0: + liveins: $sgpr35, $vgpr8 + + ; GCN-LABEL: name: dont_preserve_wwm_if_no_chain_calls + ; GCN: liveins: $sgpr35, $vgpr8 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr8 = V_WRITELANE_B32 $sgpr35, 0, killed $vgpr8 + ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 + ; GCN-NEXT: $sgpr35 = V_READLANE_B32 $vgpr8, 0 + ; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + renamable $vgpr8 = V_WRITELANE_B32 $sgpr35, 0, killed $vgpr8 + $sgpr35 = S_MOV_B32 5 + $sgpr35 = V_READLANE_B32 $vgpr8, 0 + renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec + S_ENDPGM 0 +... + +--- +name: dont_preserve_non_wwm +tracksRegLiveness: true +frameInfo: + hasTailCall: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + isChainFunction: true + returnsVoid: true +body: | + bb.0: + liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr8, $vgpr16 + + ; GCN-LABEL: name: dont_preserve_non_wwm + ; GCN: liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr8, $vgpr16 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr16 = V_MOV_B32_e32 16, implicit $exec + ; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 8, implicit $exec + ; GCN-NEXT: $exec_lo = S_MOV_B32 -1 + ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GCN-NEXT: SI_TCRETURN killed renamable $sgpr4_sgpr5, @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 + renamable $vgpr16 = V_MOV_B32_e32 16, implicit $exec + renamable $vgpr8 = V_MOV_B32_e32 8, implicit $exec + $exec_lo = S_MOV_B32 -1 + renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + SI_TCRETURN killed renamable $sgpr4_sgpr5, @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 + +... + +--- +name: dont_preserve_v0_v7 +tracksRegLiveness: true +frameInfo: + hasTailCall: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + isChainFunction: true + returnsVoid: true + wwmReservedRegs: + - '$vgpr1' +body: | + bb.0: + liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr7, $vgpr8, $vgpr9 + + ; GCN-LABEL: name: dont_preserve_v0_v7 + ; GCN: liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr7, $vgpr8, $vgpr9 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr35, 0, killed $vgpr0 + ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 + ; GCN-NEXT: $sgpr35 = V_READLANE_B32 $vgpr0, 0 + ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: renamable $vgpr7 = V_MOV_B32_e32 16, implicit $exec + ; GCN-NEXT: renamable $vgpr8 = COPY killed renamable $vgpr0 + ; GCN-NEXT: renamable $vgpr9 = COPY killed renamable $vgpr7 + ; GCN-NEXT: $exec_lo = S_MOV_B32 -1 + ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GCN-NEXT: SI_TCRETURN killed renamable $sgpr4_sgpr5, @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 + renamable $vgpr0 = V_WRITELANE_B32 $sgpr35, 0, killed $vgpr0 + $sgpr35 = S_MOV_B32 5 + $sgpr35 = V_READLANE_B32 $vgpr0, 0 + renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec + renamable $vgpr7 = V_MOV_B32_e32 16, implicit $exec + renamable $vgpr8 = COPY killed renamable $vgpr0 + renamable $vgpr9 = COPY killed renamable $vgpr7 + $exec_lo = S_MOV_B32 -1 + renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + SI_TCRETURN killed renamable $sgpr4_sgpr5, @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 + +... + +--- +name: dont_preserve_sgpr +tracksRegLiveness: true +frameInfo: + hasTailCall: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + returnsVoid: true +body: | + bb.0 (%ir-block.0): + liveins: $sgpr0 + + ; GCN-LABEL: name: dont_preserve_sgpr + ; GCN: liveins: $sgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $sgpr1 = S_ADD_I32 killed renamable $sgpr0, renamable $sgpr0, implicit-def dead $scc + ; GCN-NEXT: $sgpr0 = COPY killed renamable $sgpr1 + ; GCN-NEXT: $exec_lo = S_MOV_B32 -1 + ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GCN-NEXT: SI_TCRETURN killed renamable $sgpr4_sgpr5, @callee, 0, amdgpu_allvgprs, implicit $sgpr0 + renamable $sgpr1 = S_ADD_I32 killed renamable $sgpr0, renamable $sgpr0, implicit-def dead $scc + $sgpr0 = COPY killed renamable $sgpr1 + $exec_lo = S_MOV_B32 -1 + renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + SI_TCRETURN killed renamable $sgpr4_sgpr5, @callee, 0, amdgpu_allvgprs, implicit $sgpr0 + +...