diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1090,6 +1090,20 @@ // to determine the end of the prologue. DebugLoc DL; + if (FuncInfo->isChainFunction()) { + // Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but + // are free to set one up if they need it. + // FIXME: We shouldn't need to set SP just for the stack objects (we should + // use 0 as an immediate offset instead). + bool UseSP = requiresStackPointerReference(MF) || MFI.hasStackObjects(); + if (UseSP) { + assert(StackPtrReg != AMDGPU::SP_REG); + + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B32), StackPtrReg) + .addImm(0); + } + } + bool HasFP = false; bool HasBP = false; uint32_t NumBytes = MFI.getStackSize(); @@ -1808,11 +1822,16 @@ // register. We may need to initialize the stack pointer depending on the frame // properties, which logically overlaps many of the cases where an ordinary // function would require an FP. +// Also used for chain functions. While not technically entry functions, chain +// functions may need to set up a stack pointer in some situations. bool SIFrameLowering::requiresStackPointerReference( const MachineFunction &MF) const { + bool IsChainFunction = MF.getInfo<SIMachineFunctionInfo>()->isChainFunction(); + // Callable functions always require a stack pointer reference. 
- assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() && - "only expected to call this for entry points"); + assert((MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() || + IsChainFunction) && + "only expected to call this for entry points and chain functions"); const MachineFrameInfo &MFI = MF.getFrameInfo(); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -733,12 +733,12 @@ bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const { const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - // On entry, the base address is 0, so it can't possibly need any more - // alignment. + // On entry or in chain functions, the base address is 0, so it can't possibly + // need any more alignment. // FIXME: Should be able to specify the entry frame alignment per calling // convention instead. - if (Info->isEntryFunction()) + if (Info->isEntryFunction() || Info->isChainFunction()) return false; return TargetRegisterInfo::shouldRealignStack(MF); diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll @@ -29,8 +29,6 @@ ret void } -; FIXME: Setup s32. 
- define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr, <4 x i32> %vgpr) { ; GISEL-GFX11-LABEL: amdgpu_cs_chain_simple_call: ; GISEL-GFX11: ; %bb.0: @@ -41,7 +39,7 @@ ; GISEL-GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GISEL-GFX11-NEXT: s_mov_b32 s4, use@abs32@lo ; GISEL-GFX11-NEXT: s_mov_b32 s5, use@abs32@hi -; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-GFX11-NEXT: s_mov_b32 s32, 0 ; GISEL-GFX11-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GISEL-GFX11-NEXT: s_endpgm ; @@ -60,6 +58,7 @@ ; GISEL-GFX10-NEXT: s_mov_b32 s4, use@abs32@lo ; GISEL-GFX10-NEXT: s_mov_b32 s5, use@abs32@hi ; GISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51] +; GISEL-GFX10-NEXT: s_mov_b32 s32, 0 ; GISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GISEL-GFX10-NEXT: s_endpgm ; @@ -72,7 +71,7 @@ ; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; DAGISEL-GFX11-NEXT: s_mov_b32 s5, use@abs32@hi ; DAGISEL-GFX11-NEXT: s_mov_b32 s4, use@abs32@lo -; DAGISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; DAGISEL-GFX11-NEXT: s_mov_b32 s32, 0 ; DAGISEL-GFX11-NEXT: s_swappc_b64 s[30:31], s[4:5] ; DAGISEL-GFX11-NEXT: s_endpgm ; @@ -91,18 +90,19 @@ ; DAGISEL-GFX10-NEXT: s_mov_b32 s5, use@abs32@hi ; DAGISEL-GFX10-NEXT: s_mov_b32 s4, use@abs32@lo ; DAGISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51] +; DAGISEL-GFX10-NEXT: s_mov_b32 s32, 0 ; DAGISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; DAGISEL-GFX10-NEXT: s_endpgm call amdgpu_gfx void @use(<4 x i32> %sgpr, <4 x i32> %vgpr) ret void } -; FIXME: Setup s32. 
- define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24 x i32> %vgprs) { ; GISEL-GFX11-LABEL: amdgpu_cs_chain_spill: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: s_mov_b32 s32, 0 +; GISEL-GFX11-NEXT: v_dual_mov_b32 v32, v8 :: v_dual_mov_b32 v33, v9 ; GISEL-GFX11-NEXT: s_add_u32 s24, s32, 4 ; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; GISEL-GFX11-NEXT: scratch_store_b32 off, v17, s24 @@ -123,7 +123,6 @@ ; GISEL-GFX11-NEXT: scratch_store_b32 off, v24, s24 ; GISEL-GFX11-NEXT: scratch_store_b32 off, v25, s25 ; GISEL-GFX11-NEXT: s_add_u32 s24, s32, 40 -; GISEL-GFX11-NEXT: v_dual_mov_b32 v32, v8 :: v_dual_mov_b32 v33, v9 ; GISEL-GFX11-NEXT: v_dual_mov_b32 v34, v10 :: v_dual_mov_b32 v35, v11 ; GISEL-GFX11-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v37, v13 ; GISEL-GFX11-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v39, v15 @@ -171,6 +170,7 @@ ; GISEL-GFX10-NEXT: v_mov_b32_e32 v37, v13 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v38, v14 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v39, v15 +; GISEL-GFX10-NEXT: s_mov_b32 s32, 0 ; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; GISEL-GFX10-NEXT: buffer_store_dword v17, off, s[48:51], s32 offset:4 ; GISEL-GFX10-NEXT: buffer_store_dword v18, off, s[48:51], s32 offset:8 @@ -229,6 +229,8 @@ ; DAGISEL-GFX11-LABEL: amdgpu_cs_chain_spill: ; DAGISEL-GFX11: ; %bb.0: ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_mov_b32 s32, 0 +; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v32, v15 :: v_dual_mov_b32 v33, v14 ; DAGISEL-GFX11-NEXT: s_add_i32 s24, s32, 60 ; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v31, s24 @@ -249,7 +251,6 @@ ; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v24, s24 ; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v23, s25 ; DAGISEL-GFX11-NEXT: s_add_i32 s24, s32, 24 -; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v32, v15 :: v_dual_mov_b32 v33, v14 ; 
DAGISEL-GFX11-NEXT: v_dual_mov_b32 v34, v13 :: v_dual_mov_b32 v35, v12 ; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v36, v11 :: v_dual_mov_b32 v37, v10 ; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v38, v9 :: v_dual_mov_b32 v39, v8 @@ -297,6 +298,7 @@ ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v37, v10 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v38, v9 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v39, v8 +; DAGISEL-GFX10-NEXT: s_mov_b32 s32, 0 ; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; DAGISEL-GFX10-NEXT: buffer_store_dword v17, off, s[48:51], s32 offset:4 ; DAGISEL-GFX10-NEXT: buffer_store_dword v18, off, s[48:51], s32 offset:8 @@ -796,6 +798,81 @@ unreachable } +define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) { +; GISEL-GFX11-LABEL: amdgpu_cs_chain_dont_realign_stack: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: s_mov_b32 s3, 4 +; GISEL-GFX11-NEXT: s_mov_b32 s2, 3 +; GISEL-GFX11-NEXT: s_mov_b32 s1, 2 +; GISEL-GFX11-NEXT: s_mov_b32 s0, 1 +; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v8 +; GISEL-GFX11-NEXT: s_mov_b32 s32, 0 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GISEL-GFX11-NEXT: v_add_nc_u32_e32 v4, s32, v0 +; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GISEL-GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc +; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX11-NEXT: s_endpgm +; +; GISEL-GFX10-LABEL: amdgpu_cs_chain_dont_realign_stack: +; GISEL-GFX10: ; %bb.0: +; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX10-NEXT: s_mov_b32 s32, 0 +; GISEL-GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v8 +; GISEL-GFX10-NEXT: v_lshrrev_b32_e64 v2, 5, s32 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 1 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v3, 3 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v4, 4 +; GISEL-GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0 +; GISEL-GFX10-NEXT: 
v_mov_b32_e32 v2, 2 +; GISEL-GFX10-NEXT: buffer_store_dword v1, v0, s[48:51], 0 offen +; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX10-NEXT: buffer_store_dword v2, v0, s[48:51], 0 offen offset:4 +; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX10-NEXT: buffer_store_dword v3, v0, s[48:51], 0 offen offset:8 +; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX10-NEXT: buffer_store_dword v4, v0, s[48:51], 0 offen offset:12 +; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX10-NEXT: s_endpgm +; +; DAGISEL-GFX11-LABEL: amdgpu_cs_chain_dont_realign_stack: +; DAGISEL-GFX11: ; %bb.0: +; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_mov_b32 s32, 0 +; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 +; DAGISEL-GFX11-NEXT: v_lshl_add_u32 v4, v8, 4, s32 +; DAGISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc +; DAGISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; DAGISEL-GFX11-NEXT: s_endpgm +; +; DAGISEL-GFX10-LABEL: amdgpu_cs_chain_dont_realign_stack: +; DAGISEL-GFX10: ; %bb.0: +; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_mov_b32 s32, 0 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v0, 4 +; DAGISEL-GFX10-NEXT: v_lshrrev_b32_e64 v2, 5, s32 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v3, 2 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v4, 1 +; DAGISEL-GFX10-NEXT: v_lshl_add_u32 v1, v8, 4, v2 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, 3 +; DAGISEL-GFX10-NEXT: buffer_store_dword v0, v1, s[48:51], 0 offen offset:12 +; DAGISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; DAGISEL-GFX10-NEXT: buffer_store_dword v2, v1, s[48:51], 0 offen offset:8 +; DAGISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; DAGISEL-GFX10-NEXT: buffer_store_dword v3, v1, s[48:51], 0 offen offset:4 +; DAGISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; DAGISEL-GFX10-NEXT: buffer_store_dword v4, v1, s[48:51], 0 offen +; DAGISEL-GFX10-NEXT: 
s_waitcnt_vscnt null, 0x0 +; DAGISEL-GFX10-NEXT: s_endpgm + %alloca.align32 = alloca [8 x <4 x i32>], align 32, addrspace(5) + %gep0 = getelementptr inbounds [8 x <4 x i32>], ptr addrspace(5) %alloca.align32, i32 0, i32 %idx + store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %gep0, align 32 + ret void +} + declare void @llvm.amdgcn.cs.chain.v2i32(ptr, i32, <2 x i32>, <2 x i32>, i32, ...) declare void @llvm.amdgcn.cs.chain.v3i32(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) declare void @llvm.amdgcn.cs.chain.v4i32(ptr, i32, <4 x i32>, <4 x i32>, i32, ...) diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll @@ -177,14 +177,13 @@ unreachable } -; FIXME: Setup s32. - define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) { ; GISEL-GFX11-LABEL: chain_preserve_to_chain_preserve: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill +; GISEL-GFX11-NEXT: s_mov_b32 s32, 0 ; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 ; GISEL-GFX11-NEXT: ;;#ASMSTART ; GISEL-GFX11-NEXT: s_nop @@ -200,8 +199,9 @@ ; GISEL-GFX10-LABEL: chain_preserve_to_chain_preserve: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill +; GISEL-GFX10-NEXT: s_mov_b32 s32, 0 ; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 ; GISEL-GFX10-NEXT: ;;#ASMSTART ; GISEL-GFX10-NEXT: s_nop @@ -217,15 +217,16 @@ ; DAGISEL-GFX11-LABEL: 
chain_preserve_to_chain_preserve: ; DAGISEL-GFX11: ; %bb.0: ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill +; DAGISEL-GFX11-NEXT: s_mov_b32 s32, 0 ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill ; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX11-NEXT: ;;#ASMSTART ; DAGISEL-GFX11-NEXT: s_nop ; DAGISEL-GFX11-NEXT: ;;#ASMEND ; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, s32 ; 4-byte Folded Reload -; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_preserve_callee@abs32@hi ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 +; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_preserve_callee@abs32@hi ; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_preserve_callee@abs32@lo ; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3 ; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 @@ -234,15 +235,16 @@ ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_preserve: ; DAGISEL-GFX10: ; %bb.0: ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill +; DAGISEL-GFX10-NEXT: s_mov_b32 s32, 0 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 +; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill ; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop ; DAGISEL-GFX10-NEXT: ;;#ASMEND ; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s32 ; 4-byte Folded Reload -; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_preserve_callee@abs32@hi ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 +; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_preserve_callee@abs32@hi ; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_preserve_callee@abs32@lo ; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3 ; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 @@ -256,8 +258,9 @@ ; GISEL-GFX11-LABEL: chain_preserve_to_chain: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill +; GISEL-GFX11-NEXT: s_mov_b32 s32, 0 ; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 ; GISEL-GFX11-NEXT: ;;#ASMSTART ; GISEL-GFX11-NEXT: s_nop @@ -273,8 +276,9 @@ ; GISEL-GFX10-LABEL: chain_preserve_to_chain: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill +; GISEL-GFX10-NEXT: s_mov_b32 s32, 0 ; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 ; GISEL-GFX10-NEXT: ;;#ASMSTART ; GISEL-GFX10-NEXT: s_nop @@ -290,15 +294,16 @@ ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain: ; DAGISEL-GFX11: ; %bb.0: ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill +; DAGISEL-GFX11-NEXT: s_mov_b32 s32, 0 ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill ; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX11-NEXT: ;;#ASMSTART ; DAGISEL-GFX11-NEXT: s_nop ; DAGISEL-GFX11-NEXT: ;;#ASMEND ; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, s32 ; 4-byte Folded Reload -; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 +; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi ; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo ; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3 ; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 @@ -307,15 +312,16 @@ ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain: ; DAGISEL-GFX10: ; %bb.0: ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded 
Spill +; DAGISEL-GFX10-NEXT: s_mov_b32 s32, 0 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 +; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill ; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop ; DAGISEL-GFX10-NEXT: ;;#ASMEND ; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s32 ; 4-byte Folded Reload -; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 +; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi ; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo ; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3 ; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 @@ -329,8 +335,9 @@ ; GISEL-GFX11-LABEL: chain_preserve_to_chain_wwm: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill +; GISEL-GFX11-NEXT: s_mov_b32 s32, 0 ; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3 ; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4 @@ -351,8 +358,9 @@ ; GISEL-GFX10-LABEL: chain_preserve_to_chain_wwm: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill +; GISEL-GFX10-NEXT: s_mov_b32 s32, 0 ; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3 ; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4 @@ -372,8 +380,9 @@ ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_wwm: ; DAGISEL-GFX11: ; %bb.0: ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill +; DAGISEL-GFX11-NEXT: 
s_mov_b32 s32, 0 ; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3 ; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4 @@ -394,8 +403,9 @@ ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_wwm: ; DAGISEL-GFX10: ; %bb.0: ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill +; DAGISEL-GFX10-NEXT: s_mov_b32 s32, 0 ; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3 ; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4 @@ -423,82 +433,86 @@ ; GISEL-GFX11-LABEL: chain_preserve_to_chain_use_all_v0_v7: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: s_mov_b32 s32, 0 +; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 ; GISEL-GFX11-NEXT: s_clause 0x1 ; GISEL-GFX11-NEXT: scratch_store_b32 off, v11, s32 offset:4 ; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v11, v8 -; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 ; GISEL-GFX11-NEXT: ;;#ASMSTART ; GISEL-GFX11-NEXT: s_nop ; GISEL-GFX11-NEXT: ;;#ASMEND ; GISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo ; GISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi +; GISEL-GFX11-NEXT: s_mov_b32 s0, s3 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v11 ; GISEL-GFX11-NEXT: s_clause 0x1 ; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, s32 ; GISEL-GFX11-NEXT: scratch_load_b32 v11, off, s32 offset:4 -; GISEL-GFX11-NEXT: s_mov_b32 s0, s3 ; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 ; GISEL-GFX11-NEXT: s_setpc_b64 s[4:5] ; ; GISEL-GFX10-LABEL: chain_preserve_to_chain_use_all_v0_v7: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX10-NEXT: 
s_mov_b32 s32, 0 +; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 ; GISEL-GFX10-NEXT: buffer_store_dword v11, off, s[48:51], s32 offset:4 ; 4-byte Folded Spill ; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill ; GISEL-GFX10-NEXT: v_mov_b32_e32 v11, v8 -; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 ; GISEL-GFX10-NEXT: ;;#ASMSTART ; GISEL-GFX10-NEXT: s_nop ; GISEL-GFX10-NEXT: ;;#ASMEND ; GISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo ; GISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi +; GISEL-GFX10-NEXT: s_mov_b32 s0, s3 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v11 ; GISEL-GFX10-NEXT: s_clause 0x1 ; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s32 ; GISEL-GFX10-NEXT: buffer_load_dword v11, off, s[48:51], s32 offset:4 -; GISEL-GFX10-NEXT: s_mov_b32 s0, s3 ; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 ; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5] ; ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_use_all_v0_v7: ; DAGISEL-GFX11: ; %bb.0: ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_mov_b32 s32, 0 +; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX11-NEXT: s_clause 0x1 ; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v11, s32 offset:4 ; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v11, v8 -; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX11-NEXT: ;;#ASMSTART ; DAGISEL-GFX11-NEXT: s_nop ; DAGISEL-GFX11-NEXT: ;;#ASMEND ; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi ; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo +; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3 ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v11 ; DAGISEL-GFX11-NEXT: s_clause 0x1 ; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, s32 ; DAGISEL-GFX11-NEXT: scratch_load_b32 v11, off, s32 offset:4 -; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3 ; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 ; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5] ; ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_use_all_v0_v7: ; DAGISEL-GFX10: ; 
%bb.0: ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_mov_b32 s32, 0 +; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX10-NEXT: buffer_store_dword v11, off, s[48:51], s32 offset:4 ; 4-byte Folded Spill ; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v11, v8 -; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop ; DAGISEL-GFX10-NEXT: ;;#ASMEND ; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi ; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo +; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v11 ; DAGISEL-GFX10-NEXT: s_clause 0x1 ; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s32 ; DAGISEL-GFX10-NEXT: buffer_load_dword v11, off, s[48:51], s32 offset:4 -; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3 ; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 ; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] call void asm "s_nop", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v16},~{s0}"() @@ -510,8 +524,9 @@ ; GISEL-GFX11-LABEL: chain_preserve_to_chain_preserve_fewer_args: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill +; GISEL-GFX11-NEXT: s_mov_b32 s32, 0 ; GISEL-GFX11-NEXT: s_mov_b32 s2, s0 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 ; GISEL-GFX11-NEXT: ;;#ASMSTART ; GISEL-GFX11-NEXT: s_nop @@ -527,8 +542,9 @@ ; GISEL-GFX10-LABEL: chain_preserve_to_chain_preserve_fewer_args: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill +; GISEL-GFX10-NEXT: s_mov_b32 s32, 0 ; GISEL-GFX10-NEXT: s_mov_b32 s2, s0 +; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte 
Folded Spill ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 ; GISEL-GFX10-NEXT: ;;#ASMSTART ; GISEL-GFX10-NEXT: s_nop @@ -544,15 +560,16 @@ ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_preserve_fewer_args: ; DAGISEL-GFX11: ; %bb.0: ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill +; DAGISEL-GFX11-NEXT: s_mov_b32 s32, 0 ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill ; DAGISEL-GFX11-NEXT: s_mov_b32 s2, s0 ; DAGISEL-GFX11-NEXT: ;;#ASMSTART ; DAGISEL-GFX11-NEXT: s_nop ; DAGISEL-GFX11-NEXT: ;;#ASMEND ; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, s32 ; 4-byte Folded Reload -; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_preserve_callee_2@abs32@hi ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 +; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_preserve_callee_2@abs32@hi ; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_preserve_callee_2@abs32@lo ; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s2 ; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 @@ -561,15 +578,16 @@ ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_preserve_fewer_args: ; DAGISEL-GFX10: ; %bb.0: ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill +; DAGISEL-GFX10-NEXT: s_mov_b32 s32, 0 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 +; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill ; DAGISEL-GFX10-NEXT: s_mov_b32 s2, s0 ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop ; DAGISEL-GFX10-NEXT: ;;#ASMEND ; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s32 ; 4-byte Folded Reload -; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_preserve_callee_2@abs32@hi ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 +; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_preserve_callee_2@abs32@hi ; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_preserve_callee_2@abs32@lo ; 
DAGISEL-GFX10-NEXT: s_mov_b32 s0, s2 ; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 @@ -584,6 +602,81 @@ ; Note that amdgpu_cs_chain_preserve functions are not allowed to call ; llvm.amdgcn.cs.chain with more vgpr args than they received as parameters. +define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_dont_realign_stack(i32 %idx) { +; GISEL-GFX11-LABEL: amdgpu_cs_chain_preserve_dont_realign_stack: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: s_mov_b32 s3, 4 +; GISEL-GFX11-NEXT: s_mov_b32 s2, 3 +; GISEL-GFX11-NEXT: s_mov_b32 s1, 2 +; GISEL-GFX11-NEXT: s_mov_b32 s0, 1 +; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v8 +; GISEL-GFX11-NEXT: s_mov_b32 s32, 0 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GISEL-GFX11-NEXT: v_add_nc_u32_e32 v4, s32, v0 +; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GISEL-GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc +; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX11-NEXT: s_endpgm +; +; GISEL-GFX10-LABEL: amdgpu_cs_chain_preserve_dont_realign_stack: +; GISEL-GFX10: ; %bb.0: +; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX10-NEXT: s_mov_b32 s32, 0 +; GISEL-GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v8 +; GISEL-GFX10-NEXT: v_lshrrev_b32_e64 v2, 5, s32 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 1 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v3, 3 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v4, 4 +; GISEL-GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v2, 2 +; GISEL-GFX10-NEXT: buffer_store_dword v1, v0, s[48:51], 0 offen +; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX10-NEXT: buffer_store_dword v2, v0, s[48:51], 0 offen offset:4 +; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX10-NEXT: buffer_store_dword v3, v0, s[48:51], 0 offen offset:8 +; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GISEL-GFX10-NEXT: buffer_store_dword v4, v0, s[48:51], 0 offen offset:12 +; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX10-NEXT: s_endpgm +; +; DAGISEL-GFX11-LABEL: amdgpu_cs_chain_preserve_dont_realign_stack: +; DAGISEL-GFX11: ; %bb.0: +; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_mov_b32 s32, 0 +; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 +; DAGISEL-GFX11-NEXT: v_lshl_add_u32 v4, v8, 4, s32 +; DAGISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc +; DAGISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; DAGISEL-GFX11-NEXT: s_endpgm +; +; DAGISEL-GFX10-LABEL: amdgpu_cs_chain_preserve_dont_realign_stack: +; DAGISEL-GFX10: ; %bb.0: +; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_mov_b32 s32, 0 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v0, 4 +; DAGISEL-GFX10-NEXT: v_lshrrev_b32_e64 v2, 5, s32 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v3, 2 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v4, 1 +; DAGISEL-GFX10-NEXT: v_lshl_add_u32 v1, v8, 4, v2 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, 3 +; DAGISEL-GFX10-NEXT: buffer_store_dword v0, v1, s[48:51], 0 offen offset:12 +; DAGISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; DAGISEL-GFX10-NEXT: buffer_store_dword v2, v1, s[48:51], 0 offen offset:8 +; DAGISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; DAGISEL-GFX10-NEXT: buffer_store_dword v3, v1, s[48:51], 0 offen offset:4 +; DAGISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; DAGISEL-GFX10-NEXT: buffer_store_dword v4, v1, s[48:51], 0 offen +; DAGISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; DAGISEL-GFX10-NEXT: s_endpgm + %alloca.align32 = alloca [8 x <4 x i32>], align 32, addrspace(5) + %gep0 = getelementptr inbounds [8 x <4 x i32>], ptr addrspace(5) %alloca.align32, i32 0, i32 %idx + store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %gep0, align 32 + ret void +} + declare void @llvm.amdgcn.cs.chain.v3i32(ptr, i32, <3 x i32>, <3 
x i32>, i32, ...) declare amdgpu_cs_chain_preserve void @chain_preserve_callee(<3 x i32> inreg, <3 x i32>) declare amdgpu_cs_chain void @chain_callee(<3 x i32> inreg, <3 x i32>) diff --git a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir --- a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir @@ -36,6 +36,7 @@ ; GCN-LABEL: name: preserve_active_lanes_above_args ; GCN: liveins: $sgpr0, $vgpr8, $vgpr9, $vgpr10 ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr32 = S_MOV_B32 0 ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr10, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) ; GCN-NEXT: renamable $vgpr10 = V_MOV_B32_e32 10, implicit $exec ; GCN-NEXT: $vgpr8 = COPY killed renamable $vgpr10 @@ -69,6 +70,7 @@ ; GCN-LABEL: name: preserve_all_lanes_wwm_above_args ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10 ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr32 = S_MOV_B32 0 ; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr10, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr11, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) @@ -141,6 +143,7 @@ ; GCN-LABEL: name: preserve_inactive_lanes_wwm_args ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10 ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr32 = S_MOV_B32 0 ; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr8, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr9, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into 
%stack.1, addrspace 5) diff --git a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir --- a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir @@ -37,6 +37,7 @@ ; GCN-LABEL: name: preserve_inactive_wwm ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9 ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr32 = S_MOV_B32 0 ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr8, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr9, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) @@ -71,6 +72,7 @@ ; GCN-LABEL: name: preserve_inactive_detected_wwm ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9 ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr32 = S_MOV_B32 0 ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr8, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr9, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)