diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -1318,7 +1318,8 @@
   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
   if (IsEntry) {
     // Skip the instruction that saves LiveMask
-    if (II != IE && II->getOpcode() == AMDGPU::COPY)
+    if (II != IE && II->getOpcode() == AMDGPU::COPY &&
+        II->getOperand(1).getReg() == TRI->getExec())
       ++II;
   }

diff --git a/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll b/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll
--- a/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll
+++ b/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll
@@ -8,35 +8,34 @@
 ; GCN-NEXT: s_mov_b32 s1, exec_lo
 ; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
 ; GCN-NEXT: s_mov_b32 m0, s0
-; GCN-NEXT: v_mbcnt_lo_u32_b32 v7, -1, 0
-; GCN-NEXT: lds_param_load v3, attr1.x wait_vdst:15
-; GCN-NEXT: lds_param_load v4, attr1.y wait_vdst:15
-; GCN-NEXT: lds_param_load v5, attr1.z wait_vdst:15
-; GCN-NEXT: lds_param_load v6, attr1.w wait_vdst:15
-; GCN-NEXT: v_mbcnt_hi_u32_b32 v7, -1, v7
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GCN-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_and_b32 v7, 1, v7
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GCN-NEXT: v_interp_p10_f32 v8, v4, v2, v4 wait_exp:2
-; GCN-NEXT: v_interp_p10_f32 v10, v5, v2, v5 wait_exp:1
-; GCN-NEXT: v_interp_p10_f32 v9, v6, v2, v6
-; GCN-NEXT: v_interp_p10_f32 v2, v3, v2, v3 wait_exp:7
-; GCN-NEXT: v_interp_p2_f32 v4, v4, v1, v8 wait_exp:7
+; GCN-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v3, v0
+; GCN-NEXT: lds_param_load v4, attr1.x wait_vdst:15
+; GCN-NEXT: lds_param_load v5, attr1.y wait_vdst:15
+; GCN-NEXT: lds_param_load v6, attr1.z wait_vdst:15
+; GCN-NEXT: lds_param_load v7, attr1.w wait_vdst:15
+; GCN-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GCN-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8
+; GCN-NEXT: v_interp_p10_f32 v9, v5, v3, v5 wait_exp:2
+; GCN-NEXT: v_interp_p10_f32 v11, v6, v3, v6 wait_exp:1
+; GCN-NEXT: v_interp_p10_f32 v10, v7, v3, v7
+; GCN-NEXT: v_interp_p10_f32 v3, v4, v3, v4 wait_exp:7
+; GCN-NEXT: v_interp_p2_f32 v5, v5, v2, v9 wait_exp:7
 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GCN-NEXT: v_interp_p2_f32 v5, v5, v1, v10 wait_exp:7
-; GCN-NEXT: v_interp_p2_f32 v6, v6, v1, v9 wait_exp:7
+; GCN-NEXT: v_interp_p2_f32 v6, v6, v2, v11 wait_exp:7
+; GCN-NEXT: v_interp_p2_f32 v7, v7, v2, v10 wait_exp:7
 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GCN-NEXT: v_interp_p2_f32 v2, v3, v1, v2 wait_exp:7
-; GCN-NEXT: v_mov_b32_dpp v4, v4 dpp8:[1,0,3,2,5,4,7,6]
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GCN-NEXT: v_mov_b32_dpp v6, v6 dpp8:[1,0,3,2,5,4,7,6]
-; GCN-NEXT: v_dual_cndmask_b32 v3, v4, v5 :: v_dual_cndmask_b32 v4, v5, v4
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GCN-NEXT: v_cndmask_b32_e32 v5, v2, v6, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GCN-NEXT: v_interp_p2_f32 v2, v4, v2, v3 wait_exp:7
+; GCN-NEXT: v_mov_b32_dpp v5, v5 dpp8:[1,0,3,2,5,4,7,6]
+; GCN-NEXT: v_and_b32_e32 v8, 1, v8
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GCN-NEXT: v_mov_b32_dpp v7, v7 dpp8:[1,0,3,2,5,4,7,6]
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GCN-NEXT: v_dual_cndmask_b32 v3, v5, v6 :: v_dual_cndmask_b32 v4, v6, v5
+; GCN-NEXT: v_dual_cndmask_b32 v5, v2, v7 :: v_dual_cndmask_b32 v2, v7, v2
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GCN-NEXT: v_mov_b32_dpp v4, v4 dpp8:[1,0,3,2,5,4,7,6]
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3)
 ; GCN-NEXT: v_mov_b32_dpp v5, v5 dpp8:[1,0,3,2,5,4,7,6]
 ; GCN-NEXT: s_mov_b32 exec_lo, s1
 ; GCN-NEXT: exp dual_src_blend0 v3, v2, off, off
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
@@ -505,6 +505,311 @@
   ret void
 }

+; When lowering function arguments, SelectionDAG will put the COPY for the last argument first.
+; This used to trigger a bug in si-wqm where the first COPY in the entry block was always skipped
+; before entering a strict mode, meaning that we'd only copy the active lanes of the last VGPR
+; argument, so we'd end up using arbitrary values for the inactive lanes.
+define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %out, i32 %active, i32 %inactive) {
+; GISEL11-LABEL: set_inactive_chain_arg_last_vgpr:
+; GISEL11: ; %bb.0:
+; GISEL11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL11-NEXT: s_mov_b32 s32, 0
+; GISEL11-NEXT: v_dual_mov_b32 v41, v8 :: v_dual_mov_b32 v42, v9
+; GISEL11-NEXT: v_mov_b32_e32 v43, v10
+; GISEL11-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL11-NEXT: v_mov_b32_e32 v40, v11
+; GISEL11-NEXT: s_mov_b32 exec_lo, s0
+; GISEL11-NEXT: s_getpc_b64 s[0:1]
+; GISEL11-NEXT: s_add_u32 s0, s0, gfx_callee@gotpcrel32@lo+4
+; GISEL11-NEXT: s_addc_u32 s1, s1, gfx_callee@gotpcrel32@hi+12
+; GISEL11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GISEL11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GISEL11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0
+; GISEL11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0
+; GISEL11-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v7, 0
+; GISEL11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v9, 0
+; GISEL11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v11, 0
+; GISEL11-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GISEL11-NEXT: v_mov_b32_e32 v12, v43
+; GISEL11-NEXT: s_not_b32 exec_lo, exec_lo
+; GISEL11-NEXT: v_mov_b32_e32 v12, v40
+; GISEL11-NEXT: s_not_b32 exec_lo, exec_lo
+; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL11-NEXT: v_mov_b32_e32 v0, v12
+; GISEL11-NEXT: global_store_b32 v[41:42], v0, off
+; GISEL11-NEXT: s_nop 0
+; GISEL11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL11-NEXT: s_endpgm
+;
+; DAGISEL11-LABEL: set_inactive_chain_arg_last_vgpr:
+; DAGISEL11: ; %bb.0:
+; DAGISEL11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL11-NEXT: s_mov_b32 s32, 0
+; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1
+; DAGISEL11-NEXT: v_mov_b32_e32 v40, v11
+; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0
+; DAGISEL11-NEXT: s_getpc_b64 s[0:1]
+; DAGISEL11-NEXT: s_add_u32 s0, s0, gfx_callee@gotpcrel32@lo+4
+; DAGISEL11-NEXT: s_addc_u32 s1, s1, gfx_callee@gotpcrel32@hi+12
+; DAGISEL11-NEXT: v_dual_mov_b32 v43, v10 :: v_dual_mov_b32 v42, v9
+; DAGISEL11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; DAGISEL11-NEXT: v_dual_mov_b32 v41, v8 :: v_dual_mov_b32 v0, 0
+; DAGISEL11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; DAGISEL11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v4, 0
+; DAGISEL11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v6, 0
+; DAGISEL11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v8, 0
+; DAGISEL11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 0
+; DAGISEL11-NEXT: v_mov_b32_e32 v11, 0
+; DAGISEL11-NEXT: s_waitcnt lgkmcnt(0)
+; DAGISEL11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL11-NEXT: v_mov_b32_e32 v12, v43
+; DAGISEL11-NEXT: s_not_b32 exec_lo, exec_lo
+; DAGISEL11-NEXT: v_mov_b32_e32 v12, v40
+; DAGISEL11-NEXT: s_not_b32 exec_lo, exec_lo
+; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL11-NEXT: v_mov_b32_e32 v0, v12
+; DAGISEL11-NEXT: global_store_b32 v[41:42], v0, off
+; DAGISEL11-NEXT: s_nop 0
+; DAGISEL11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; DAGISEL11-NEXT: s_endpgm
+;
+; GISEL10-LABEL: set_inactive_chain_arg_last_vgpr:
+; GISEL10: ; %bb.0:
+; GISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL10-NEXT: s_mov_b32 s32, 0
+; GISEL10-NEXT: v_mov_b32_e32 v41, v8
+; GISEL10-NEXT: v_mov_b32_e32 v42, v9
+; GISEL10-NEXT: v_mov_b32_e32 v43, v10
+; GISEL10-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL10-NEXT: v_mov_b32_e32 v40, v11
+; GISEL10-NEXT: s_mov_b32 exec_lo, s0
+; GISEL10-NEXT: s_getpc_b64 s[0:1]
+; GISEL10-NEXT: s_add_u32 s0, s0, gfx_callee@gotpcrel32@lo+4
+; GISEL10-NEXT: s_addc_u32 s1, s1, gfx_callee@gotpcrel32@hi+12
+; GISEL10-NEXT: v_mov_b32_e32 v0, 0
+; GISEL10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GISEL10-NEXT: v_mov_b32_e32 v1, 0
+; GISEL10-NEXT: v_mov_b32_e32 v2, 0
+; GISEL10-NEXT: v_mov_b32_e32 v3, 0
+; GISEL10-NEXT: v_mov_b32_e32 v4, 0
+; GISEL10-NEXT: v_mov_b32_e32 v5, 0
+; GISEL10-NEXT: v_mov_b32_e32 v6, 0
+; GISEL10-NEXT: v_mov_b32_e32 v7, 0
+; GISEL10-NEXT: v_mov_b32_e32 v8, 0
+; GISEL10-NEXT: v_mov_b32_e32 v9, 0
+; GISEL10-NEXT: v_mov_b32_e32 v10, 0
+; GISEL10-NEXT: v_mov_b32_e32 v11, 0
+; GISEL10-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GISEL10-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GISEL10-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL10-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL10-NEXT: v_mov_b32_e32 v12, v43
+; GISEL10-NEXT: s_not_b32 exec_lo, exec_lo
+; GISEL10-NEXT: v_mov_b32_e32 v12, v40
+; GISEL10-NEXT: s_not_b32 exec_lo, exec_lo
+; GISEL10-NEXT: v_mov_b32_e32 v0, v12
+; GISEL10-NEXT: global_store_dword v[41:42], v0, off
+; GISEL10-NEXT: s_endpgm
+;
+; DAGISEL10-LABEL: set_inactive_chain_arg_last_vgpr:
+; DAGISEL10: ; %bb.0:
+; DAGISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL10-NEXT: s_mov_b32 s32, 0
+; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1
+; DAGISEL10-NEXT: v_mov_b32_e32 v40, v11
+; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0
+; DAGISEL10-NEXT: s_getpc_b64 s[0:1]
+; DAGISEL10-NEXT: s_add_u32 s0, s0, gfx_callee@gotpcrel32@lo+4
+; DAGISEL10-NEXT: s_addc_u32 s1, s1, gfx_callee@gotpcrel32@hi+12
+; DAGISEL10-NEXT: v_mov_b32_e32 v43, v10
+; DAGISEL10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; DAGISEL10-NEXT: v_mov_b32_e32 v42, v9
+; DAGISEL10-NEXT: v_mov_b32_e32 v41, v8
+; DAGISEL10-NEXT: v_mov_b32_e32 v0, 0
+; DAGISEL10-NEXT: v_mov_b32_e32 v1, 0
+; DAGISEL10-NEXT: v_mov_b32_e32 v2, 0
+; DAGISEL10-NEXT: v_mov_b32_e32 v3, 0
+; DAGISEL10-NEXT: v_mov_b32_e32 v4, 0
+; DAGISEL10-NEXT: v_mov_b32_e32 v5, 0
+; DAGISEL10-NEXT: v_mov_b32_e32 v6, 0
+; DAGISEL10-NEXT: v_mov_b32_e32 v7, 0
+; DAGISEL10-NEXT: v_mov_b32_e32 v8, 0
+; DAGISEL10-NEXT: v_mov_b32_e32 v9, 0
+; DAGISEL10-NEXT: v_mov_b32_e32 v10, 0
+; DAGISEL10-NEXT: v_mov_b32_e32 v11, 0
+; DAGISEL10-NEXT: s_mov_b64 s[0:1], s[48:49]
+; DAGISEL10-NEXT: s_mov_b64 s[2:3], s[50:51]
+; DAGISEL10-NEXT: s_waitcnt lgkmcnt(0)
+; DAGISEL10-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; DAGISEL10-NEXT: v_mov_b32_e32 v12, v43
+; DAGISEL10-NEXT: s_not_b32 exec_lo, exec_lo
+; DAGISEL10-NEXT: v_mov_b32_e32 v12, v40
+; DAGISEL10-NEXT: s_not_b32 exec_lo, exec_lo
+; DAGISEL10-NEXT: v_mov_b32_e32 v0, v12
+; DAGISEL10-NEXT: global_store_dword v[41:42], v0, off
+; DAGISEL10-NEXT: s_endpgm
+;
+; GISEL11_W64-LABEL: set_inactive_chain_arg_last_vgpr:
+; GISEL11_W64: ; %bb.0:
+; GISEL11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL11_W64-NEXT: s_mov_b32 s32, 0
+; GISEL11_W64-NEXT: v_mov_b32_e32 v41, v8
+; GISEL11_W64-NEXT: v_mov_b32_e32 v42, v9
+; GISEL11_W64-NEXT: v_mov_b32_e32 v43, v10
+; GISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GISEL11_W64-NEXT: v_mov_b32_e32 v40, v11
+; GISEL11_W64-NEXT: s_mov_b64 exec, s[0:1]
+; GISEL11_W64-NEXT: s_getpc_b64 s[0:1]
+; GISEL11_W64-NEXT: s_add_u32 s0, s0, gfx_callee@gotpcrel32@lo+4
+; GISEL11_W64-NEXT: s_addc_u32 s1, s1, gfx_callee@gotpcrel32@hi+12
+; GISEL11_W64-NEXT: v_mov_b32_e32 v0, 0
+; GISEL11_W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GISEL11_W64-NEXT: v_mov_b32_e32 v1, 0
+; GISEL11_W64-NEXT: v_mov_b32_e32 v2, 0
+; GISEL11_W64-NEXT: v_mov_b32_e32 v3, 0
+; GISEL11_W64-NEXT: v_mov_b32_e32 v4, 0
+; GISEL11_W64-NEXT: v_mov_b32_e32 v5, 0
+; GISEL11_W64-NEXT: v_mov_b32_e32 v6, 0
+; GISEL11_W64-NEXT: v_mov_b32_e32 v7, 0
+; GISEL11_W64-NEXT: v_mov_b32_e32 v8, 0
+; GISEL11_W64-NEXT: v_mov_b32_e32 v9, 0
+; GISEL11_W64-NEXT: v_mov_b32_e32 v10, 0
+; GISEL11_W64-NEXT: v_mov_b32_e32 v11, 0
+; GISEL11_W64-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL11_W64-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GISEL11_W64-NEXT: v_mov_b32_e32 v12, v43
+; GISEL11_W64-NEXT: s_not_b64 exec, exec
+; GISEL11_W64-NEXT: v_mov_b32_e32 v12, v40
+; GISEL11_W64-NEXT: s_not_b64 exec, exec
+; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL11_W64-NEXT: v_mov_b32_e32 v0, v12
+; GISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off
+; GISEL11_W64-NEXT: s_nop 0
+; GISEL11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL11_W64-NEXT: s_endpgm
+;
+; DAGISEL11_W64-LABEL: set_inactive_chain_arg_last_vgpr:
+; DAGISEL11_W64: ; %bb.0:
+; DAGISEL11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL11_W64-NEXT: s_mov_b32 s32, 0
+; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v40, v11
+; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1]
+; DAGISEL11_W64-NEXT: s_getpc_b64 s[0:1]
+; DAGISEL11_W64-NEXT: s_add_u32 s0, s0, gfx_callee@gotpcrel32@lo+4
+; DAGISEL11_W64-NEXT: s_addc_u32 s1, s1, gfx_callee@gotpcrel32@hi+12
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v43, v10
+; DAGISEL11_W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v42, v9
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v41, v8
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, 0
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v1, 0
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v2, 0
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v3, 0
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v4, 0
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v5, 0
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v6, 0
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v7, 0
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v8, 0
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v9, 0
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v10, 0
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v11, 0
+; DAGISEL11_W64-NEXT: s_waitcnt lgkmcnt(0)
+; DAGISEL11_W64-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v12, v43
+; DAGISEL11_W64-NEXT: s_not_b64 exec, exec
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v12, v40
+; DAGISEL11_W64-NEXT: s_not_b64 exec, exec
+; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v12
+; DAGISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off
+; DAGISEL11_W64-NEXT: s_nop 0
+; DAGISEL11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; DAGISEL11_W64-NEXT: s_endpgm
+;
+; GISEL10_W64-LABEL: set_inactive_chain_arg_last_vgpr:
+; GISEL10_W64: ; %bb.0:
+; GISEL10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL10_W64-NEXT: s_mov_b32 s32, 0
+; GISEL10_W64-NEXT: v_mov_b32_e32 v41, v8
+; GISEL10_W64-NEXT: v_mov_b32_e32 v42, v9
+; GISEL10_W64-NEXT: v_mov_b32_e32 v43, v10
+; GISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GISEL10_W64-NEXT: v_mov_b32_e32 v40, v11
+; GISEL10_W64-NEXT: s_mov_b64 exec, s[0:1]
+; GISEL10_W64-NEXT: s_getpc_b64 s[0:1]
+; GISEL10_W64-NEXT: s_add_u32 s0, s0, gfx_callee@gotpcrel32@lo+4
+; GISEL10_W64-NEXT: s_addc_u32 s1, s1, gfx_callee@gotpcrel32@hi+12
+; GISEL10_W64-NEXT: v_mov_b32_e32 v0, 0
+; GISEL10_W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GISEL10_W64-NEXT: v_mov_b32_e32 v1, 0
+; GISEL10_W64-NEXT: v_mov_b32_e32 v2, 0
+; GISEL10_W64-NEXT: v_mov_b32_e32 v3, 0
+; GISEL10_W64-NEXT: v_mov_b32_e32 v4, 0
+; GISEL10_W64-NEXT: v_mov_b32_e32 v5, 0
+; GISEL10_W64-NEXT: v_mov_b32_e32 v6, 0
+; GISEL10_W64-NEXT: v_mov_b32_e32 v7, 0
+; GISEL10_W64-NEXT: v_mov_b32_e32 v8, 0
+; GISEL10_W64-NEXT: v_mov_b32_e32 v9, 0
+; GISEL10_W64-NEXT: v_mov_b32_e32 v10, 0
+; GISEL10_W64-NEXT: v_mov_b32_e32 v11, 0
+; GISEL10_W64-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GISEL10_W64-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GISEL10_W64-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL10_W64-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL10_W64-NEXT: v_mov_b32_e32 v12, v43
+; GISEL10_W64-NEXT: s_not_b64 exec, exec
+; GISEL10_W64-NEXT: v_mov_b32_e32 v12, v40
+; GISEL10_W64-NEXT: s_not_b64 exec, exec
+; GISEL10_W64-NEXT: v_mov_b32_e32 v0, v12
+; GISEL10_W64-NEXT: global_store_dword v[41:42], v0, off
+; GISEL10_W64-NEXT: s_endpgm
+;
+; DAGISEL10_W64-LABEL: set_inactive_chain_arg_last_vgpr:
+; DAGISEL10_W64: ; %bb.0:
+; DAGISEL10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL10_W64-NEXT: s_mov_b32 s32, 0
+; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v40, v11
+; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1]
+; DAGISEL10_W64-NEXT: s_getpc_b64 s[0:1]
+; DAGISEL10_W64-NEXT: s_add_u32 s0, s0, gfx_callee@gotpcrel32@lo+4
+; DAGISEL10_W64-NEXT: s_addc_u32 s1, s1, gfx_callee@gotpcrel32@hi+12
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v43, v10
+; DAGISEL10_W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v42, v9
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v41, v8
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, 0
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v1, 0
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v2, 0
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v3, 0
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v4, 0
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v5, 0
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v6, 0
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v7, 0
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v8, 0
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v9, 0
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v10, 0
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v11, 0
+; DAGISEL10_W64-NEXT: s_mov_b64 s[0:1], s[48:49]
+; DAGISEL10_W64-NEXT: s_mov_b64 s[2:3], s[50:51]
+; DAGISEL10_W64-NEXT: s_waitcnt lgkmcnt(0)
+; DAGISEL10_W64-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v12, v43
+; DAGISEL10_W64-NEXT: s_not_b64 exec, exec
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v12, v40
+; DAGISEL10_W64-NEXT: s_not_b64 exec, exec
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, v12
+; DAGISEL10_W64-NEXT: global_store_dword v[41:42], v0, off
+; DAGISEL10_W64-NEXT: s_endpgm
+  call amdgpu_gfx void @gfx_callee(<12 x i32> zeroinitializer)
+  %tmp = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 %active, i32 %inactive) #0
+  %wwm = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp)
+  store i32 %wwm, ptr addrspace(1) %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32, i32) #0
 declare i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64, i64) #0
 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -1885,8 +1885,9 @@
 ; GFX1032-LABEL: test_wwm1:
 ; GFX1032: ; %bb.0: ; %main_body
 ; GFX1032-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: v_add_f32_e32 v2, v2, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v1
+; GFX1032-NEXT: v_mov_b32_e32 v3, v0
+; GFX1032-NEXT: v_add_f32_e32 v2, v3, v2
 ; GFX1032-NEXT: s_mov_b32 exec_lo, s0
 ; GFX1032-NEXT: v_mov_b32_e32 v0, v2
 ; GFX1032-NEXT: ; return to shader part epilog
@@ -1894,8 +1895,9 @@
 ; GFX1064-LABEL: test_wwm1:
 ; GFX1064: ; %bb.0: ; %main_body
 ; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: v_add_f32_e32 v2, v2, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v1
+; GFX1064-NEXT: v_mov_b32_e32 v3, v0
+; GFX1064-NEXT: v_add_f32_e32 v2, v3, v2
 ; GFX1064-NEXT: s_mov_b64 exec, s[0:1]
 ; GFX1064-NEXT: v_mov_b32_e32 v0, v2
 ; GFX1064-NEXT: ; return to shader part epilog
@@ -1970,8 +1972,9 @@
 ; GFX1032-LABEL: test_strict_wwm1:
 ; GFX1032: ; %bb.0: ; %main_body
 ; GFX1032-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: v_add_f32_e32 v2, v2, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v1
+; GFX1032-NEXT: v_mov_b32_e32 v3, v0
+; GFX1032-NEXT: v_add_f32_e32 v2, v3, v2
 ; GFX1032-NEXT: s_mov_b32 exec_lo, s0
 ; GFX1032-NEXT: v_mov_b32_e32 v0, v2
 ; GFX1032-NEXT: ; return to shader part epilog
@@ -1979,8 +1982,9 @@
 ; GFX1064-LABEL: test_strict_wwm1:
 ; GFX1064: ; %bb.0: ; %main_body
 ; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: v_add_f32_e32 v2, v2, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v1
+; GFX1064-NEXT: v_mov_b32_e32 v3, v0
+; GFX1064-NEXT: v_add_f32_e32 v2, v3, v2
 ; GFX1064-NEXT: s_mov_b64 exec, s[0:1]
 ; GFX1064-NEXT: v_mov_b32_e32 v0, v2
 ; GFX1064-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -336,10 +336,10 @@
 ; GFX9-O0-NEXT: s_add_u32 s24, s24, s11
 ; GFX9-O0-NEXT: s_addc_u32 s25, s25, 0
 ; GFX9-O0-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1
+; GFX9-O0-NEXT: v_writelane_b32 v7, s12, 0
+; GFX9-O0-NEXT: v_writelane_b32 v7, s13, 1
 ; GFX9-O0-NEXT: s_mov_b32 s14, s10
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
-; GFX9-O0-NEXT: v_writelane_b32 v7, s10, 0
-; GFX9-O0-NEXT: v_writelane_b32 v7, s11, 1
 ; GFX9-O0-NEXT: s_mov_b32 s13, s9
 ; GFX9-O0-NEXT: s_mov_b32 s12, s8
 ; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -570,10 +570,10 @@
 ; GFX9-O0-NEXT: s_add_u32 s24, s24, s11
 ; GFX9-O0-NEXT: s_addc_u32 s25, s25, 0
 ; GFX9-O0-NEXT: ; implicit-def: $vgpr12 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1
+; GFX9-O0-NEXT: v_writelane_b32 v12, s12, 0
+; GFX9-O0-NEXT: v_writelane_b32 v12, s13, 1
 ; GFX9-O0-NEXT: s_mov_b32 s14, s10
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
-; GFX9-O0-NEXT: v_writelane_b32 v12, s10, 0
-; GFX9-O0-NEXT: v_writelane_b32 v12, s11, 1
 ; GFX9-O0-NEXT: s_mov_b32 s13, s9
 ; GFX9-O0-NEXT: s_mov_b32 s12, s8
 ; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -1204,10 +1204,10 @@
 ; GFX9-O0-NEXT: s_add_u32 s24, s24, s11
 ; GFX9-O0-NEXT: s_addc_u32 s25, s25, 0
 ; GFX9-O0-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1
+; GFX9-O0-NEXT: v_writelane_b32 v7, s12, 0
+; GFX9-O0-NEXT: v_writelane_b32 v7, s13, 1
 ; GFX9-O0-NEXT: s_mov_b32 s14, s10
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
-; GFX9-O0-NEXT: v_writelane_b32 v7, s10, 0
-; GFX9-O0-NEXT: v_writelane_b32 v7, s11, 1
 ; GFX9-O0-NEXT: s_mov_b32 s13, s9
 ; GFX9-O0-NEXT: s_mov_b32 s12, s8
 ; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -1438,10 +1438,10 @@
 ; GFX9-O0-NEXT: s_add_u32 s24, s24, s11
 ; GFX9-O0-NEXT: s_addc_u32 s25, s25, 0
 ; GFX9-O0-NEXT: ; implicit-def: $vgpr12 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1
+; GFX9-O0-NEXT: v_writelane_b32 v12, s12, 0
+; GFX9-O0-NEXT: v_writelane_b32 v12, s13, 1
 ; GFX9-O0-NEXT: s_mov_b32 s14, s10
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
-; GFX9-O0-NEXT: v_writelane_b32 v12, s10, 0
-; GFX9-O0-NEXT: v_writelane_b32 v12, s11, 1
 ; GFX9-O0-NEXT: s_mov_b32 s13, s9
 ; GFX9-O0-NEXT: s_mov_b32 s12, s8
 ; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[6:7]