diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -1320,7 +1320,8 @@ auto II = MBB.getFirstNonPHI(), IE = MBB.end(); if (IsEntry) { // Skip the instruction that saves LiveMask - if (II != IE && II->getOpcode() == AMDGPU::COPY) + if (II != IE && II->getOpcode() == AMDGPU::COPY && + II->getOperand(1).getReg() == TRI->getExec()) ++II; } diff --git a/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll b/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll --- a/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll +++ b/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll @@ -8,35 +8,34 @@ ; GCN-NEXT: s_mov_b32 s1, exec_lo ; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo ; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: v_mbcnt_lo_u32_b32 v7, -1, 0 -; GCN-NEXT: lds_param_load v3, attr1.x wait_vdst:15 -; GCN-NEXT: lds_param_load v4, attr1.y wait_vdst:15 -; GCN-NEXT: lds_param_load v5, attr1.z wait_vdst:15 -; GCN-NEXT: lds_param_load v6, attr1.w wait_vdst:15 -; GCN-NEXT: v_mbcnt_hi_u32_b32 v7, -1, v7 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_and_b32 v7, 1, v7 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GCN-NEXT: v_interp_p10_f32 v8, v4, v2, v4 wait_exp:2 -; GCN-NEXT: v_interp_p10_f32 v10, v5, v2, v5 wait_exp:1 -; GCN-NEXT: v_interp_p10_f32 v9, v6, v2, v6 -; GCN-NEXT: v_interp_p10_f32 v2, v3, v2, v3 wait_exp:7 -; GCN-NEXT: v_interp_p2_f32 v4, v4, v1, v8 wait_exp:7 +; GCN-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v3, v0 +; GCN-NEXT: lds_param_load v4, attr1.x wait_vdst:15 +; GCN-NEXT: lds_param_load v5, attr1.y wait_vdst:15 +; GCN-NEXT: lds_param_load v6, attr1.z wait_vdst:15 +; GCN-NEXT: lds_param_load v7, attr1.w wait_vdst:15 +; GCN-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GCN-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8 +; GCN-NEXT: v_interp_p10_f32 v9, v5, v3, v5 wait_exp:2 +; GCN-NEXT: v_interp_p10_f32 v11, v6, v3, v6 wait_exp:1 +; GCN-NEXT: v_interp_p10_f32 v10, v7, v3, v7 +; GCN-NEXT: v_interp_p10_f32 v3, v4, v3, v4 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v5, v5, v2, v9 wait_exp:7 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GCN-NEXT: v_interp_p2_f32 v5, v5, v1, v10 wait_exp:7 -; GCN-NEXT: v_interp_p2_f32 v6, v6, v1, v9 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v6, v6, v2, v11 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v7, v7, v2, v10 wait_exp:7 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GCN-NEXT: v_interp_p2_f32 v2, v3, v1, v2 wait_exp:7 -; GCN-NEXT: v_mov_b32_dpp v4, v4 dpp8:[1,0,3,2,5,4,7,6] -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GCN-NEXT: v_mov_b32_dpp v6, v6 dpp8:[1,0,3,2,5,4,7,6] -; GCN-NEXT: v_dual_cndmask_b32 v3, v4, v5 :: v_dual_cndmask_b32 v4, v5, v4 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GCN-NEXT: v_cndmask_b32_e32 v5, v2, v6, vcc_lo -; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GCN-NEXT: v_interp_p2_f32 v2, v4, v2, v3 wait_exp:7 +; GCN-NEXT: v_mov_b32_dpp v5, v5 dpp8:[1,0,3,2,5,4,7,6] +; GCN-NEXT: v_and_b32_e32 v8, 1, v8 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GCN-NEXT: v_mov_b32_dpp v7, v7 dpp8:[1,0,3,2,5,4,7,6] +; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GCN-NEXT: v_dual_cndmask_b32 v3, v5, v6 :: v_dual_cndmask_b32 v4, v6, v5 +; GCN-NEXT: v_dual_cndmask_b32 v5, v2, v7 :: v_dual_cndmask_b32 v2, v7, v2 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GCN-NEXT: v_mov_b32_dpp v4, v4 dpp8:[1,0,3,2,5,4,7,6] -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GCN-NEXT: v_mov_b32_dpp v5, v5 dpp8:[1,0,3,2,5,4,7,6] ; GCN-NEXT: s_mov_b32 exec_lo, s1 ; GCN-NEXT: exp dual_src_blend0 v3, v2, off, off diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll @@ -523,28 +523,31 @@ ; ; DAGISEL11-LABEL: set_inactive_chain_arg_last_vgpr: ; DAGISEL11: ; %bb.0: +; DAGISEL11-NEXT: s_mov_b32 s32, 0 +; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL11-NEXT: v_mov_b32_e32 v40, v11 +; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0 ; DAGISEL11-NEXT: s_getpc_b64 s[0:1] ; DAGISEL11-NEXT: s_add_u32 s0, s0, gfx_callee@gotpcrel32@lo+4 ; DAGISEL11-NEXT: s_addc_u32 s1, s1, gfx_callee@gotpcrel32@hi+12 -; DAGISEL11-NEXT: v_dual_mov_b32 v42, v11 :: v_dual_mov_b32 v43, v10 +; DAGISEL11-NEXT: v_dual_mov_b32 v43, v10 :: v_dual_mov_b32 v42, v9 ; DAGISEL11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; DAGISEL11-NEXT: v_dual_mov_b32 v41, v9 :: v_dual_mov_b32 v40, v8 -; DAGISEL11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 -; DAGISEL11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0 -; DAGISEL11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0 -; DAGISEL11-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v7, 0 -; DAGISEL11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v9, 0 -; DAGISEL11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v11, 0 -; DAGISEL11-NEXT: s_mov_b32 s32, 0 +; DAGISEL11-NEXT: v_dual_mov_b32 v41, v8 :: v_dual_mov_b32 v0, 0 +; DAGISEL11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; DAGISEL11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v4, 0 +; DAGISEL11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v6, 0 +; DAGISEL11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v8, 0 +; DAGISEL11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 0 +; DAGISEL11-NEXT: v_mov_b32_e32 v11, 0 ; DAGISEL11-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; DAGISEL11-NEXT: v_mov_b32_e32 v12, v43 ; DAGISEL11-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL11-NEXT: v_mov_b32_e32 v12, v42 +; DAGISEL11-NEXT: v_mov_b32_e32 v12, v40 ; DAGISEL11-NEXT: s_not_b32 exec_lo, exec_lo ; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL11-NEXT: v_mov_b32_e32 v0, v12 -; DAGISEL11-NEXT: global_store_b32 v[40:41], v0, off +; DAGISEL11-NEXT: global_store_b32 v[41:42], v0, off ; DAGISEL11-NEXT: s_nop 0 ; DAGISEL11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; DAGISEL11-NEXT: s_endpgm @@ -588,14 +591,17 @@ ; ; DAGISEL10-LABEL: set_inactive_chain_arg_last_vgpr: ; DAGISEL10: ; %bb.0: +; DAGISEL10-NEXT: s_mov_b32 s32, 0 +; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL10-NEXT: v_mov_b32_e32 v40, v11 +; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0 ; DAGISEL10-NEXT: s_getpc_b64 s[0:1] ; DAGISEL10-NEXT: s_add_u32 s0, s0, gfx_callee@gotpcrel32@lo+4 ; DAGISEL10-NEXT: s_addc_u32 s1, s1, gfx_callee@gotpcrel32@hi+12 -; DAGISEL10-NEXT: v_mov_b32_e32 v42, v11 -; DAGISEL10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; DAGISEL10-NEXT: v_mov_b32_e32 v43, v10 -; DAGISEL10-NEXT: v_mov_b32_e32 v41, v9 -; DAGISEL10-NEXT: v_mov_b32_e32 v40, v8 +; DAGISEL10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; DAGISEL10-NEXT: v_mov_b32_e32 v42, v9 +; DAGISEL10-NEXT: v_mov_b32_e32 v41, v8 ; DAGISEL10-NEXT: v_mov_b32_e32 v0, 0 ; DAGISEL10-NEXT: v_mov_b32_e32 v1, 0 ; DAGISEL10-NEXT: v_mov_b32_e32 v2, 0 @@ -610,15 +616,14 @@ ; DAGISEL10-NEXT: v_mov_b32_e32 v11, 0 ; DAGISEL10-NEXT: s_mov_b64 s[0:1], s[48:49] ; DAGISEL10-NEXT: s_mov_b64 s[2:3], s[50:51] -; DAGISEL10-NEXT: s_mov_b32 s32, 0 ; DAGISEL10-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; DAGISEL10-NEXT: v_mov_b32_e32 v12, v43 ; DAGISEL10-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL10-NEXT: v_mov_b32_e32 v12, v42 +; DAGISEL10-NEXT: v_mov_b32_e32 v12, v40 ; DAGISEL10-NEXT: s_not_b32 exec_lo, exec_lo ; DAGISEL10-NEXT: v_mov_b32_e32 v0, v12 -; DAGISEL10-NEXT: global_store_dword v[40:41], v0, off +; DAGISEL10-NEXT: global_store_dword v[41:42], v0, off ; DAGISEL10-NEXT: s_endpgm ; ; GISEL11_W64-LABEL: set_inactive_chain_arg_last_vgpr: @@ -661,14 +666,17 @@ ; ; DAGISEL11_W64-LABEL: set_inactive_chain_arg_last_vgpr: ; DAGISEL11_W64: ; %bb.0: +; DAGISEL11_W64-NEXT: s_mov_b32 s32, 0 +; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL11_W64-NEXT: v_mov_b32_e32 v40, v11 +; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] ; DAGISEL11_W64-NEXT: s_getpc_b64 s[0:1] ; DAGISEL11_W64-NEXT: s_add_u32 s0, s0, gfx_callee@gotpcrel32@lo+4 ; DAGISEL11_W64-NEXT: s_addc_u32 s1, s1, gfx_callee@gotpcrel32@hi+12 -; DAGISEL11_W64-NEXT: v_mov_b32_e32 v42, v11 -; DAGISEL11_W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v43, v10 -; DAGISEL11_W64-NEXT: v_mov_b32_e32 v41, v9 -; DAGISEL11_W64-NEXT: v_mov_b32_e32 v40, v8 +; DAGISEL11_W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; DAGISEL11_W64-NEXT: v_mov_b32_e32 v42, v9 +; DAGISEL11_W64-NEXT: v_mov_b32_e32 v41, v8 ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, 0 ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v1, 0 ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v2, 0 @@ -681,16 +689,15 @@ ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v9, 0 ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v10, 0 ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v11, 0 -; DAGISEL11_W64-NEXT: s_mov_b32 s32, 0 ; DAGISEL11_W64-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL11_W64-NEXT: s_swappc_b64 s[30:31], s[0:1] ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v12, v43 ; DAGISEL11_W64-NEXT: s_not_b64 exec, exec -; DAGISEL11_W64-NEXT: v_mov_b32_e32 v12, v42 +; DAGISEL11_W64-NEXT: v_mov_b32_e32 v12, v40 ; DAGISEL11_W64-NEXT: s_not_b64 exec, exec ; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v12 -; DAGISEL11_W64-NEXT: global_store_b32 v[40:41], v0, off +; DAGISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off ; DAGISEL11_W64-NEXT: s_nop 0 ; DAGISEL11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; DAGISEL11_W64-NEXT: s_endpgm @@ -734,14 +741,17 @@ ; ; DAGISEL10_W64-LABEL: set_inactive_chain_arg_last_vgpr: ; DAGISEL10_W64: ; %bb.0: +; DAGISEL10_W64-NEXT: s_mov_b32 s32, 0 +; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL10_W64-NEXT: v_mov_b32_e32 v40, v11 +; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] ; DAGISEL10_W64-NEXT: s_getpc_b64 s[0:1] ; DAGISEL10_W64-NEXT: s_add_u32 s0, s0, gfx_callee@gotpcrel32@lo+4 ; DAGISEL10_W64-NEXT: s_addc_u32 s1, s1, gfx_callee@gotpcrel32@hi+12 -; DAGISEL10_W64-NEXT: v_mov_b32_e32 v42, v11 -; DAGISEL10_W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; DAGISEL10_W64-NEXT: v_mov_b32_e32 v43, v10 -; DAGISEL10_W64-NEXT: v_mov_b32_e32 v41, v9 -; DAGISEL10_W64-NEXT: v_mov_b32_e32 v40, v8 +; DAGISEL10_W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; DAGISEL10_W64-NEXT: v_mov_b32_e32 v42, v9 +; DAGISEL10_W64-NEXT: v_mov_b32_e32 v41, v8 ; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, 0 ; DAGISEL10_W64-NEXT: v_mov_b32_e32 v1, 0 ; DAGISEL10_W64-NEXT: v_mov_b32_e32 v2, 0 @@ -756,15 +766,14 @@ ; DAGISEL10_W64-NEXT: v_mov_b32_e32 v11, 0 ; DAGISEL10_W64-NEXT: s_mov_b64 s[0:1], s[48:49] ; DAGISEL10_W64-NEXT: s_mov_b64 s[2:3], s[50:51] -; DAGISEL10_W64-NEXT: s_mov_b32 s32, 0 ; DAGISEL10_W64-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL10_W64-NEXT: s_swappc_b64 s[30:31], s[4:5] ; DAGISEL10_W64-NEXT: v_mov_b32_e32 v12, v43 ; DAGISEL10_W64-NEXT: s_not_b64 exec, exec -; DAGISEL10_W64-NEXT: v_mov_b32_e32 v12, v42 +; DAGISEL10_W64-NEXT: v_mov_b32_e32 v12, v40 ; DAGISEL10_W64-NEXT: s_not_b64 exec, exec ; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, v12 -; DAGISEL10_W64-NEXT: global_store_dword v[40:41], v0, off +; DAGISEL10_W64-NEXT: global_store_dword v[41:42], v0, off ; DAGISEL10_W64-NEXT: s_endpgm call amdgpu_gfx void @gfx_callee(<12 x i32> zeroinitializer) %tmp = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 %active, i32 %inactive) #0 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1885,8 +1885,9 @@ ; GFX1032-LABEL: test_wwm1: ; GFX1032: ; %bb.0: ; %main_body ; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: v_add_f32_e32 v2, v2, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_mov_b32_e32 v3, v0 +; GFX1032-NEXT: v_add_f32_e32 v2, v3, v2 ; GFX1032-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v2 ; GFX1032-NEXT: ; return to shader part epilog @@ -1894,8 +1895,9 @@ ; GFX1064-LABEL: test_wwm1: ; GFX1064: ; %bb.0: ; %main_body ; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: v_add_f32_e32 v2, v2, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_mov_b32_e32 v3, v0 +; GFX1064-NEXT: v_add_f32_e32 v2, v3, v2 ; GFX1064-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-NEXT: v_mov_b32_e32 v0, v2 ; GFX1064-NEXT: ; return to shader part epilog @@ -1970,8 +1972,9 @@ ; GFX1032-LABEL: test_strict_wwm1: ; GFX1032: ; %bb.0: ; %main_body ; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: v_add_f32_e32 v2, v2, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_mov_b32_e32 v3, v0 +; GFX1032-NEXT: v_add_f32_e32 v2, v3, v2 ; GFX1032-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v2 ; GFX1032-NEXT: ; return to shader part epilog @@ -1979,8 +1982,9 @@ ; GFX1064-LABEL: test_strict_wwm1: ; GFX1064: ; %bb.0: ; %main_body ; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: v_add_f32_e32 v2, v2, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_mov_b32_e32 v3, v0 +; GFX1064-NEXT: v_add_f32_e32 v2, v3, v2 ; GFX1064-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-NEXT: v_mov_b32_e32 v0, v2 ; GFX1064-NEXT: ; return to shader part epilog