diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -104,7 +104,13 @@ // 64-bit vector move instruction. This is mainly used by the // SIFoldOperands pass to enable folding of inline immediates. def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst), - (ins VSrc_b64:$src0)>; + (ins VSrc_b64:$src0)> { + let isReMaterializable = 1; + let isAsCheapAsAMove = 1; + let isMoveImm = 1; + let SchedRW = [Write64Bit]; + let Size = 16; // Needs at most 2 v_mov_b32 instructions, 8 bytes each. +} // 64-bit vector move with dpp. Expanded post-RA. def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64> { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -18,8 +18,8 @@ ; GFX9-NEXT: s_andn2_b32 s0, s0, s1 ; GFX9-NEXT: s_or_b32 s0, s0, s3 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -36,8 +36,8 @@ ; GFX8-NEXT: s_andn2_b32 s0, s0, s1 ; GFX8-NEXT: s_or_b32 s0, s0, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -54,8 +54,8 @@ ; GFX7-NEXT: s_andn2_b32 s0, s0, s1 ; GFX7-NEXT: s_or_b32 s0, s0, s3 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -85,19 +85,19 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(<2 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_v_v2i16_s_s: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_and_b32 s1, s3, 1 -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: s_and_b32 s2, s2, s0 -; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_lshl_b32 s0, s0, s1 -; GFX9-NEXT: s_lshl_b32 s2, s2, s1 -; GFX9-NEXT: s_not_b32 s0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_or_b32 v2, v0, s0, v1 +; GFX9-NEXT: global_load_dword v2, v[0:1], off +; GFX9-NEXT: s_and_b32 s0, s3, 1 +; GFX9-NEXT: s_mov_b32 s1, 0xffff +; GFX9-NEXT: s_lshl_b32 s0, s0, 4 +; GFX9-NEXT: s_and_b32 s2, s2, s1 +; GFX9-NEXT: s_lshl_b32 s2, s2, s0 +; GFX9-NEXT: s_lshl_b32 s0, s1, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_or_b32 v2, v2, s0, v3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -112,10 +112,10 @@ ; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX8-NEXT: v_or_b32_e32 v2, s2, v0 +; GFX8-NEXT: v_and_b32_e32 v2, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -130,16 +130,16 @@ ; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: s_lshl_b32 s2, s2, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: 
v_and_b32_e32 v0, s0, v0 -; GFX7-NEXT: v_or_b32_e32 v2, s2, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s0, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v2i16_s_s: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: global_load_dword v2, v[0:1], off ; GFX10-NEXT: s_and_b32 s0, s3, 1 ; GFX10-NEXT: s_mov_b32 s1, 0xffff ; GFX10-NEXT: s_lshl_b32 s0, s0, 4 @@ -147,10 +147,10 @@ ; GFX10-NEXT: s_lshl_b32 s1, s1, s0 ; GFX10-NEXT: s_lshl_b32 s0, s2, s0 ; GFX10-NEXT: s_not_b32 s1, s1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_or_b32 v2, v0, s1, s0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_or_b32 v2, v2, s1, s0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr @@ -166,14 +166,14 @@ ; GFX9-NEXT: s_and_b32 s1, s4, 1 ; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX9-NEXT: v_and_b32_e32 v2, s2, v0 ; GFX9-NEXT: s_lshl_b32 s2, s2, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_andn2_b32 s0, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_lshl_or_b32 v2, v0, s1, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, s1, v3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -185,12 +185,12 @@ ; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_lshl_b32 s1, s2, s1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_andn2_b32 s0, s0, s1 -; GFX8-NEXT: v_or_b32_e32 v2, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_or_b32_e32 v2, s0, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -201,13 +201,13 @@ ; GFX7-NEXT: s_mov_b32 s2, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 4 ; GFX7-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v0 ; GFX7-NEXT: s_lshl_b32 s1, s2, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_andn2_b32 s0, s0, s1 -; GFX7-NEXT: v_or_b32_e32 v2, s0, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: v_or_b32_e32 v2, s0, v2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -217,13 +217,13 @@ ; GFX10-NEXT: s_and_b32 s1, s4, 1 ; GFX10-NEXT: s_mov_b32 s2, 0xffff ; GFX10-NEXT: s_lshl_b32 s1, s1, 4 -; GFX10-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-NEXT: v_and_b32_e32 v2, s2, v0 ; GFX10-NEXT: s_lshl_b32 s2, s2, s1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_andn2_b32 s0, s0, s2 -; GFX10-NEXT: v_lshl_or_b32 v2, v0, s1, s0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_andn2_b32 s0, s0, s2 +; GFX10-NEXT: v_lshl_or_b32 v2, v2, s1, s0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr @@ -240,13 +240,13 @@ ; GFX9-NEXT: s_mov_b32 s1, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-NEXT: s_and_b32 s2, s4, 
s1 -; GFX9-NEXT: v_lshlrev_b32_e64 v1, v0, s2 +; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s2 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s1 -; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_and_or_b32 v2, s0, v0, v1 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_and_or_b32 v2, s0, v3, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -257,14 +257,14 @@ ; GFX8-NEXT: s_mov_b32 s1, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: s_and_b32 s2, s4, s1 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v0, s2 +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s2 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s1 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX8-NEXT: v_or_b32_e32 v2, v0, v1 +; GFX8-NEXT: v_and_b32_e32 v3, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -275,14 +275,14 @@ ; GFX7-NEXT: s_mov_b32 s1, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX7-NEXT: s_and_b32 s2, s4, s1 -; GFX7-NEXT: v_lshl_b32_e32 v1, s2, v0 +; GFX7-NEXT: v_lshl_b32_e32 v2, s2, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, s1, v0 ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX7-NEXT: v_or_b32_e32 v2, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s0, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -294,12 +294,12 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, s1 ; GFX10-NEXT: s_and_b32 s1, s4, s1 -; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s1 -; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_and_or_b32 v2, s0, v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s1 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_and_or_b32 v2, s0, v3, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr @@ -315,13 +315,13 @@ ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX9-NEXT: s_mov_b32 s1, 0xffff -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s1 -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_and_or_b32 v2, s0, v1, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: v_lshlrev_b32_e64 v0, v1, s1 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_and_or_b32 v2, s0, v3, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -331,14 +331,14 @@ ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX8-NEXT: s_mov_b32 s1, 0xffff -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s1 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: 
v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_e64 v0, v1, s1 +; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v1, v0 +; GFX8-NEXT: v_and_b32_e32 v3, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -349,14 +349,14 @@ ; GFX7-NEXT: s_mov_b32 s1, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s1, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v1 -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, s1, v1 +; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX7-NEXT: v_or_b32_e32 v2, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s0, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -367,12 +367,12 @@ ; GFX10-NEXT: s_mov_b32 s1, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v1, -1, v2 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_and_or_b32 v2, s0, v1, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_and_or_b32 v2, s0, v2, v3 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr @@ -384,18 +384,18 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(<2 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) { ; GFX9-LABEL: insertelement_v_v2i16_s_v: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-NEXT: s_and_b32 s1, s2, s0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v1, s1 -; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_or_b32 v2, v0, v1, v2 +; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s1 +; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s0 +; GFX9-NEXT: v_xor_b32_e32 v4, -1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_or_b32 v2, v3, v4, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -410,10 +410,10 @@ ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX8-NEXT: v_and_b32_e32 v3, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -428,27 +428,27 @@ ; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 ; GFX7-NEXT: v_xor_b32_e32 v1, 
-1, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v3, v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v2i16_s_v: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX10-NEXT: s_mov_b32 s0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, s0 ; GFX10-NEXT: s_and_b32 s0, s2, s0 -; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, s0 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_or_b32 v2, v0, v2, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s0 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_or_b32 v2, v3, v4, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr @@ -460,17 +460,17 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(<2 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_v_v2i16_v_s: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_and_b32 s1, s2, 1 -; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: s_lshl_b32 s0, s0, s1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: s_and_b32 s0, s2, 1 +; GFX9-NEXT: s_lshl_b32 s0, s0, 4 +; GFX9-NEXT: s_mov_b32 s1, 0xffff +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: s_lshl_b32 s0, s1, s0 ; GFX9-NEXT: s_not_b32 s0, s0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_or_b32 v2, v0, s0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_or_b32 v2, v3, s0, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -483,12 +483,12 @@ ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_not_b32 s0, s0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX8-NEXT: v_or_b32_e32 v2, v0, v1 +; GFX8-NEXT: v_and_b32_e32 v3, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -501,28 +501,28 @@ ; GFX7-NEXT: v_and_b32_e32 v1, s0, v2 ; GFX7-NEXT: s_lshl_b32 s0, s0, s1 ; GFX7-NEXT: s_not_b32 s0, s0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, s1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX7-NEXT: v_or_b32_e32 v2, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s0, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; 
GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v2i16_v_s: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_and_b32 s0, s2, 1 ; GFX10-NEXT: s_mov_b32 s1, 0xffff ; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: s_lshl_b32 s0, s1, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_not_b32 s0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_or_b32 v2, v0, s0, v1 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_and_or_b32 v2, v3, s0, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr @@ -534,17 +534,17 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(<2 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) { ; GFX9-LABEL: insertelement_v_v2i16_v_v: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: v_and_b32_e32 v0, 1, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_or_b32 v2, v0, v1, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s0 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_or_b32 v2, v4, v3, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -558,10 +558,10 @@ ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX8-NEXT: v_and_b32_e32 v3, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -576,26 +576,26 @@ ; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v3, v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v2i16_v_v: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_and_b32_e32 v0, 1, v3 ; GFX10-NEXT: s_mov_b32 s0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v3 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_or_b32 v2, 
v0, v2, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, s0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_or_b32 v2, v4, v3, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr @@ -674,15 +674,15 @@ ; GFX9-NEXT: s_lshl_b32 s2, s2, s3 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 ; GFX9-NEXT: s_not_b32 s0, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v3, s0, v2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v4, v5, s0, v4 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -698,15 +698,15 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 ; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, s3 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v2, s0, v2 -; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX8-NEXT: v_or_b32_e32 v4, s2, v4 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -722,15 +722,15 @@ ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 ; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: s_lshl_b32 s2, s2, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 -; GFX7-NEXT: v_or_b32_e32 v2, s2, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, 0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX7-NEXT: v_or_b32_e32 v4, s2, v4 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; @@ -748,12 +748,12 @@ ; GFX10-NEXT: s_not_b32 s0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v2, v2, s0, s2 +; GFX10-NEXT: v_and_or_b32 v4, v2, s0, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s1, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: 
v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr @@ -777,15 +777,15 @@ ; GFX9-NEXT: s_lshl_b32 s5, s5, s4 ; GFX9-NEXT: s_andn2_b32 s3, s3, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_lshl_or_b32 v2, v0, s4, v1 +; GFX9-NEXT: v_lshl_or_b32 v4, v0, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -804,14 +804,14 @@ ; GFX8-NEXT: s_lshl_b32 s4, s5, s4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_andn2_b32 s3, s3, s4 -; GFX8-NEXT: v_or_b32_e32 v2, s3, v0 +; GFX8-NEXT: v_or_b32_e32 v4, s3, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -829,15 +829,15 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX7-NEXT: s_lshl_b32 s4, s5, s4 ; GFX7-NEXT: s_andn2_b32 s3, s3, s4 -; GFX7-NEXT: v_or_b32_e32 v2, s3, v0 +; GFX7-NEXT: v_or_b32_e32 v4, s3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, 0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; @@ -857,12 +857,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_lshl_b32 s5, s5, s4 ; GFX10-NEXT: s_andn2_b32 s3, s3, s5 -; GFX10-NEXT: v_lshl_or_b32 v2, v2, s4, s3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_lshl_or_b32 v4, v2, s4, s3 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr @@ -888,14 +888,14 @@ ; GFX9-NEXT: v_lshlrev_b32_e64 v3, v0, s3 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s2 ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX9-NEXT: v_and_or_b32 v3, v1, v0, v3 +; GFX9-NEXT: v_and_or_b32 v4, v1, v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 
v0, v3, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -916,14 +916,14 @@ ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s2 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_or_b32_e32 v3, v0, v3 +; GFX8-NEXT: v_or_b32_e32 v4, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -944,14 +944,14 @@ ; GFX7-NEXT: v_lshl_b32_e32 v0, s2, v0 ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_or_b32_e32 v3, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GFX7-NEXT: v_mov_b32_e32 v3, 0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; @@ -959,25 +959,25 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v0 ; GFX10-NEXT: s_mov_b32 s2, 0xffff ; GFX10-NEXT: s_and_b32 s3, s4, s2 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s2 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, s3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s3 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s1 ; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 -; GFX10-NEXT: v_and_or_b32 v2, v5, v2, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 +; GFX10-NEXT: v_and_or_b32 v5, v5, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr @@ -1002,14 +1002,14 @@ ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX9-NEXT: v_and_or_b32 v3, v3, v1, v0 +; GFX9-NEXT: v_and_or_b32 v4, v3, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: 
v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -1029,14 +1029,14 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_or_b32_e32 v3, v1, v0 +; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -1057,14 +1057,14 @@ ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 -; GFX7-NEXT: v_or_b32_e32 v3, v1, v0 +; GFX7-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GFX7-NEXT: v_mov_b32_e32 v3, 0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; @@ -1085,11 +1085,11 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 -; GFX10-NEXT: v_and_or_b32 v2, v5, v3, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_and_or_b32 v5, v5, v3, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr @@ -1102,94 +1102,94 @@ ; GFX9-LABEL: insertelement_v_v4i16_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_and_b32 s1, s2, s0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 -; GFX9-NEXT: v_lshlrev_b32_e64 v4, v2, s1 +; GFX9-NEXT: v_lshlrev_b32_e64 v6, v2, s1 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v5, v2, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, v7, v2, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v4i16_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 1, v2 ; 
GFX8-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_and_b32 s1, s2, s0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2 -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v2, s1 +; GFX8-NEXT: v_lshlrev_b32_e64 v6, v2, s1 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v2, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v2, v7, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v4i16_s_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 1, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 1, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: s_and_b32 s1, s2, s0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 4, v2 -; GFX7-NEXT: v_lshl_b32_e32 v4, s1, v2 +; GFX7-NEXT: v_lshl_b32_e32 v6, s1, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, 0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 +; GFX7-NEXT: v_mov_b32_e32 v4, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, 0 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: flat_store_dwordx2 v[3:4], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v4i16_s_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v3, 1, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 1, v2 ; GFX10-NEXT: s_mov_b32 s0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 4, v3 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX10-NEXT: s_and_b32 s0, s2, s0 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v3, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 -; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v3, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v5 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v3, v5, v4, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo +; GFX10-NEXT: v_and_or_b32 v4, v4, v3, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; 
GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr @@ -1210,15 +1210,15 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v3, s0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, v5, s0, v2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v4i16_v_s: @@ -1229,20 +1229,20 @@ ; GFX8-NEXT: s_lshl_b32 s2, s2, 4 ; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s2 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_not_b32 s0, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v3, s0, v3 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v4i16_v_s: @@ -1257,16 +1257,16 @@ ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 ; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, s2, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, 0 +; GFX7-NEXT: v_mov_b32_e32 v4, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, 0 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: flat_store_dwordx2 v[3:4], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v4i16_v_s: @@ -1282,12 +1282,12 @@ ; GFX10-NEXT: s_not_b32 s0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v2, v3, s0, v2 +; GFX10-NEXT: v_and_or_b32 v4, v3, s0, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s2, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 ; GFX10-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr @@ -1300,91 +1300,91 @@ ; GFX9-LABEL: insertelement_v_v4i16_v_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 4, v3 ; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: v_lshlrev_b32_e64 v3, v3, s0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v5, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, v7, v3, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v4i16_v_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 1, v3 ; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3 ; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v3, s0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v3, v7, v3 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v4i16_v_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 1, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_lshl_b32_e32 v3, s0, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX7-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, 0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 +; GFX7-NEXT: v_mov_b32_e32 v5, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; 
GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v3, v7, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, 0 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v4i16_v_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v4, 1, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v3 ; GFX10-NEXT: s_mov_b32 s0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 4, v4 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, s0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 -; GFX10-NEXT: v_xor_b32_e32 v4, -1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v6 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v2, v5, v4, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo +; GFX10-NEXT: v_and_or_b32 v4, v4, v3, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr @@ -1423,10 +1423,10 @@ ; GFX9-NEXT: s_cmp_eq_u32 s6, 3 ; GFX9-NEXT: s_cselect_b32 s3, s4, s3 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; @@ -1459,10 +1459,10 @@ ; GFX8-NEXT: s_cmp_eq_u32 s6, 3 ; GFX8-NEXT: s_cselect_b32 s3, s4, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -1558,20 +1558,20 @@ ; GFX9-NEXT: s_lshl_b32 s2, s2, s1 ; GFX9-NEXT: s_not_b32 s5, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v6, s2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[2:3] -; GFX9-NEXT: v_and_or_b32 v4, v5, s5, v4 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[2:3] +; 
GFX9-NEXT: v_and_or_b32 v6, v7, s5, v6 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; @@ -1589,19 +1589,19 @@ ; GFX8-NEXT: s_not_b32 s6, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v4, s6, v4 -; GFX8-NEXT: v_or_b32_e32 v4, s5, v4 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[2:3] ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[2:3] +; GFX8-NEXT: v_and_b32_e32 v6, s6, v6 +; GFX8-NEXT: v_or_b32_e32 v6, s5, v6 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -1655,14 +1655,14 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v3, s1 -; GFX10-NEXT: v_and_or_b32 v4, v4, s3, s2 +; GFX10-NEXT: v_and_or_b32 v6, v4, s3, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s4, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s1 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s1 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr @@ -1690,21 +1690,21 @@ ; GFX9-NEXT: s_lshl_b32 s7, s7, s4 ; GFX9-NEXT: s_andn2_b32 s6, s6, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_lshl_or_b32 v4, v0, s4, v1 +; GFX9-NEXT: v_lshl_or_b32 v6, v0, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX9-NEXT: 
global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT: s_endpgm
 ;
@@ -1727,20 +1727,20 @@
 ; GFX8-NEXT: s_lshl_b32 s4, s7, s4
 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT: s_andn2_b32 s4, s6, s4
-; GFX8-NEXT: v_or_b32_e32 v4, s4, v0
+; GFX8-NEXT: v_or_b32_e32 v6, s4, v0
 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX8-NEXT: v_mov_b32_e32 v4, 0
 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX8-NEXT: v_mov_b32_e32 v4, 0
 ; GFX8-NEXT: v_mov_b32_e32 v5, 0
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT: s_endpgm
 ;
@@ -1803,16 +1803,16 @@
 ; GFX10-NEXT: v_mov_b32_e32 v3, s3
 ; GFX10-NEXT: s_lshl_b32 s7, s7, s4
 ; GFX10-NEXT: s_andn2_b32 s6, s6, s7
-; GFX10-NEXT: v_lshl_or_b32 v4, v4, s4, s6
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT: v_lshl_or_b32 v6, v4, s4, s6
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: v_mov_b32_e32 v5, 0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 2
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 3
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX10-NEXT: v_mov_b32_e32 v4, 0
-; GFX10-NEXT: v_mov_b32_e32 v5, 0
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX10-NEXT: s_endpgm
 %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr
@@ -1844,18 +1844,18 @@
 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
 ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0
-; GFX9-NEXT: v_and_or_b32 v5, v1, v0, v2
+; GFX9-NEXT: v_and_or_b32 v6, v1, v0, v2
 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
 ; GFX9-NEXT: v_mov_b32_e32 v2, s10
 ; GFX9-NEXT: v_mov_b32_e32 v3, s11
 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3]
 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
 ; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3]
 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT: s_endpgm
 ;
@@ -1882,18 +1882,18 @@
 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
 ; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
-; GFX8-NEXT: v_or_b32_e32 v5, v0, v2
+; GFX8-NEXT: v_or_b32_e32 v6, v0, v2
 ; GFX8-NEXT: v_mov_b32_e32 v0, s8
 ; GFX8-NEXT: v_mov_b32_e32 v1, s9
 ; GFX8-NEXT: v_mov_b32_e32 v2, s10
 ; GFX8-NEXT: v_mov_b32_e32 v3, s11
 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3]
 ; GFX8-NEXT: v_mov_b32_e32 v4, 0
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
 ; GFX8-NEXT: v_mov_b32_e32 v5, 0
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3]
 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT: s_endpgm
 ;
@@ -1940,17 +1940,17 @@
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v0
 ; GFX10-NEXT: s_mov_b32 s0, 0xffff
 ; GFX10-NEXT: s_and_b32 s1, s4, s0
 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6
 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v4
-; GFX10-NEXT: v_lshlrev_b32_e64 v5, v1, s1
-; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v4
-; GFX10-NEXT: v_xor_b32_e32 v6, -1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v6
+; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, s1
+; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6
+; GFX10-NEXT: v_xor_b32_e32 v5, -1, v2
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: v_mov_b32_e32 v0, s9
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo
@@ -1960,13 +1960,13 @@
 ; GFX10-NEXT: v_mov_b32_e32 v1, s9
 ; GFX10-NEXT: v_mov_b32_e32 v2, s10
 ; GFX10-NEXT: v_mov_b32_e32 v3, s11
-; GFX10-NEXT: v_and_or_b32 v5, v7, v6, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1
+; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4
 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
 ; GFX10-NEXT: v_mov_b32_e32 v5, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1
 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX10-NEXT: s_endpgm
 %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr
@@ -1997,18 +1997,18 @@
 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
 ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
-; GFX9-NEXT: v_and_or_b32 v5, v2, v1, v0
+; GFX9-NEXT: v_and_or_b32 v6, v2, v1, v0
 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3]
 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
 ; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3]
 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT: s_endpgm
 ;
@@ -2034,18 +2034,18 @@
 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
 ; GFX8-NEXT: v_and_b32_e32 v1, v2, v1
-; GFX8-NEXT: v_or_b32_e32 v5, v1, v0
+; GFX8-NEXT: v_or_b32_e32 v6, v1, v0
 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
 ; GFX8-NEXT: v_mov_b32_e32 v2, s6
 ; GFX8-NEXT: v_mov_b32_e32 v3, s7
 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3]
 ; GFX8-NEXT: v_mov_b32_e32 v4, 0
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
 ; GFX8-NEXT: v_mov_b32_e32 v5, 0
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3]
 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT: s_endpgm
 ;
@@ -2091,17 +2091,17 @@
 ; GFX10-LABEL: insertelement_s_v8i16_v_v:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v1
 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v1
 ; GFX10-NEXT: s_mov_b32 s0, 0xffff
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2
-; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v4
-; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v4
+; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6
+; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6
 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v4
-; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v6
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_xor_b32_e32 v5, -1, v3
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo
@@ -2111,13 +2111,13 @@
 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
 ; GFX10-NEXT: v_mov_b32_e32 v2, s6
 ; GFX10-NEXT: v_mov_b32_e32 v3, s7
-; GFX10-NEXT: v_and_or_b32 v5, v7, v6, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1
+; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4
 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
 ; GFX10-NEXT: v_mov_b32_e32 v5, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1
 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX10-NEXT: s_endpgm
 %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr
@@ -2141,19 +2141,19 @@
 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
 ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v8, 0
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3]
-; GFX9-NEXT: v_and_or_b32 v7, v7, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v7, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3]
+; GFX9-NEXT: v_and_or_b32 v9, v9, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v9, s[2:3]
+; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off
 ; GFX9-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v8i16_s_v:
@@ -2170,20 +2170,20 @@
 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
+; GFX8-NEXT: v_mov_b32_e32 v7, 0
 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX8-NEXT: v_mov_b32_e32 v8, 0
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v1, v7, v1
-; GFX8-NEXT: v_or_b32_e32 v7, v1, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v7, s[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, 0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3]
+; GFX8-NEXT: v_and_b32_e32 v1, v9, v1
+; GFX8-NEXT: v_or_b32_e32 v9, v1, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v9, s[2:3]
+; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX7-LABEL: insertelement_v_v8i16_s_v:
@@ -2237,14 +2237,14 @@
 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0
 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s1
-; GFX10-NEXT: v_and_or_b32 v7, v2, v7, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0
-; GFX10-NEXT: v_mov_b32_e32 v4, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v7, s2
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v7, s1
-; GFX10-NEXT: v_mov_b32_e32 v5, 0
-; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX10-NEXT: v_and_or_b32 v9, v2, v7, v0
+; GFX10-NEXT: v_mov_b32_e32 v7, 0
+; GFX10-NEXT: v_mov_b32_e32 v8, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v9, s2
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1
+; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off
 ; GFX10-NEXT: s_endpgm
 %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
 %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
@@ -2266,19 +2266,19 @@
 ; GFX9-NEXT: s_not_b32 s5, s0
 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: v_mov_b32_e32 v8, 0
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3]
-; GFX9-NEXT: v_and_or_b32 v7, v1, s5, v0
+; GFX9-NEXT: v_and_or_b32 v9, v1, s5, v0
 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v7, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v9, s[2:3]
+; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off
 ; GFX9-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v8i16_v_s:
@@ -2295,20 +2295,20 @@
 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_e32 v7, 0
+; GFX8-NEXT: v_mov_b32_e32 v8, 0
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3]
 ; GFX8-NEXT: v_and_b32_e32 v1, s5, v1
-; GFX8-NEXT: v_or_b32_e32 v7, v1, v0
+; GFX8-NEXT: v_or_b32_e32 v9, v1, v0
 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v7, s[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, 0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v9, s[2:3]
+; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX7-LABEL: insertelement_v_v8i16_v_s:
@@ -2355,20 +2355,20 @@
 ; GFX10-NEXT: s_mov_b32 s4, 0xffff
 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT: s_lshl_b32 s2, s4, s2
+; GFX10-NEXT: v_mov_b32_e32 v7, 0
 ; GFX10-NEXT: s_not_b32 s2, s2
+; GFX10-NEXT: v_mov_b32_e32 v8, 0
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0
 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s1
-; GFX10-NEXT: v_and_or_b32 v7, v0, s2, v1
+; GFX10-NEXT: v_and_or_b32 v9, v0, s2, v1
 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s3, 0
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0
-; GFX10-NEXT: v_mov_b32_e32 v4, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v7, s2
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v7, s1
-; GFX10-NEXT: v_mov_b32_e32 v5, 0
-; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v9, s2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1
+; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off
 ; GFX10-NEXT: s_endpgm
 %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
 %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
@@ -2390,7 +2390,9 @@
 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
 ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
+; GFX9-NEXT: v_mov_b32_e32 v8, 0
 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v9, 0
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
@@ -2398,11 +2400,9 @@
 ; GFX9-NEXT: v_and_or_b32 v3, v3, v1, v2
 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5]
 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1]
 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
 ; GFX9-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v8i16_v_v:
@@ -2418,7 +2418,9 @@
 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, 0
 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX8-NEXT: v_mov_b32_e32 v9, 0
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
@@ -2427,11 +2429,9 @@
 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v2
 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5]
 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX8-NEXT: v_mov_b32_e32 v4, 0
 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1]
 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, 0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX7-LABEL: insertelement_v_v8i16_v_v:
@@ -2480,6 +2480,8 @@
 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1
 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v8
+; GFX10-NEXT: v_mov_b32_e32 v8, 0
+; GFX10-NEXT: v_mov_b32_e32 v9, 0
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s0
@@ -2487,11 +2489,9 @@
 ; GFX10-NEXT: v_and_or_b32 v3, v3, v2, v0
 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v3, s2
 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo
-; GFX10-NEXT: v_mov_b32_e32 v4, 0
 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v3, s0
 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v3, s1
-; GFX10-NEXT: v_mov_b32_e32 v5, 0
-; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
 ; GFX10-NEXT: s_endpgm
 %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
 %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
@@ -2544,10 +2544,10 @@
 ; GFX9-NEXT: s_cmp_eq_u32 s7, 6
 ; GFX9-NEXT: s_cselect_b32 s6, s16, s14
 ; GFX9-NEXT: s_cmp_eq_u32 s7, 7
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT: s_cselect_b32 s7, s16, s15
 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
@@ -2603,18 +2603,18 @@
 ; GFX8-NEXT: s_cmp_eq_u32 s7, 6
 ; GFX8-NEXT: s_cselect_b32 s6, s16, s14
 ; GFX8-NEXT: s_cmp_eq_u32 s7, 7
+; GFX8-NEXT: v_mov_b32_e32 v5, 0
 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: v_mov_b32_e32 v5, 0
 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT: s_cselect_b32 s7, s16, s15
 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
 ; GFX8-NEXT: v_mov_b32_e32 v4, 16
+; GFX8-NEXT: v_mov_b32_e32 v5, 0
 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
 ; GFX8-NEXT: v_mov_b32_e32 v2, s6
 ; GFX8-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NEXT: v_mov_b32_e32 v5, 0
 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT: s_endpgm
 ;
@@ -3050,12 +3050,11 @@
 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1]
 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
 ; GFX8-NEXT: v_mov_b32_e32 v8, 0
+; GFX8-NEXT: v_mov_b32_e32 v10, 16
 ; GFX8-NEXT: v_mov_b32_e32 v9, 0
+; GFX8-NEXT: v_mov_b32_e32 v11, 0
 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
-; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: v_mov_b32_e32 v0, 16
-; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX7-LABEL: insertelement_s_v16i16_v_s:
@@ -3295,12 +3294,11 @@
 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
 ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11]
 ; GFX8-NEXT: v_mov_b32_e32 v8, 0
+; GFX8-NEXT: v_mov_b32_e32 v10, 16
 ; GFX8-NEXT: v_mov_b32_e32 v9, 0
+; GFX8-NEXT: v_mov_b32_e32 v11, 0
 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
-; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: v_mov_b32_e32 v0, 16
-; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX7-LABEL: insertelement_s_v16i16_s_v:
@@ -3535,12 +3533,11 @@
 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
 ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11]
 ; GFX8-NEXT: v_mov_b32_e32 v8, 0
+; GFX8-NEXT: v_mov_b32_e32 v10, 16
 ; GFX8-NEXT: v_mov_b32_e32 v9, 0
+; GFX8-NEXT: v_mov_b32_e32 v11, 0
 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
-; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: v_mov_b32_e32 v0, 16
-; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX7-LABEL: insertelement_s_v16i16_v_v:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
@@ -12,15 +12,15 @@
 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: global_store_short v[0:1], v2, off
 ; GFX9-NEXT: s_endpgm
 ;
@@ -32,15 +32,15 @@
 ; GFX8-NEXT: v_mov_b32_e32 v1, s4
 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0
 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT: flat_store_short v[0:1], v2
 ; GFX8-NEXT: s_endpgm
 ;
@@ -75,13 +75,13 @@
 ; GFX10-NEXT: global_load_ushort v0, v0, s[2:3]
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s4, s1
 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, s0
 ; GFX10-NEXT: s_movk_i32 s0, 0xff
-; GFX10-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT: global_store_short v[0:1], v2, off
 ; GFX10-NEXT: s_endpgm
 %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr
@@ -97,15 +97,15 @@
 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: global_store_short v[0:1], v2, off
 ; GFX9-NEXT: s_endpgm
 ;
@@ -115,15 +115,15 @@
 ; GFX8-NEXT: v_mov_b32_e32 v1, s2
 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0
 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT: flat_store_short v[0:1], v2
 ; GFX8-NEXT: s_endpgm
 ;
@@ -156,13 +156,13 @@
 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s3, 0
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, s1
 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, s0
 ; GFX10-NEXT: s_movk_i32 s0, 0xff
-; GFX10-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT: global_store_short v[0:1], v2, off
 ; GFX10-NEXT: s_endpgm
 %vec = load <2 x i8>, <2 x i8> addrspace(1)* %ptr
@@ -178,15 +178,15 @@
 ; GFX9-NEXT: global_load_ushort v1, v1, s[2:3]
 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX9-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: global_store_short v[0:1], v2, off
 ; GFX9-NEXT: s_endpgm
 ;
@@ -197,15 +197,15 @@
 ; GFX8-NEXT: flat_load_ushort v1, v[1:2]
 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0
 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT: flat_store_short v[0:1], v2
 ; GFX8-NEXT: s_endpgm
 ;
@@ -241,11 +241,11 @@
 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1
 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT: v_and_b32_sdwa v1, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc_lo
 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT: global_store_short v[0:1], v2, off
 ; GFX10-NEXT: s_endpgm
 %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr
@@ -262,15 +262,15 @@
 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v2, vcc
 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX9-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: global_store_short v[0:1], v2, off
 ; GFX9-NEXT: s_endpgm
 ;
@@ -282,15 +282,15 @@
 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v1, v2, vcc
 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0
 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT: flat_store_short v[0:1], v2
 ; GFX8-NEXT: s_endpgm
 ;
@@ -327,11 +327,11 @@
 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1
 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s4, vcc_lo
-; GFX10-NEXT: v_and_b32_sdwa v1, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, s4, vcc_lo
 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT: global_store_short v[0:1], v2, off
 ; GFX10-NEXT: s_endpgm
 %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr
@@ -352,10 +352,10 @@
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX9-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v0
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: global_store_short v[0:1], v2, off
 ; GFX9-NEXT: s_endpgm
 ;
@@ -371,10 +371,10 @@
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v0
 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT: flat_store_short v[0:1], v2
 ; GFX8-NEXT: s_endpgm
 ;
@@ -410,11 +410,11 @@
 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2
 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT: v_and_b32_sdwa v1, v3, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v3, v3, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT: global_store_short v[0:1], v2, off
 ; GFX10-NEXT: s_endpgm
 %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr
@@ -430,15 +430,15 @@
 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: global_store_short v[0:1], v2, off
 ; GFX9-NEXT: s_endpgm
 ;
@@ -448,15 +448,15 @@
 ; GFX8-NEXT: v_mov_b32_e32 v1, s2
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0
 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT: flat_store_short v[0:1], v2
 ; GFX8-NEXT: s_endpgm
 ;
@@ -491,11 +491,11 @@
 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX10-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX10-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, vcc_lo
 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT: global_store_short v[0:1], v2, off
 ; GFX10-NEXT: s_endpgm
 %vec = load <2 x i8>, <2 x i8> addrspace(1)* %ptr
@@ -510,15 +510,15 @@
 ; GFX9-NEXT: global_load_ushort v0, v[0:1], off
 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc
 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: global_store_short v[0:1], v2, off
 ; GFX9-NEXT: s_endpgm
 ;
@@ -527,15 +527,15 @@
 ; GFX8-NEXT: flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc
 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0
 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT: flat_store_short v[0:1], v2
 ; GFX8-NEXT: s_endpgm
 ;
@@ -569,11 +569,11 @@
 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0
-; GFX10-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo
 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT: global_store_short v[0:1], v2, off
 ; GFX10-NEXT: s_endpgm
 %vec = load <2 x i8>, <2 x i8> addrspace(1)* %ptr
@@ -588,15 +588,15 @@
 ; GFX9-NEXT: global_load_ushort v0, v[0:1], off
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc
 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: global_store_short v[0:1], v2, off
 ; GFX9-NEXT: s_endpgm
 ;
@@ -605,15 +605,15 @@
 ; GFX8-NEXT: flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc
 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0
 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT: flat_store_short v[0:1], v2
 ; GFX8-NEXT: s_endpgm
 ;
@@ -647,11 +647,11 @@
 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX10-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo
 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT: global_store_short v[0:1], v2, off
 ; GFX10-NEXT: s_endpgm
 %vec = load <2 x i8>, <2 x i8> addrspace(1)* %ptr
@@ -744,11 +744,11 @@
 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0
 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v3
-; GFX9-NEXT: v_or3_b32 v2, v0, v2, v1
+; GFX9-NEXT: v_and_or_b32 v4, v0, s4, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_or3_b32 v2, v4, v2, v3
 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
 ; GFX9-NEXT: s_endpgm
 ;
@@ -780,11 +780,11 @@
 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0
 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT: v_or_b32_e32 v2, v0, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v1
+; GFX8-NEXT: v_or_b32_e32 v3, v0, v3
 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
 ; GFX8-NEXT: flat_store_dword v[0:1], v2
 ; GFX8-NEXT: s_endpgm
 ;
@@ -850,14 +850,14 @@
 ; GFX10-NEXT: s_not_b32 s2, s3
 ; GFX10-NEXT: v_mov_b32_e32 v1, 8
 ; GFX10-NEXT: v_and_or_b32 v0, v0, s2, s0
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v3
-; GFX10-NEXT: v_or3_b32 v2, v0, v2, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GFX10-NEXT: v_and_or_b32 v4, v0, s1, v1
 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_or3_b32 v2, v4, v2, v3
 ; GFX10-NEXT: global_store_dword v[0:1], v2, off
 ; GFX10-NEXT: s_endpgm
 %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr
@@ -891,14 +891,14 @@
 ; GFX9-NEXT: s_andn2_b32 s0, s0, s4
 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, s3, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v0
 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v0
 ; GFX9-NEXT: v_and_or_b32 v2, v0, s5, v2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX9-NEXT: v_or3_b32 v2, v2, v0, v1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v1
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_or3_b32 v2, v2, v3, v4
 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
 ; GFX9-NEXT: s_endpgm
 ;
@@ -930,11 +930,11 @@
 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0
 ; GFX8-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX8-NEXT: v_or_b32_e32 v2, v0, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v1
 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
 ; GFX8-NEXT: flat_store_dword v[0:1], v2
 ; GFX8-NEXT: s_endpgm
 ;
@@ -1002,11 +1002,11 @@
 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0
 ; GFX10-NEXT: s_mov_b32 s0, 16
 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2
-; GFX10-NEXT: v_or3_b32 v2, v0, v3, v1
+; GFX10-NEXT: v_and_or_b32 v4, v0, s2, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2
 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_or3_b32 v2, v4, v3, v2
 ; GFX10-NEXT: global_store_dword v[0:1], v2, off
 ; GFX10-NEXT: s_endpgm
 %vec = load <4 x i8>, <4 x i8> addrspace(4)* %ptr
@@ -1039,15 +1039,15 @@
 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5
 ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0
 ; GFX9-NEXT: v_and_or_b32 v0, s0, v0, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v0
 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT: s_mov_b32 s2, 16
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v0
 ; GFX9-NEXT: v_and_or_b32 v2, v0, s5, v2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX9-NEXT: v_or3_b32 v2, v2, v0, v1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v1
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_or3_b32 v2, v2, v3, v4
 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
 ; GFX9-NEXT: s_endpgm
 ;
@@ -1080,11 +1080,11 @@
 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0
 ; GFX8-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX8-NEXT: v_or_b32_e32 v2, v0, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v1
 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
 ; GFX8-NEXT: flat_store_dword v[0:1], v2
 ; GFX8-NEXT: s_endpgm
 ;
@@ -1154,11 +1154,11 @@
 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0
 ; GFX10-NEXT: s_mov_b32 s0, 16
 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2
-; GFX10-NEXT: v_or3_b32 v2, v0, v3, v1
+; GFX10-NEXT: v_and_or_b32 v4, v0, s1, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2
 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_or3_b32 v2, v4, v3, v2
 ; GFX10-NEXT: global_store_dword v[0:1], v2, off
 ; GFX10-NEXT: s_endpgm
 %vec = load <4 x i8>, <4 x i8> addrspace(4)* %ptr
@@ -1190,15 +1190,15 @@
 ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
 ; GFX9-NEXT: v_and_or_b32 v0, s0, v1, v0
 ; GFX9-NEXT: s_mov_b32 s1, 8
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v0
 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT: s_mov_b32 s2, 16
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v0
 ; GFX9-NEXT: v_and_or_b32 v2, v0, s4, v2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX9-NEXT: v_or3_b32 v2, v2, v0, v1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v1
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_or3_b32 v2, v2, v3, v4
 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
 ; GFX9-NEXT: s_endpgm
 ;
@@ -1230,11 +1230,11 @@
 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0
 ; GFX8-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX8-NEXT: v_or_b32_e32 v2, v0, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v1
 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
 ; GFX8-NEXT: flat_store_dword v[0:1], v2
 ; GFX8-NEXT: s_endpgm
 ;
@@ -1303,11 +1303,11 @@
 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0
 ; GFX10-NEXT: s_mov_b32 s0, 16
 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2
-; GFX10-NEXT: v_or3_b32 v2, v0, v3, v1
+; GFX10-NEXT: v_and_or_b32 v4, v0, s1, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2
 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_or3_b32 v2, v4, v3, v2
 ; GFX10-NEXT: global_store_dword v[0:1], v2, off
 ; GFX10-NEXT: s_endpgm
 %vec = load <4 x i8>, <4 x i8> addrspace(4)* %ptr
@@ -1342,11 +1342,11 @@
 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0
 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2
-; GFX9-NEXT: v_or3_b32 v2, v0, v3, v1
+; GFX9-NEXT: v_and_or_b32 v4, v0, s3, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_or3_b32 v2, v4, v3, v2
 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
 ; GFX9-NEXT: s_endpgm
 ;
@@ -1378,11 +1378,11 @@
 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0
 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT: v_or_b32_e32 v2, v0, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v1
+; GFX8-NEXT: v_or_b32_e32 v3, v0, v3
 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
 ; GFX8-NEXT: flat_store_dword v[0:1], v2
 ; GFX8-NEXT: s_endpgm
 ;
@@ -1450,12 +1450,12 @@
 ; GFX10-NEXT: v_mov_b32_e32 v1, 16
 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3
-; GFX10-NEXT: v_or3_b32 v2, v0, v1, v2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT: v_and_or_b32 v2, v0, s1, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3
 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_or3_b32 v2, v2, v4, v3
 ; GFX10-NEXT: global_store_dword v[0:1], v2, off
 ; GFX10-NEXT: s_endpgm
 %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr
@@ -1489,11 +1489,11 @@
 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0
 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2
-; GFX9-NEXT: v_or3_b32 v2, v0, v3, v1
+; GFX9-NEXT: v_and_or_b32 v4, v0, s3, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_or3_b32 v2, v4, v3, v2
 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
 ; GFX9-NEXT: s_endpgm
 ;
@@ -1525,11 +1525,11 @@
 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0
 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT: v_or_b32_e32 v2, v0, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v1
+; GFX8-NEXT: v_or_b32_e32 v3, v0, v3
 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
 ; GFX8-NEXT: flat_store_dword v[0:1], v2
 ; GFX8-NEXT: s_endpgm
 ;
@@ -1594,14 +1594,14 @@
 ; GFX10-NEXT: v_mov_b32_e32 v1, 8
 ; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2
 ; GFX10-NEXT: v_mov_b32_e32 v2, 16
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_and_or_b32 v0, v0, s0, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v3
-; GFX10-NEXT: v_or3_b32 v2, v0, v2, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GFX10-NEXT: v_and_or_b32 v4, v0, s0, v1
 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_or3_b32 v2, v4, v2, v3
 ; GFX10-NEXT: global_store_dword v[0:1], v2, off
 ; GFX10-NEXT: s_endpgm
 %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr
@@ -1636,11 +1636,11 @@
 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0
 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_and_or_b32 v0, v0, v1, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2
-; GFX9-NEXT: v_or3_b32 v2, v0, v4, v1
+; GFX9-NEXT: v_and_or_b32 v3, v0, v1, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_or3_b32 v2, v3, v4, v2
 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
 ; GFX9-NEXT: s_endpgm
 ;
@@ -1671,11 +1671,11 @@
 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0
 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT: v_or_b32_e32 v2, v0, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v1
+; GFX8-NEXT: v_or_b32_e32 v3, v0, v3
 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
 ; GFX8-NEXT: flat_store_dword v[0:1], v2
 ; GFX8-NEXT: s_endpgm
 ;
@@ -1743,12 +1743,12 @@
 ; GFX10-NEXT: v_mov_b32_e32 v1, 16
 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3
-; GFX10-NEXT: v_or3_b32 v2, v0, v1, v2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT: v_and_or_b32 v2, 0xff, v0, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3
 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_or3_b32 v2, v2, v4, v3
 ; GFX10-NEXT: global_store_dword v[0:1], v2, off
 ; GFX10-NEXT: s_endpgm
 %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr
@@ -1763,8 +1763,8 @@
 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX9-NEXT: s_mov_b32 s8, 0x80008
 ; GFX9-NEXT: s_movk_i32 s6, 0xff
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_bfe_u32 s9, s0, s8
 ; GFX9-NEXT: s_and_b32 s7, s0, s6
@@ -1821,9 +1821,9 @@
 ; GFX9-NEXT: s_or_b32 s1, s2, s1
 ; GFX9-NEXT: s_lshl_b32 s2, s3, 24
 ; GFX9-NEXT: s_or_b32 s1, s1, s2
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
 ; GFX9-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: insertelement_s_v8i8_s_s:
@@ -1831,8 +1831,8 @@
 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX8-NEXT: s_mov_b32 s8, 0x80008
 ; GFX8-NEXT: s_movk_i32 s6, 0xff
-; GFX8-NEXT: v_mov_b32_e32 v2, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, 0
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: s_bfe_u32 s9, s0, s8
 ; GFX8-NEXT: s_and_b32 s7, s0, s6
@@ -1889,9 +1889,9 @@
 ; GFX8-NEXT: s_or_b32 s1, s2, s1
 ; GFX8-NEXT: s_lshl_b32 s2, s3, 24
 ; GFX8-NEXT: s_or_b32 s1, s1, s2
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX7-LABEL: insertelement_s_v8i8_s_s:
@@ -1970,8 +1970,8 @@
 ; GFX10-NEXT: s_movk_i32 s2, 0xff
 ; GFX10-NEXT: s_mov_b32 s6, 0x80010
 ; GFX10-NEXT: s_lshr_b32 s7, s5, 2
-; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: s_bfe_u32 s11, s0, s3
 ; GFX10-NEXT: s_bfe_u32 s13, s1, s3
@@ -2009,26 +2009,26 @@
 ; GFX10-NEXT: s_bfe_u32 s7, s0, s3
 ; GFX10-NEXT: s_bfe_u32 s3, s1, s3
 ; GFX10-NEXT: s_and_b32 s5, s0, s2
-; GFX10-NEXT: s_lshr_b32 s4, s0, 24
-; GFX10-NEXT: s_bfe_u32 s0, s0, s6
-; GFX10-NEXT: s_lshl_b32 s7, s7, 8
 ; GFX10-NEXT: s_lshr_b32 s8, s1, 24
 ; GFX10-NEXT: s_and_b32 s2, s1, s2
 ; GFX10-NEXT: s_bfe_u32 s1, s1, s6
 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8
-; GFX10-NEXT: s_lshl_b32 s0, s0, 16
-; GFX10-NEXT: s_or_b32 s5, s5, s7
+; GFX10-NEXT: s_lshr_b32 s4, s0, 24
+; GFX10-NEXT: s_bfe_u32 s0, s0, s6
+; GFX10-NEXT: s_lshl_b32 s7, s7, 8
 ; GFX10-NEXT: s_or_b32 s2, s2, s3
 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16
-; GFX10-NEXT: s_or_b32 s0, s5, s0
-; GFX10-NEXT: s_lshl_b32 s3, s4, 24
+; GFX10-NEXT: s_lshl_b32 s0, s0, 16
+; GFX10-NEXT: s_or_b32 s5, s5, s7
 ; GFX10-NEXT: s_or_b32 s1, s2, s1
 ; GFX10-NEXT: s_lshl_b32 s2, s8, 24
-; GFX10-NEXT: s_or_b32 s0, s0, s3
+; GFX10-NEXT: s_or_b32 s0, s5, s0
+; GFX10-NEXT: s_lshl_b32 s3, s4, 24
 ; GFX10-NEXT: s_or_b32 s1, s1, s2
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT: s_or_b32 s0, s0, s3
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
 ; GFX10-NEXT: s_endpgm
 %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
 %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
@@ -2051,49 +2051,49 @@
 ; GFX9-NEXT: s_lshl_b32 s3, s4, s3
 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1
 ; GFX9-NEXT: s_not_b32 s3, s3
-; GFX9-NEXT: v_mov_b32_e32 v4, s2
-; GFX9-NEXT: v_mov_b32_e32 v2, 8
-; GFX9-NEXT: v_mov_b32_e32 v3, 16
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v1
-; GFX9-NEXT: v_lshlrev_b32_sdwa v7, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v10, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v7
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5
-; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v9
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-NEXT: v_or3_b32 v0, v0, v8, v5
-; GFX9-NEXT: v_or3_b32 v1, v1, v10, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX9-NEXT: v_and_or_b32 v4, v5, s3, v4
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1
-; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0
-; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v5
-; GFX9-NEXT: v_or3_b32 v1, v1, v3, v2
+; GFX9-NEXT: v_mov_b32_e32 v6, s2
+; GFX9-NEXT: v_mov_b32_e32 v4, 8
+; GFX9-NEXT: v_mov_b32_e32 v5, 16
 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v6
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX9-NEXT: v_or3_b32 v0, v0, v7, v4
 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7
+; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8
+; GFX9-NEXT: v_or3_b32 v0, v0, v10, v7
+; GFX9-NEXT: v_or3_b32 v1, v1, v12, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc
+; GFX9-NEXT: v_and_or_b32 v6, v7, s3, v6
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v0
+; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v7
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX9-NEXT: v_or3_b32 v0, v0, v9, v6
+; GFX9-NEXT: v_or3_b32 v1, v1, v5, v4
 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v8i8_s_s:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v2, 8
-; GFX8-NEXT: v_mov_b32_e32 v3, 16
+; GFX8-NEXT: v_mov_b32_e32 v4, 8
+; GFX8-NEXT: v_mov_b32_e32 v5, 16
 ; GFX8-NEXT: s_lshr_b32 s1, s3, 2
 ; GFX8-NEXT: s_and_b32 s3, s3, 3
 ; GFX8-NEXT: s_movk_i32 s0, 0xff
@@ -2103,45 +2103,45 @@
 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1
 ; GFX8-NEXT: s_not_b32 s0, s0
 ; GFX8-NEXT: s_lshl_b32 s2, s2, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, 8
-; GFX8-NEXT: v_mov_b32_e32 v5, 16
+; GFX8-NEXT: v_mov_b32_e32 v6, 8
+; GFX8-NEXT: v_mov_b32_e32 v7, 16
+; GFX8-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-NEXT: v_mov_b32_e32 v3, 0
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0
-; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v9
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v7
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v6
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v2, s0, v2
-; GFX8-NEXT: v_or_b32_e32 v2, s2, v2
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v0
+; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v1
 ; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v7
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v8
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v9
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v5
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v4, s0, v4
+; GFX8-NEXT: v_or_b32_e32 v4, s2, v4
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v7
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT: v_mov_b32_e32 v2, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, 0
 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT: s_endpgm
 ;
@@ -2245,21 +2245,21 @@
 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0
 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX10-NEXT: v_mov_b32_e32 v2, 16
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0
 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1
 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1
 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v5
-; GFX10-NEXT: v_and_or_b32 v1, v1, s4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v6
+; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT: v_and_or_b32 v2, v0, s4, v5
 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX10-NEXT: v_or3_b32 v1, v1, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: v_or3_b32 v0, v0, v7, v4
-; GFX10-NEXT: v_mov_b32_e32 v3, 0
-; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT: v_and_or_b32 v3, v1, s4, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_or3_b32 v2, v2, v7, v4
+; GFX10-NEXT: v_or3_b32 v3, v3, v8, v5
+; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
 ; GFX10-NEXT: s_endpgm
 %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr
 %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
@@ -2321,12 +2321,12 @@
 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1
 ; GFX9-NEXT: v_or3_b32 v0, v4, v0, v2
 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_and_or_b32 v2, v1, s7, v2
+; GFX9-NEXT: v_and_or_b32 v4, v1, s7, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v3
 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; GFX9-NEXT: v_or3_b32 v1, v2, v1, v3
 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5
 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT: s_endpgm
 ;
@@ -2387,10 +2387,10 @@
 ; GFX8-NEXT: v_or_b32_sdwa v2, v1, v2
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v3 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -2510,18 +2510,18 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v3 +; GFX10-NEXT: v_and_or_b32 v3, v0, s2, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4 -; GFX10-NEXT: v_and_or_b32 v1, v1, s2, v5 -; GFX10-NEXT: v_or3_b32 v0, v0, v6, v2 -; GFX10-NEXT: v_or3_b32 v1, v1, v7, v3 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX10-NEXT: v_and_or_b32 v5, v1, s2, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_or3_b32 v2, v3, v6, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_or3_b32 v3, v5, v7, v4 +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx @@ -2584,12 +2584,12 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX9-NEXT: v_or3_b32 v0, v4, v0, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v2, v1, s7, v2 +; GFX9-NEXT: v_and_or_b32 v4, v1, s7, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX9-NEXT: v_or3_b32 v1, v2, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -2652,10 +2652,10 @@ ; GFX8-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v3 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -2781,15 +2781,15 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v3 +; GFX10-NEXT: v_and_or_b32 v3, v0, s2, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v1, v1, s2, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4 -; GFX10-NEXT: v_or3_b32 v0, v0, v6, v2 -; GFX10-NEXT: v_or3_b32 v1, v1, v7, v3 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: v_and_or_b32 v5, v1, s2, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_or3_b32 v2, v3, v6, v2 +; GFX10-NEXT: v_or3_b32 v3, v5, v7, v4 +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx @@ -2851,12 +2851,12 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX9-NEXT: v_or3_b32 v0, v4, v0, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v2, v1, s6, v2 +; GFX9-NEXT: v_and_or_b32 v4, v1, s6, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX9-NEXT: v_or3_b32 v1, v2, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -2918,10 +2918,10 @@ ; GFX8-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v3 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -3046,15 +3046,15 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v3 +; GFX10-NEXT: v_and_or_b32 v3, v0, s2, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v1, v1, s2, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4 -; GFX10-NEXT: v_or3_b32 v0, v0, v6, v2 -; GFX10-NEXT: v_or3_b32 v1, v1, v7, v3 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: v_and_or_b32 v5, v1, s2, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_or3_b32 v2, v3, v6, v2 +; GFX10-NEXT: v_or3_b32 v3, v5, v7, v4 +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx @@ -3068,106 +3068,106 @@ ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 -; 
GFX9-NEXT: v_lshrrev_b32_e32 v5, 2, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 2, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX9-NEXT: s_movk_i32 s3, 0xff ; GFX9-NEXT: s_and_b32 s2, s2, s3 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX9-NEXT: v_lshlrev_b32_e64 v6, v2, s2 +; GFX9-NEXT: v_lshlrev_b32_e64 v8, v2, s2 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, 8 -; GFX9-NEXT: v_mov_b32_e32 v4, 16 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 16 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX9-NEXT: v_or3_b32 v0, v0, v10, v7 -; GFX9-NEXT: v_or3_b32 v1, v1, v12, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v7, v2, v6 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v5 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX9-NEXT: v_or3_b32 v0, v0, v12, v9 +; GFX9-NEXT: v_or3_b32 v1, v1, v14, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, v9, v2, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NEXT: v_or3_b32 v0, v0, v7, v2 -; GFX9-NEXT: v_or3_b32 v1, v1, v4, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: v_or3_b32 v0, v0, v9, v2 +; GFX9-NEXT: v_or3_b32 v1, v1, v6, v5 +; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v8i8_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, 8 -; GFX8-NEXT: v_mov_b32_e32 v4, 16 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 2, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, 8 +; GFX8-NEXT: v_mov_b32_e32 v6, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 2, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_and_b32 s1, s2, s0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX8-NEXT: v_lshlrev_b32_e64 v8, v2, s1 +; GFX8-NEXT: v_lshlrev_b32_e64 v10, v2, s1 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 ; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 -; GFX8-NEXT: v_mov_b32_e32 v5, 8 -; GFX8-NEXT: v_mov_b32_e32 v6, 16 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v9 +; GFX8-NEXT: v_mov_b32_e32 v7, 8 +; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v10 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v2, v3, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v12 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v10 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v8i8_s_v: @@ -3271,20 +3271,20 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v3, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s3, v5 -; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v6 +; GFX10-NEXT: v_and_or_b32 v5, v0, s3, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_or3_b32 v1, v1, 
v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_or3_b32 v0, v0, v7, v4 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: v_and_or_b32 v8, v1, s3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_or3_b32 v2, v5, v7, v4 +; GFX10-NEXT: v_or3_b32 v3, v8, v3, v6 +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx @@ -3306,41 +3306,41 @@ ; GFX9-NEXT: s_lshl_b32 s2, s3, s2 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 ; GFX9-NEXT: s_not_b32 s2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, 8 -; GFX9-NEXT: v_mov_b32_e32 v4, 16 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 16 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v10, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX9-NEXT: v_or3_b32 v0, v0, v8, v5 -; GFX9-NEXT: v_or3_b32 v1, v1, v10, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v5, s2, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX9-NEXT: v_or3_b32 v0, v0, v10, v7 +; GFX9-NEXT: v_or3_b32 v1, v1, v12, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, v7, s2, v2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v5 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v5, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NEXT: v_or3_b32 v0, v0, v7, v2 -; GFX9-NEXT: v_or3_b32 v1, v1, v4, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: v_or3_b32 v0, v0, v9, v2 +; GFX9-NEXT: v_or3_b32 v1, v1, v6, v5 +; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v8i8_v_s: @@ -3348,55 +3348,55 @@ ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: s_lshr_b32 s1, s2, 2 ; GFX8-NEXT: s_and_b32 s2, s2, 3 -; GFX8-NEXT: v_mov_b32_e32 v3, 8 +; GFX8-NEXT: v_mov_b32_e32 v5, 8 ; GFX8-NEXT: s_lshl_b32 s2, s2, 3 -; GFX8-NEXT: v_mov_b32_e32 v4, 16 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v6, 16 +; GFX8-NEXT: v_mov_b32_e32 v9, s2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 ; GFX8-NEXT: s_not_b32 s0, s0 -; GFX8-NEXT: v_mov_b32_e32 v5, 8 -; GFX8-NEXT: v_mov_b32_e32 v6, 16 +; GFX8-NEXT: v_mov_b32_e32 v7, 8 +; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v8 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v3, s0, v3 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v10 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v8i8_v_s: @@ -3499,20 +3499,20 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; 
GFX10-NEXT: v_and_or_b32 v0, v0, s3, v5 -; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_or3_b32 v1, v1, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_or3_b32 v0, v0, v7, v4 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: v_and_or_b32 v2, v0, s3, v5 +; GFX10-NEXT: v_and_or_b32 v3, v1, s3, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_or3_b32 v2, v2, v7, v4 +; GFX10-NEXT: v_or3_b32 v3, v3, v8, v5 +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx @@ -3526,105 +3526,105 @@ ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 2, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 2, v3 ; GFX9-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX9-NEXT: s_movk_i32 s2, 0xff ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xff +; GFX9-NEXT: v_mov_b32_e32 v6, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, v3, v6 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-NEXT: v_mov_b32_e32 v6, 16 +; GFX9-NEXT: v_mov_b32_e32 v7, 8 +; GFX9-NEXT: v_mov_b32_e32 v8, 16 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v10, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s2, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX9-NEXT: v_and_or_b32 v1, v1, s2, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX9-NEXT: v_or3_b32 v0, v0, v11, v8 -; GFX9-NEXT: v_or3_b32 v1, v1, v13, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v8, v3, v2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s2, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX9-NEXT: v_and_or_b32 v1, v1, s2, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; 
GFX9-NEXT: v_or3_b32 v0, v0, v13, v10 +; GFX9-NEXT: v_or3_b32 v1, v1, v15, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, v10, v3, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v7 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v6, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v5 +; GFX9-NEXT: v_and_or_b32 v1, v1, v6, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX9-NEXT: v_or3_b32 v0, v0, v8, v2 -; GFX9-NEXT: v_or3_b32 v1, v1, v6, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: v_or3_b32 v0, v0, v10, v2 +; GFX9-NEXT: v_or3_b32 v1, v1, v8, v3 +; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v8i8_v_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 2, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 2, v3 ; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, 8 -; GFX8-NEXT: v_mov_b32_e32 v6, 16 +; GFX8-NEXT: v_mov_b32_e32 v7, 8 +; GFX8-NEXT: v_mov_b32_e32 v8, 16 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, 0xff +; GFX8-NEXT: v_mov_b32_e32 v6, 0xff ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v3, v4 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 ; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v9 -; GFX8-NEXT: v_mov_b32_e32 v7, 8 -; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v11 +; GFX8-NEXT: v_mov_b32_e32 v9, 8 +; GFX8-NEXT: v_mov_b32_e32 v10, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v1 
-; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v10 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v3, v4, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v12 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v3, v6, v3 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v9 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: 
flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v8i8_v_v: @@ -3728,21 +3728,21 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 16 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, v5, v6 -; GFX10-NEXT: v_and_or_b32 v1, v1, v5, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v2, v0, v5, v6 +; GFX10-NEXT: v_and_or_b32 v3, v1, v5, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_or3_b32 v1, v1, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_or3_b32 v0, v0, v8, v4 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_or3_b32 v2, v2, v8, v4 +; GFX10-NEXT: v_or3_b32 v3, v3, v9, v5 +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx @@ -4235,90 +4235,90 @@ ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: v_mov_b32_e32 v4, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 8 ; GFX9-NEXT: s_movk_i32 s6, 0xff -; GFX9-NEXT: v_mov_b32_e32 v5, 16 ; GFX9-NEXT: s_lshr_b32 s4, s3, 2 ; GFX9-NEXT: s_and_b32 s3, s3, 3 +; GFX9-NEXT: v_mov_b32_e32 v7, 16 ; GFX9-NEXT: s_and_b32 s2, s2, s6 ; GFX9-NEXT: s_lshl_b32 s3, s3, 3 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 ; GFX9-NEXT: s_lshl_b32 s2, s2, s3 ; GFX9-NEXT: s_lshl_b32 s3, s6, s3 ; GFX9-NEXT: s_not_b32 s5, s3 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: v_mov_b32_e32 v8, s2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v0, v0, s6, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v10, 24, v3 -; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v0, v0, s6, v13 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX9-NEXT: v_or3_b32 v0, v0, v12, v7 -; GFX9-NEXT: v_or3_b32 v1, v1, v14, v8 -; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v17 +; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v15 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX9-NEXT: v_or3_b32 v2, v2, v16, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX9-NEXT: v_or3_b32 v0, v0, v14, v9 +; GFX9-NEXT: v_or3_b32 v1, v1, v16, v10 +; GFX9-NEXT: v_and_or_b32 v13, v3, s6, v19 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; GFX9-NEXT: v_or3_b32 v2, v2, v18, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v0, v1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX9-NEXT: v_or3_b32 v3, v3, v18, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[2:3] -; GFX9-NEXT: v_and_or_b32 v6, v7, s5, v6 +; GFX9-NEXT: v_or3_b32 v3, v13, v3, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v3, s[2:3] +; GFX9-NEXT: v_and_or_b32 v8, v9, s5, v8 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v3 -; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v9
-; GFX9-NEXT: v_or3_b32 v3, v3, v5, v4
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_and_or_b32 v0, v0, s6, v10
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v12
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7
-; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[2:3]
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v3
+; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v11
+; GFX9-NEXT: v_and_or_b32 v0, v0, s6, v12
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8
-; GFX9-NEXT: v_or3_b32 v0, v0, v11, v6
-; GFX9-NEXT: v_or3_b32 v1, v1, v13, v7
-; GFX9-NEXT: v_or3_b32 v2, v2, v15, v8
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9
+; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v16
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10
+; GFX9-NEXT: v_or3_b32 v0, v0, v13, v8
+; GFX9-NEXT: v_or3_b32 v1, v1, v15, v9
+; GFX9-NEXT: v_or3_b32 v2, v2, v17, v10
+; GFX9-NEXT: v_or3_b32 v3, v3, v7, v6
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_v_v16i8_s_s:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, 8
; GFX8-NEXT: v_mov_b32_e32 v6, 8
-; GFX8-NEXT: v_mov_b32_e32 v5, 16
+; GFX8-NEXT: v_mov_b32_e32 v8, 8
; GFX8-NEXT: v_mov_b32_e32 v7, 16
+; GFX8-NEXT: v_mov_b32_e32 v9, 16
; GFX8-NEXT: s_and_b32 s1, s3, 3
; GFX8-NEXT: s_movk_i32 s0, 0xff
; GFX8-NEXT: s_lshr_b32 s4, s3, 2
@@ -4330,75 +4330,75 @@
; GFX8-NEXT: s_not_b32 s6, s0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX8-NEXT: v_mov_b32_e32 v4, 0
+; GFX8-NEXT: v_mov_b32_e32 v5, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v0
-; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v0
+; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v11
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v7
+; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v10
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v3
+; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v12
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v17
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v10
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v13
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v19
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v12
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[2:3]
+; GFX8-NEXT: v_and_b32_e32 v6, s6, v6
+; GFX8-NEXT: v_or_b32_e32 v6, s5, v6
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3]
+; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v9
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v5
-; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v8
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v13
+; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3
-; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v10
-; GFX8-NEXT: v_or_b32_e32 v2, v2, v15
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v8
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v11
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v17
-; GFX8-NEXT: v_or_b32_e32 v2, v2, v9
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v10
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v4, s6, v4
-; GFX8-NEXT: v_or_b32_e32 v4, s5, v4
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[2:3]
-; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v7, v2 dst_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3
-; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v11
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v13
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v8
-; GFX8-NEXT: v_or_b32_e32 v2, v2, v15
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v9
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v7
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v5
-; GFX8-NEXT: v_mov_b32_e32 v4, 0
-; GFX8-NEXT: v_or_b32_e32 v2, v2, v8
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v6
-; GFX8-NEXT: v_mov_b32_e32 v5, 0
+; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v7
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v10
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v11
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v9
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v6
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v7
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v10
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v8
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
@@ -4566,32 +4566,32 @@
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s2
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s1
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v3
+; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_lshlrev_b32_sdwa v12, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v3
; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_and_or_b32 v3, v3, s4, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9
; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v10
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6
; GFX10-NEXT: v_and_or_b32 v1, v1, s4, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7
; GFX10-NEXT: v_and_or_b32 v2, v2, s4, v14
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8
-; GFX10-NEXT: v_or3_b32 v3, v3, v5, v4
; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: v_or3_b32 v0, v0, v11, v6
; GFX10-NEXT: v_or3_b32 v1, v1, v13, v7
; GFX10-NEXT: v_or3_b32 v2, v2, v15, v8
-; GFX10-NEXT: v_mov_b32_e32 v5, 0
+; GFX10-NEXT: v_or3_b32 v3, v3, v16, v9
; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX10-NEXT: s_endpgm
%vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
@@ -4692,18 +4692,18 @@
; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2
; GFX9-NEXT: v_mov_b32_e32 v8, 16
-; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_and_or_b32 v5, v2, s11, v5
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3
+; GFX9-NEXT: v_and_or_b32 v5, v2, s11, v5
; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6
-; GFX9-NEXT: v_and_or_b32 v4, v3, s11, v4
-; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v7
-; GFX9-NEXT: v_or3_b32 v3, v4, v3, v5
+; GFX9-NEXT: v_and_or_b32 v6, v3, s11, v4
; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7
; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_or3_b32 v3, v6, v3, v7
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT: s_endpgm
;
@@ -4798,22 +4798,22 @@
; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
; GFX8-NEXT: v_mov_b32_e32 v4, 8
; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX8-NEXT: v_mov_b32_e32 v8, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2
; GFX8-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v3
; GFX8-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6
-; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v7
; GFX8-NEXT: v_or_b32_e32 v2, v2, v5
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v7
; GFX8-NEXT: v_mov_b32_e32 v4, 0
; GFX8-NEXT: v_mov_b32_e32 v5, 0
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v6
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
@@ -5036,8 +5036,8 @@
; GFX10-NEXT: v_or3_b32 v1, v9, v1, v5
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_or3_b32 v2, v11, v2, v7
-; GFX10-NEXT: v_or3_b32 v3, v10, v3, v8
; GFX10-NEXT: v_mov_b32_e32 v5, 0
+; GFX10-NEXT: v_or3_b32 v3, v10, v3, v8
; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX10-NEXT: s_endpgm
%vec = load <16 x i8>, <16 x i8> addrspace(4)* %ptr
@@ -5139,18 +5139,18 @@
; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2
; GFX9-NEXT: v_mov_b32_e32 v8, 16
-; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_and_or_b32 v5, v2, s12, v5
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3
+; GFX9-NEXT: v_and_or_b32 v5, v2, s12, v5
; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6
-; GFX9-NEXT: v_and_or_b32 v4, v3, s12, v4
-; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v7
-; GFX9-NEXT: v_or3_b32 v3, v4, v3, v5
+; GFX9-NEXT: v_and_or_b32 v6, v3, s12, v4
; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7
; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_or3_b32 v3, v6, v3, v7
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT: s_endpgm
;
@@ -5247,22 +5247,22 @@
; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
; GFX8-NEXT: v_mov_b32_e32 v4, 8
; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX8-NEXT: v_mov_b32_e32 v8, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2
; GFX8-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v3
; GFX8-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6
-; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v7
; GFX8-NEXT: v_or_b32_e32 v2, v2, v5
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v7
; GFX8-NEXT: v_mov_b32_e32 v4, 0
; GFX8-NEXT: v_mov_b32_e32 v5, 0
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v6
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
@@ -5486,8 +5486,8 @@
; GFX10-NEXT: v_or3_b32 v1, v9, v1, v5
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_or3_b32 v2, v11, v2, v7
-; GFX10-NEXT: v_or3_b32 v3, v10, v3, v8
; GFX10-NEXT: v_mov_b32_e32 v5, 0
+; GFX10-NEXT: v_or3_b32 v3, v10, v3, v8
; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX10-NEXT: s_endpgm
%vec = load <16 x i8>, <16 x i8> addrspace(4)* %ptr
@@ -5588,18 +5588,18 @@
; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2
; GFX9-NEXT: v_mov_b32_e32 v8, 16
-; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_and_or_b32 v5, v2, s10, v5
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3
+; GFX9-NEXT: v_and_or_b32 v5, v2, s10, v5
; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6
-; GFX9-NEXT: v_and_or_b32 v4, v3, s10, v4
-; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v7
-; GFX9-NEXT: v_or3_b32 v3, v4, v3, v5
+; GFX9-NEXT: v_and_or_b32 v6, v3, s10, v4
; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7
; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_or3_b32 v3, v6, v3, v7
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT: s_endpgm
;
@@ -5695,22 +5695,22 @@
; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
; GFX8-NEXT: v_mov_b32_e32 v4, 8
; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX8-NEXT: v_mov_b32_e32 v8, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2
; GFX8-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v3
; GFX8-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6
-; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v7
; GFX8-NEXT: v_or_b32_e32 v2, v2, v5
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v7
; GFX8-NEXT: v_mov_b32_e32 v4, 0
; GFX8-NEXT: v_mov_b32_e32 v5, 0
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v6
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
@@ -5933,8 +5933,8 @@
; GFX10-NEXT: v_or3_b32 v1, v9, v1, v5
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_or3_b32 v2, v11, v2, v7
-; GFX10-NEXT: v_or3_b32 v3, v10, v3, v8
; GFX10-NEXT: v_mov_b32_e32 v5, 0
+; GFX10-NEXT: v_or3_b32 v3, v10, v3, v8
; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX10-NEXT: s_endpgm
%vec = load <16 x i8>, <16 x i8> addrspace(4)* %ptr
@@ -5951,17 +5951,9 @@
; GFX9-NEXT: s_mov_b32 s1, 16
; GFX9-NEXT: v_mov_b32_e32 v0, 8
; GFX9-NEXT: s_movk_i32 s6, 0xff
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 2, v2
; GFX9-NEXT: v_mov_b32_e32 v1, 16
-; GFX9-NEXT: v_and_b32_e32 v2, 3, v2
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
-; GFX9-NEXT: s_and_b32 s2, s2, s6
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX9-NEXT: v_lshlrev_b32_e64 v8, v2, s2
-; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s6
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v7
-; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v7
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v3
; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v4
@@ -5970,80 +5962,80 @@
; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v5
; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 2, v2
+; GFX9-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10
; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v13
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9
-; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v15
-; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10
; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v6
; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: s_and_b32 s0, s2, s6
; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11
; GFX9-NEXT: v_or3_b32 v3, v3, v14, v9
; GFX9-NEXT: v_or3_b32 v4, v4, v16, v10
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
+; GFX9-NEXT: v_lshlrev_b32_e64 v17, v2, s0
; GFX9-NEXT: v_and_or_b32 v13, v6, s6, v19
; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12
; GFX9-NEXT: v_or3_b32 v5, v5, v18, v11
; GFX9-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v7
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v15
+; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s6
; GFX9-NEXT: v_or3_b32 v6, v13, v6, v12
; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v15
+; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3]
-; GFX9-NEXT: v_and_or_b32 v2, v9, v2, v8
+; GFX9-NEXT: v_and_or_b32 v2, v9, v2, v17
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3]
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v5
-; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v11, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_and_or_b32 v1, v3, s6, v10
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v6
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v5
+; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v8
-; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v12
-; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v14
-; GFX9-NEXT: v_and_or_b32 v8, v2, s6, v0
-; GFX9-NEXT: v_or3_b32 v0, v1, v11, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9
-; GFX9-NEXT: v_or3_b32 v1, v4, v13, v6
-; GFX9-NEXT: v_or3_b32 v2, v5, v15, v7
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_or3_b32 v3, v8, v16, v9
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_and_or_b32 v1, v3, s6, v12
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v10
+; GFX9-NEXT: v_and_or_b32 v10, v2, s6, v0
+; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v16
+; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11
+; GFX9-NEXT: v_or3_b32 v0, v1, v13, v3
+; GFX9-NEXT: v_or3_b32 v1, v4, v15, v6
+; GFX9-NEXT: v_or3_b32 v2, v5, v17, v9
+; GFX9-NEXT: v_or3_b32 v3, v10, v18, v11
+; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_v_v16i8_s_v:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, 8
-; GFX8-NEXT: v_mov_b32_e32 v7, 8
+; GFX8-NEXT: v_mov_b32_e32 v9, 8
; GFX8-NEXT: v_mov_b32_e32 v1, 16
-; GFX8-NEXT: v_mov_b32_e32 v8, 16
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 2, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX8-NEXT: v_mov_b32_e32 v10, 16
; GFX8-NEXT: s_movk_i32 s0, 0xff
; GFX8-NEXT: s_and_b32 s1, s2, s0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
-; GFX8-NEXT: v_lshlrev_b32_e64 v10, v2, s1
-; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v9
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v9
-; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9
+; GFX8-NEXT: v_mov_b32_e32 v7, 0
+; GFX8-NEXT: v_mov_b32_e32 v8, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
@@ -6053,10 +6045,12 @@
; GFX8-NEXT: v_or_b32_sdwa v3, v3, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 2, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v5
-; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX8-NEXT: v_or_b32_sdwa v4, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v12
@@ -6064,55 +6058,61 @@
; GFX8-NEXT: v_or_b32_e32 v3, v3, v16
; GFX8-NEXT: v_lshrrev_b32_e32 v14, 24, v6
; GFX8-NEXT: v_or_b32_sdwa v5, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX8-NEXT: v_or_b32_e32 v1, v4, v18
; GFX8-NEXT: v_lshlrev_b32_e32 v13, 24, v13
; GFX8-NEXT: v_or_b32_e32 v3, v3, v11
; GFX8-NEXT: v_or_b32_e32 v0, v0, v12
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
+; GFX8-NEXT: v_lshlrev_b32_e64 v17, v2, s1
+; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0
; GFX8-NEXT: v_or_b32_e32 v4, v5, v6
; GFX8-NEXT: v_lshlrev_b32_e32 v14, 24, v14
; GFX8-NEXT: v_or_b32_e32 v1, v1, v13
; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v15
; GFX8-NEXT: v_or_b32_e32 v4, v4, v14
; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v1, s[0:1]
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v15
+; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2
; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[2:3]
; GFX8-NEXT: v_and_b32_e32 v2, v5, v2
-; GFX8-NEXT: v_or_b32_e32 v2, v2, v10
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v17
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[2:3]
-; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0
-; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v9
+; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_or_b32_sdwa v3, v3, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v11
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v11
-; GFX8-NEXT: v_or_b32_e32 v9, v0, v13
-; GFX8-NEXT: v_or_b32_e32 v10, v1, v15
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6
-; GFX8-NEXT: v_or_b32_e32 v8, v2, v8
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v13
+; GFX8-NEXT: v_or_b32_e32 v11, v0, v15
+; GFX8-NEXT: v_or_b32_e32 v12, v1, v17
+; GFX8-NEXT: v_or_b32_e32 v10, v2, v10
; GFX8-NEXT: v_or_b32_e32 v0, v3, v4
-; GFX8-NEXT: v_or_b32_e32 v1, v9, v5
-; GFX8-NEXT: v_mov_b32_e32 v4, 0
-; GFX8-NEXT: v_or_b32_e32 v2, v10, v6
-; GFX8-NEXT: v_or_b32_e32 v3, v8, v7
-; GFX8-NEXT: v_mov_b32_e32 v5, 0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_or_b32_e32 v1, v11, v5
+; GFX8-NEXT: v_or_b32_e32 v2, v12, v6
+; GFX8-NEXT: v_or_b32_e32 v3, v10, v9
+; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
; GFX8-NEXT: s_endpgm
;
; GFX7-LABEL: insertelement_v_v16i8_s_v:
@@ -6280,31 +6280,31 @@
; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, v0, s0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v0, s1
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v4
-; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_lshlrev_b32_sdwa v12, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_and_or_b32 v2, v2, s3, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5
-; GFX10-NEXT: v_and_or_b32 v4, v4, s3, v14
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8
-; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_and_or_b32 v10, v0, s3, v1
; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v12
+; GFX10-NEXT: v_and_or_b32 v12, v4, s3, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v5
+; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT: v_and_or_b32 v14, v0, s3, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9
-; GFX10-NEXT: v_or3_b32 v0, v2, v11, v5
-; GFX10-NEXT: v_or3_b32 v2, v4, v15, v8
; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: v_or3_b32 v0, v2, v11, v10
; GFX10-NEXT: v_or3_b32 v1, v3, v13, v6
-; GFX10-NEXT: v_or3_b32 v3, v10, v7, v9
; GFX10-NEXT: v_mov_b32_e32 v5, 0
+; GFX10-NEXT: v_or3_b32 v2, v12, v15, v8
+; GFX10-NEXT: v_or3_b32 v3, v14, v7, v9
; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX10-NEXT: s_endpgm
%vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
@@ -6321,77 +6321,77 @@
; GFX9-NEXT: s_mov_b32 s1, 16
; GFX9-NEXT: v_mov_b32_e32 v0, 8
; GFX9-NEXT: s_movk_i32 s6, 0xff
-; GFX9-NEXT: v_mov_b32_e32 v1, 16
; GFX9-NEXT: s_lshr_b32 s4, s2, 2
; GFX9-NEXT: s_and_b32 s2, s2, 3
+; GFX9-NEXT: v_mov_b32_e32 v1, 16
; GFX9-NEXT: s_lshl_b32 s2, s2, 3
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: s_lshl_b32 s2, s6, s2
; GFX9-NEXT: s_not_b32 s5, s2
; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v4
-; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v5
-; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v11
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7
-; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v13
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v6
-; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v4
+; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v5
+; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v13
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9
-; GFX9-NEXT: v_or3_b32 v3, v3, v12, v7
-; GFX9-NEXT: v_or3_b32 v4, v4, v14, v8
-; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_and_or_b32 v6, v6, s6, v17
+; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v15
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10
-; GFX9-NEXT: v_or3_b32 v5, v5, v16, v9
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v6
+; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11
+; GFX9-NEXT: v_or3_b32 v3, v3, v14, v9
+; GFX9-NEXT: v_or3_b32 v4, v4, v16, v10
+; GFX9-NEXT: v_and_or_b32 v13, v6, s6, v19
+; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; GFX9-NEXT: v_or3_b32 v5, v5, v18, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
-; GFX9-NEXT: v_or3_b32 v6, v6, v18, v10
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3]
-; GFX9-NEXT: v_and_or_b32 v2, v7, s5, v2
+; GFX9-NEXT: v_or3_b32 v6, v13, v6, v12
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3]
+; GFX9-NEXT: v_and_or_b32 v2, v9, s5, v2
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3]
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v5
-; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v11, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_and_or_b32 v1, v3, s6, v10
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v6
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v5
+; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v8
-; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v12
-; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v14
-; GFX9-NEXT: v_and_or_b32 v8, v2, s6, v0
-; GFX9-NEXT: v_or3_b32 v0, v1, v11, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9
-; GFX9-NEXT: v_or3_b32 v1, v4, v13, v6
-; GFX9-NEXT: v_or3_b32 v2, v5, v15, v7
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_or3_b32 v3, v8, v16, v9
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_and_or_b32 v1, v3, s6, v12
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v10
+; GFX9-NEXT: v_and_or_b32 v10, v2, s6, v0
+; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v16
+; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11
+; GFX9-NEXT: v_or3_b32 v0, v1, v13, v3
+; GFX9-NEXT: v_or3_b32 v1, v4, v15, v6
+; GFX9-NEXT: v_or3_b32 v2, v5, v17, v9
+; GFX9-NEXT: v_or3_b32 v3, v10, v18, v11
+; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_v_v16i8_v_s:
@@ -6400,11 +6400,11 @@
; GFX8-NEXT: s_and_b32 s1, s2, 3
; GFX8-NEXT: v_mov_b32_e32 v0, 8
; GFX8-NEXT: s_lshl_b32 s1, s1, 3
-; GFX8-NEXT: v_mov_b32_e32 v7, 8
+; GFX8-NEXT: v_mov_b32_e32 v9, 8
; GFX8-NEXT: v_mov_b32_e32 v1, 16
-; GFX8-NEXT: v_mov_b32_e32 v9, s1
-; GFX8-NEXT: v_mov_b32_e32 v8, 16
-; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_mov_b32_e32 v11, s1
+; GFX8-NEXT: v_mov_b32_e32 v10, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: s_lshr_b32 s4, s2, 2
; GFX8-NEXT: s_movk_i32 s0, 0xff
; GFX8-NEXT: s_lshl_b32 s0, s0, s1
@@ -6412,76 +6412,76 @@
; GFX8-NEXT: s_not_b32 s5, s0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX8-NEXT: v_mov_b32_e32 v7, 0
+; GFX8-NEXT: v_mov_b32_e32 v8, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3
-; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_sdwa v3, v3, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v4
+; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_or_b32_sdwa v3, v3, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v5
-; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v10
+; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v5
+; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_or_b32_sdwa v4, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_or_b32_sdwa v5, v5, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v6
-; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v9
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v14
-; GFX8-NEXT: v_or_b32_sdwa v6, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v11
-; GFX8-NEXT: v_or_b32_e32 v1, v5, v16
-; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v12
-; GFX8-NEXT: v_or_b32_e32 v5, v6, v18
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v9
-; GFX8-NEXT: v_or_b32_e32 v4, v5, v11
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v16
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 24, v6
+; GFX8-NEXT: v_or_b32_sdwa v5, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_or_b32_e32 v1, v4, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 24, v13
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v11
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v12
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 24, v14
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v13
; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v0, vcc
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v14
; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v1, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[2:3]
; GFX8-NEXT: v_and_b32_e32 v5, s5, v5
; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[2:3]
-; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0
-; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v9
+; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_or_b32_sdwa v3, v3, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v11
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v11
-; GFX8-NEXT: v_or_b32_e32 v9, v0, v13
-; GFX8-NEXT: v_or_b32_e32 v10, v1, v15
-; GFX8-NEXT: v_or_b32_e32 v8, v2, v8
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v13
+; GFX8-NEXT: v_or_b32_e32 v11, v0, v15
+; GFX8-NEXT: v_or_b32_e32 v12, v1, v17
+; GFX8-NEXT: v_or_b32_e32 v10, v2, v10
; GFX8-NEXT: v_or_b32_e32 v0, v3, v4
-; GFX8-NEXT: v_or_b32_e32 v1, v9, v5
-; GFX8-NEXT: v_mov_b32_e32 v4, 0
-; GFX8-NEXT: v_or_b32_e32 v2, v10, v6
-; GFX8-NEXT: v_or_b32_e32 v3, v8, v7
-; GFX8-NEXT: v_mov_b32_e32 v5, 0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_or_b32_e32 v1, v11, v5
+; GFX8-NEXT: v_or_b32_e32 v2, v12, v6
+; GFX8-NEXT: v_or_b32_e32 v3, v10, v9
+; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
; GFX8-NEXT: s_endpgm
;
; GFX7-LABEL: insertelement_v_v16i8_v_s:
@@ -6650,29 +6650,29 @@
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v5
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_lshlrev_b32_sdwa v12, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_and_or_b32 v1, v3, s3, v10
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v8
-; GFX10-NEXT: v_and_or_b32 v4, v4, s3, v12
-; GFX10-NEXT: v_and_or_b32 v5, v5, s3, v14
-; GFX10-NEXT: v_and_or_b32 v8, v2, s3, v0
+; GFX10-NEXT: v_and_or_b32 v6, v4, s3, v12
+; GFX10-NEXT: v_and_or_b32 v10, v5, s3, v14
+; GFX10-NEXT: v_and_or_b32 v12, v2, s3, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9
-; GFX10-NEXT: v_or3_b32 v0, v1, v11, v3
-; GFX10-NEXT: v_or3_b32 v1, v4, v13, v6
-; GFX10-NEXT: v_or3_b32 v2, v5, v15, v7
; GFX10-NEXT: v_mov_b32_e32 v4, 0
-; GFX10-NEXT: v_or3_b32 v3, v8, v16, v9
+; GFX10-NEXT: v_or3_b32 v0, v1, v11, v3
; GFX10-NEXT: v_mov_b32_e32 v5, 0
+; GFX10-NEXT: v_or3_b32 v1, v6, v13, v7
+; GFX10-NEXT: v_or3_b32 v2, v10, v15, v8
+; GFX10-NEXT: v_or3_b32 v3, v12, v16, v9
; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX10-NEXT: s_endpgm
%vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
@@ -6686,169 +6686,169 @@
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
; GFX9-NEXT: s_mov_b32 s0, 8
-; GFX9-NEXT: s_mov_b32 s1, 16
; GFX9-NEXT: v_mov_b32_e32 v1, 8
-; GFX9-NEXT: s_movk_i32 s2, 0xff
+; GFX9-NEXT: s_mov_b32 s1, 16
; GFX9-NEXT: v_mov_b32_e32 v8, 16
+; GFX9-NEXT: s_movk_i32 s2, 0xff
; GFX9-NEXT: v_mov_b32_e32 v0, 0xff
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4
; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v5
; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v6
+; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_and_or_b32 v6, v6, v0, v17
+; GFX9-NEXT: v_and_or_b32 v17, v7, v0, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 2, v3
; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_and_or_b32 v5, v5, s2, v15
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 2, v3
-; GFX9-NEXT: v_and_or_b32 v4, v4, s2, v13
+; GFX9-NEXT: v_and_or_b32 v13, v4, s2, v13
+; GFX9-NEXT: v_and_or_b32 v15, v5, s2, v15
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10
-; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v7
; GFX9-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_and_or_b32 v6, v6, v0, v17
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11
-; GFX9-NEXT: v_or3_b32 v4, v4, v14, v9
-; GFX9-NEXT: v_or3_b32 v5, v5, v16, v10
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
-; GFX9-NEXT: v_and_or_b32 v13, v7, v0, v19
+; GFX9-NEXT: v_or3_b32 v9, v13, v14, v9
+; GFX9-NEXT: v_or3_b32 v10, v15, v16, v10
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX9-NEXT: v_or3_b32 v6, v6, v18, v11
; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12
-; GFX9-NEXT: v_or3_b32 v6, v6, v18, v11
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v4, v5, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v9, v10, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19
; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_lshlrev_b32_e32 v3, v3, v0
-; GFX9-NEXT: v_or3_b32 v7, v13, v7, v12
-; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v15
+; GFX9-NEXT: v_or3_b32 v7, v17, v7, v12
+; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v6, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v19
; GFX9-NEXT: v_xor_b32_e32 v3, -1, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v7, s[2:3]
-; GFX9-NEXT: v_and_or_b32 v2, v9, v3, v2
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v2, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v6, v2, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v7, s[2:3]
+; GFX9-NEXT: v_and_or_b32 v2, v11, v3, v2
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v2, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[2:3]
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v5
-; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v11, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v2
+; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_and_or_b32 v4, v4, v0, v13
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7
-; GFX9-NEXT: v_and_or_b32 v5, v5, v0, v15
-; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9
-; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_and_or_b32 v3, v3, v0, v11
-; GFX9-NEXT: v_and_or_b32 v11, v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_and_or_b32 v3, v3, v0, v13
; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7
+; GFX9-NEXT: v_and_or_b32 v9, v9, v0, v15
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10
-; GFX9-NEXT: v_or3_b32 v1, v4, v14, v7
-; GFX9-NEXT: v_or3_b32 v2, v5, v16, v9
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_or3_b32 v0, v3, v12, v6
-; GFX9-NEXT: v_or3_b32 v3, v11, v8, v10
+; GFX9-NEXT: v_and_or_b32 v13, v2, v0, v1
+; GFX9-NEXT: v_and_or_b32 v6, v6, v0, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; GFX9-NEXT: v_or3_b32 v0, v3, v14, v7
; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_or3_b32 v1, v9, v16, v10
+; GFX9-NEXT: v_or3_b32 v2, v6, v18, v11
+; GFX9-NEXT: v_or3_b32 v3, v13, v8, v12
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_v_v16i8_v_v:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 2, v3
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_mov_b32_e32 v1, 8
; GFX8-NEXT: v_mov_b32_e32 v9, 8
+; GFX8-NEXT: v_mov_b32_e32 v1, 8
+; GFX8-NEXT: v_mov_b32_e32 v10, 16
; GFX8-NEXT: v_mov_b32_e32 v8, 16
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX8-NEXT: v_mov_b32_e32 v0, 0xff
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0
-; GFX8-NEXT: v_mov_b32_e32 v10, 16
-; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v11
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v11
-; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11
; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v6
+; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_or_b32_sdwa v6, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v17, v7, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 2, v3
+; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 24, v7
; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_sdwa v4, v4, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v5
; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX8-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v6
-; GFX8-NEXT: v_lshrrev_b32_e32 v14, 24, v7
-; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_sdwa v5, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v6, v7, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v10, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; GFX8-NEXT: v_or_b32_e32 v4, v4, v16
-; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v12
; GFX8-NEXT: v_or_b32_e32 v1, v1, v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v13, 24, v13
-; GFX8-NEXT: v_lshlrev_b32_e32 v14, 24, v14
-; GFX8-NEXT: v_or_b32_e32 v5, v5, v18
-; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
-; GFX8-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v12
-; GFX8-NEXT: v_or_b32_e32 v4, v5, v13
-; GFX8-NEXT: v_or_b32_e32 v5, v6, v14
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v4, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v0, v6, v0
+; GFX8-NEXT: v_or_b32_sdwa v15, v4, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 24, v14
+; GFX8-NEXT: v_or_b32_e32 v14, v15, v16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v10, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v18
+; GFX8-NEXT: v_or_b32_e32 v3, v14, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v11
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19
+; GFX8-NEXT: v_or_b32_e32 v7, v17, v7
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v3, v1, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19
+; GFX8-NEXT: v_or_b32_e32 v7, v7, v13
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[0:1]
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v19
+; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v7, s[2:3]
+; GFX8-NEXT: v_and_b32_e32 v0, v8, v0
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19
; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v0, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v0, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v4, v0, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[2:3]
-; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v0, s[2:3]
+; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3
;
GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v13 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v11 -; GFX8-NEXT: v_or_b32_e32 v8, v0, v10 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v15 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v4 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v7 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v11 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v13 +; GFX8-NEXT: v_or_b32_e32 v10, v0, v10 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v17 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v6 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v8 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_or_b32_e32 v3, v10, v9 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -7014,35 +7014,35 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v7, s1 ; GFX10-NEXT: v_and_or_b32 v0, v10, v2, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v0, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v0, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v0, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v0, s1 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v8, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v12, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v2, v2, v1, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX10-NEXT: v_and_or_b32 v4, v4, v1, v15 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v3, v3, v1, v13 ; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v5 +; GFX10-NEXT: v_and_or_b32 v13, v4, v1, v15 ; GFX10-NEXT: v_and_or_b32 v8, v0, v1, v8 -; GFX10-NEXT: v_and_or_b32 v3, v3, v1, v13 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX10-NEXT: v_or3_b32 v0, v2, v12, v5 -; GFX10-NEXT: v_or3_b32 v2, v4, v16, v7 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_or3_b32 v0, v2, v12, v11 ; GFX10-NEXT: v_or3_b32 v1, v3, v14, v6 -; GFX10-NEXT: v_or3_b32 v3, v8, v9, v10 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: v_or3_b32 v2, v13, v16, v7 +; GFX10-NEXT: v_or3_b32 v3, v8, v9, v10 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll @@ -799,12 +799,12 @@ ; CI-LABEL: flat_atomic_dec_ret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -814,12 +814,12 @@ ; VI-LABEL: flat_atomic_dec_ret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -848,14 +848,14 @@ ; CI-LABEL: flat_atomic_dec_ret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v0, 
s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -865,14 +865,14 @@ ; VI-LABEL: flat_atomic_dec_ret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -904,23 +904,23 @@ ; CI-LABEL: flat_atomic_dec_noret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_noret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_endpgm ; GFX9-LABEL: flat_atomic_dec_noret_i64: ; GFX9: ; %bb.0: @@ -940,27 +940,27 @@ ; CI-LABEL: flat_atomic_dec_noret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_noret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_endpgm ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset: ; GFX9: ; %bb.0: @@ -987,17 +987,17 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; 
CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_add_i32_e32 v3, vcc, v0, v2 -; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v4, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v3 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; CI-NEXT: v_mov_b32_e32 v4, 42 -; CI-NEXT: v_mov_b32_e32 v5, 0 -; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc +; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CI-NEXT: s_endpgm @@ -1009,17 +1009,17 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v3, vcc, v0, v2 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v3 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; VI-NEXT: v_mov_b32_e32 v4, 42 -; VI-NEXT: v_mov_b32_e32 v5, 0 -; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc +; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -1064,14 +1064,14 @@ ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0 -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, 0 -; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_noret_i64_offset_addr64: @@ -1080,14 +1080,14 @@ ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0 -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, 0 -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_endpgm ; 
GFX9-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; GFX9: ; %bb.0: @@ -1358,12 +1358,12 @@ ; CI-LABEL: global_atomic_dec_ret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1373,12 +1373,12 @@ ; VI-LABEL: global_atomic_dec_ret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1407,14 +1407,14 @@ ; CI-LABEL: global_atomic_dec_ret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1424,14 +1424,14 @@ ; VI-LABEL: global_atomic_dec_ret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1463,23 +1463,23 @@ ; CI-LABEL: global_atomic_dec_noret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_dec_noret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; 
VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_endpgm ; GFX9-LABEL: global_atomic_dec_noret_i64: ; GFX9: ; %bb.0: @@ -1499,27 +1499,27 @@ ; CI-LABEL: global_atomic_dec_noret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_dec_noret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_endpgm ; GFX9-LABEL: global_atomic_dec_noret_i64_offset: ; GFX9: ; %bb.0: @@ -1546,17 +1546,17 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_add_i32_e32 v3, vcc, v0, v2 -; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v4, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v3 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; CI-NEXT: v_mov_b32_e32 v4, 42 -; CI-NEXT: v_mov_b32_e32 v5, 0 -; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc +; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CI-NEXT: s_endpgm @@ -1568,17 +1568,17 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v3, vcc, v0, v2 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v3 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; VI-NEXT: v_mov_b32_e32 v4, 42 -; VI-NEXT: v_mov_b32_e32 v5, 0 -; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc +; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc ; VI-NEXT: s_waitcnt 
vmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -1623,14 +1623,14 @@ ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0 -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, 0 -; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_dec_noret_i64_offset_addr64: @@ -1639,14 +1639,14 @@ ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0 -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, 0 -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_endpgm ; GFX9-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; GFX9: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -859,12 +859,12 @@ ; CI-LABEL: global_atomic_inc_ret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -874,12 +874,12 @@ ; VI-LABEL: global_atomic_inc_ret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -918,14 +918,14 @@ ; CI-LABEL: global_atomic_inc_ret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; 
CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -935,14 +935,14 @@ ; VI-LABEL: global_atomic_inc_ret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -982,23 +982,23 @@ ; CI-LABEL: global_atomic_inc_noret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_inc_noret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_inc_noret_i64: @@ -1028,27 +1028,27 @@ ; CI-LABEL: global_atomic_inc_noret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_inc_noret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_inc_noret_i64_offset: @@ 
-1083,17 +1083,17 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_add_i32_e32 v3, vcc, v0, v2 -; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v4, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v3 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; CI-NEXT: v_mov_b32_e32 v4, 42 -; CI-NEXT: v_mov_b32_e32 v5, 0 -; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc +; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CI-NEXT: s_endpgm @@ -1105,17 +1105,17 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v3, vcc, v0, v2 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v3 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; VI-NEXT: v_mov_b32_e32 v4, 42 -; VI-NEXT: v_mov_b32_e32 v5, 0 -; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc +; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -1123,25 +1123,25 @@ ; GFX9-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:40 glc +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 42 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:40 glc +; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id @@ -1159,14 +1159,14 @@ ; 
CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0 -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, 0 -; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_inc_noret_i64_offset_addr64: @@ -1175,34 +1175,34 @@ ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0 -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, 0 -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:40 glc +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v0, v[1:2], s[0:1] offset:40 glc ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 42 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:40 glc +; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v0, v[1:2], s[0:1] offset:40 glc ; GFX10-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id @@ -1560,16 +1560,16 @@ ; ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_add_u32_e32 v2, 2, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, 9 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 9 +; GFX9-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v3, v[0:1] offset:16 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v0, v[1:2] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v2, s[2:3] -; GFX9-NEXT: global_store_dwordx2 
v3, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dword v2, v3, s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: atomic_inc_shl_base_lds_0_i64: @@ -1598,12 +1598,12 @@ ; GCN-LABEL: flat_atomic_inc_ret_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: v_mov_b32_e32 v2, 42 -; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 42 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1618,14 +1618,14 @@ ; CI-LABEL: flat_atomic_inc_ret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1635,14 +1635,14 @@ ; VI-LABEL: flat_atomic_inc_ret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1667,14 +1667,14 @@ ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, 32 ; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1690,12 +1690,12 @@ ; GCN-LABEL: flat_atomic_inc_noret_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GCN-NEXT: v_mov_b32_e32 v2, 42 -; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 42 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: flat_atomic_inc_x2 v[0:1], 
v[2:3], v[0:1] glc ; GCN-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %ptr, i64 42, i32 0, i32 0, i1 false) ret void @@ -1705,27 +1705,27 @@ ; CI-LABEL: flat_atomic_inc_noret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_noret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset: @@ -1742,14 +1742,14 @@ ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 32 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; GFX10-NEXT: s_endpgm %gep = getelementptr i64, i64* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false) @@ -1764,17 +1764,17 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_add_i32_e32 v3, vcc, v0, v2 -; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v4, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v3 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; CI-NEXT: v_mov_b32_e32 v4, 42 -; CI-NEXT: v_mov_b32_e32 v5, 0 -; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc +; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CI-NEXT: s_endpgm @@ -1786,17 +1786,17 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v3, vcc, v0, v2 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; 
VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v3 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; VI-NEXT: v_mov_b32_e32 v4, 42 -; VI-NEXT: v_mov_b32_e32 v5, 0 -; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc +; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -1804,18 +1804,18 @@ ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, 42 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9-NEXT: v_mov_b32_e32 v4, 42 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[4:5] offset:40 glc +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1825,16 +1825,16 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 40 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 40 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 @@ -1858,14 +1858,14 @@ ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0 -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, 0 -; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_atomic_inc_x2 
v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_noret_i64_offset_addr64: @@ -1874,28 +1874,28 @@ ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0 -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, 0 -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:40 glc +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_addr64: @@ -1905,13 +1905,13 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 40 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 40 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; GFX10-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64* %ptr, i32 %id diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -243,8 +243,8 @@ ; HSA-LABEL: {{^}}cast_neg1_group_to_flat_addrspacecast: ; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; HSA: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} -; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} +; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} +; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]] define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 { %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32* @@ -296,9 +296,9 @@ ; CI: enable_sgpr_queue_ptr = 1 ; GFX9: 
enable_sgpr_queue_ptr = 0
-; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
-; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define amdgpu_kernel void @cast_neg1_private_to_flat_addrspacecast() #0 {
%cast = addrspacecast i32 addrspace(5)* inttoptr (i32 -1 to i32 addrspace(5)*) to i32*
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -3381,8 +3381,8 @@
; GFX9-NEXT: s_cbranch_execz BB18_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 5
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -3415,8 +3415,8 @@
; GFX1064-NEXT: s_cbranch_execz BB18_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 5
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
@@ -3448,8 +3448,8 @@
; GFX1032-NEXT: s_cbranch_execz BB18_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
@@ -3798,8 +3798,8 @@
; GFX9-NEXT: s_cbranch_execz BB20_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 5
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -3832,8 +3832,8 @@
; GFX1064-NEXT: s_cbranch_execz BB20_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 5
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1]
@@ -3865,8 +3865,8 @@
; GFX1032-NEXT: s_cbranch_execz BB20_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1]
@@ -4209,8 +4209,8 @@
; GFX9-NEXT: s_cbranch_execz BB22_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 5
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -4242,8 +4242,8 @@
; GFX1064-NEXT: s_cbranch_execz BB22_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 5
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
@@ -4275,8 +4275,8 @@
; GFX1032-NEXT: s_cbranch_execz BB22_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
@@ -4619,8 +4619,8 @@
; GFX9-NEXT: s_cbranch_execz BB24_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 5
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -4652,8 +4652,8 @@
; GFX1064-NEXT: s_cbranch_execz BB24_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 5
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
@@ -4685,8 +4685,8 @@
; GFX1032-NEXT: s_cbranch_execz BB24_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -224,10 +224,10 @@
; GCN-NEXT: v_mov_b32_e32 v0, 2
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: global_load_ushort v0, v[0:1], off
-; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_load_short_d16_hi v0, v[1:2], off
+; GCN-NEXT: global_load_short_d16_hi v0, v[2:3], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
@@ -292,10 +292,10 @@
; GCN-NEXT: v_mov_b32_e32 v0, 2
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: flat_load_ushort v0, v[0:1]
-; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: flat_load_short_d16_hi v0, v[1:2]
+; GCN-NEXT: flat_load_short_d16_hi v0, v[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
--- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
@@ -67,14 +67,14 @@
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 vcc, 0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
+; GFX9-NEXT: v_sub_u32_e32 v3, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7b
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_div_fmas_f32 v2, v1, v1, v1
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b
-; GFX9-NEXT: ds_write_b32 v0, v1 offset:12
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: ds_write_b32 v3, v4 offset:12
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
@@ -84,11 +84,11 @@
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_mov_b32 vcc_lo, 0
-; GFX10-NEXT: v_mov_b32_e32 v2, 0x7b
-; GFX10-NEXT: v_sub_nc_u32_e32 v3, 0, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, 0x7b
+; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: ds_write_b32 v3, v2 offset:12
+; GFX10-NEXT: ds_write_b32 v2, v3 offset:12
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_fmas_f32 v4, s0, s0, s0
; GFX10-NEXT: global_store_dword v[0:1], v4, off
@@ -315,15 +315,15 @@
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 vcc, 0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: v_sub_u32_e32 v0, 0x3fb, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_sub_u32_e32 v3, 0x3fb, v0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7b
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_div_fmas_f32 v2, v1, v1, v1
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b
-; GFX9-NEXT: ds_write2_b32 v0, v1, v3 offset1:1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: ds_write2_b32 v3, v4, v5 offset1:1
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
@@ -333,12 +333,12 @@
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_mov_b32 vcc_lo, 0
-; GFX10-NEXT: v_mov_b32_e32 v2, 0x7b
-; GFX10-NEXT: v_mov_b32_e32 v3, 0
-; GFX10-NEXT: v_sub_nc_u32_e32 v4, 0x3fb, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, 0x7b
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0x3fb, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: ds_write2_b32 v4, v2, v3 offset1:1
+; GFX10-NEXT: ds_write2_b32 v2, v3, v4 offset1:1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_fmas_f32 v5, s0, s0, s0
; GFX10-NEXT: global_store_dword v[0:1], v5, off
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -85,14 +85,14 @@
; GFX7-NEXT: s_cbranch_vccnz BB1_2
; GFX7-NEXT: ; %bb.1: ; %bb0
; GFX7-NEXT: v_mov_b32_e32 v0, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, 9
; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: v_mov_b32_e32 v2, 9
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: BB1_2: ; %bb1
; GFX7-NEXT: v_mov_b32_e32 v0, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, 10
; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: v_mov_b32_e32 v2, 10
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_endpgm
@@ -112,14 +112,14 @@
; GFX9-NEXT: s_cbranch_vccnz BB1_2
; GFX9-NEXT: ; %bb.1: ; %bb0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 9
; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 9
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: BB1_2: ; %bb1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 10
; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 10
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
@@ -138,14 +138,14 @@
; GFX10-NEXT: s_cbranch_vccnz BB1_2
; GFX10-NEXT: ; %bb.1: ; %bb0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: v_mov_b32_e32 v2, 9
; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, 9
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: BB1_2: ; %bb1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: v_mov_b32_e32 v2, 10
; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, 10
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
@@ -341,8 +341,8 @@
; GCN-LABEL: {{^}}global_atomic_dec_ret_i64:
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; CIVI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
@@ -354,8 +354,8 @@
; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset:
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; CIVI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}}
; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
@@ -367,8 +367,8 @@
; GCN-LABEL: {{^}}global_atomic_dec_noret_i64:
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; CIVI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GFX9: global_atomic_dec_x2 v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]$}}
define amdgpu_kernel void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind {
@@ -378,8 +378,8 @@
; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset:
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; CIVI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}}
; GFX9: global_atomic_dec_x2 v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
@@ -190,8 +190,8 @@
; GCN-LABEL: {{^}}global_atomic_inc_ret_i64:
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; CIVI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
@@ -201,9 +201,9 @@
}
; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; CIVI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}}
; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
@@ -215,8 +215,8 @@
; GCN-LABEL: {{^}}global_atomic_inc_noret_i64:
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; CIVI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GFX9: global_atomic_inc_x2 v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]$}}
@@ -227,8 +227,8 @@
; GCN-LABEL: {{^}}global_atomic_inc_noret_i64_offset:
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; CIVI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}}
; GFX9: global_atomic_inc_x2 v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
diff --git a/llvm/test/CodeGen/AMDGPU/remat-vop.mir b/llvm/test/CodeGen/AMDGPU/remat-vop.mir
--- a/llvm/test/CodeGen/AMDGPU/remat-vop.mir
+++ b/llvm/test/CodeGen/AMDGPU/remat-vop.mir
@@ -93,26 +93,17 @@
S_NOP 0, implicit %2
S_ENDPGM 0
...
-# FIXME: V_MOV_B64_PSEUDO is not rematerializable
---
name: test_remat_v_mov_b64_pseudo
tracksRegLiveness: true
-machineFunctionInfo:
- stackPtrOffsetReg: $sgpr32
body: |
bb.0:
; GCN-LABEL: name: test_remat_v_mov_b64_pseudo
; GCN: renamable $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 1, implicit $exec
- ; GCN: SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.1, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.1, align 4, addrspace 5)
- ; GCN: renamable $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 2, implicit $exec
- ; GCN: SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
- ; GCN: renamable $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 3, implicit $exec
- ; GCN: SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.2, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.2, align 4, addrspace 5)
- ; GCN: renamable $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.1, align 4, addrspace 5)
; GCN: S_NOP 0, implicit killed renamable $vgpr0_vgpr1
- ; GCN: renamable $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
+ ; GCN: renamable $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 2, implicit $exec
; GCN: S_NOP 0, implicit killed renamable $vgpr0_vgpr1
- ; GCN: renamable $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.2, align 4, addrspace 5)
+ ; GCN: renamable $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 3, implicit $exec
; GCN: S_NOP 0, implicit killed renamable $vgpr0_vgpr1
; GCN: S_ENDPGM 0
%0:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
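The remat-vop.mir hunk above captures the point of the whole patch: the FIXME stating that V_MOV_B64_PSEUDO is not rematerializable is deleted, and the expected SI_SPILL_V64_SAVE/SI_SPILL_V64_RESTORE pairs disappear in favor of re-emitted V_MOV_B64_PSEUDO definitions. As a rough sketch of the idea being tested (illustrative only; the struct and function below are hypothetical, not LLVM's actual API), rematerialization lets the register allocator recompute a cheap, side-effect-free constant definition at each use instead of paying for a spill/reload round trip:

// Hypothetical sketch of a rematerialization decision. Field and function
// names are illustrative, not LLVM's real interfaces.
struct DefInfo {
  bool IsMoveImm;       // the def materializes an immediate
  bool IsCheapAsMove;   // re-executing it costs no more than a plain move
  bool HasSideEffects;  // memory access, implicit state, etc.
};

// When the live range of such a def must be evicted under register
// pressure, re-emitting the def at each use point is cheaper than
// inserting a stack store plus reloads.
bool preferRematOverSpill(const DefInfo &D) {
  return D.IsMoveImm && D.IsCheapAsMove && !D.HasSideEffects;
}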
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -433,10 +433,10 @@
; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[16:17], v[7:8]
; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[9:10], v0
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB1_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v2
@@ -446,8 +446,8 @@
; GCN-IR-NEXT: v_not_b32_e32 v11, v18
; GCN-IR-NEXT: v_add_i32_e32 v13, vcc, v10, v14
; GCN-IR-NEXT: v_mov_b32_e32 v18, 0
-; GCN-IR-NEXT: v_addc_u32_e32 v14, vcc, v11, v15, vcc
; GCN-IR-NEXT: v_mov_b32_e32 v19, 0
+; GCN-IR-NEXT: v_addc_u32_e32 v14, vcc, v11, v15, vcc
; GCN-IR-NEXT: BB1_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
@@ -471,14 +471,14 @@
; GCN-IR-NEXT: v_mov_b32_e32 v14, v19
; GCN-IR-NEXT: v_mov_b32_e32 v19, v12
; GCN-IR-NEXT: v_subb_u32_e64 v17, s[4:5], v17, v20, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
; GCN-IR-NEXT: v_mov_b32_e32 v18, v11
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz BB1_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB1_5: ; %Flow3
; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB1_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[7:8], 1
; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v3
; GCN-IR-NEXT: v_or_b32_e32 v0, v11, v2
@@ -1503,10 +1503,10 @@
; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4
; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
; GCN-IR-NEXT: v_lshl_b64 v[4:5], 24, v4
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB11_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_lshr_b64 v[12:13], 24, v8
@@ -1514,8 +1514,8 @@
; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc
; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, 58, v10
; GCN-IR-NEXT: v_mov_b32_e32 v14, 0
-; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, 0, v11, vcc
; GCN-IR-NEXT: v_mov_b32_e32 v15, 0
+; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, 0, v11, vcc
; GCN-IR-NEXT: BB11_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1
@@ -1539,14 +1539,14 @@
; GCN-IR-NEXT: v_mov_b32_e32 v11, v15
; GCN-IR-NEXT: v_mov_b32_e32 v15, v7
; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v13, v16, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
; GCN-IR-NEXT: v_mov_b32_e32 v14, v6
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz BB11_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB11_5: ; %Flow3
; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB11_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1
; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v1
; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v0
@@ -1692,83 +1692,83 @@
; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4
; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1
; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GCN-IR-NEXT: v_cndmask_b32_e32 v10, v5, v4, vcc
+; GCN-IR-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc
; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0
-; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v10
+; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v6
; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5]
; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT: v_mov_b32_e32 v6, s8
+; GCN-IR-NEXT: v_mov_b32_e32 v8, s8
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
-; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[4:5]
+; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT: v_cndmask_b32_e64 v8, v8, 0, s[4:5]
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GCN-IR-NEXT: v_mov_b32_e32 v3, v2
-; GCN-IR-NEXT: v_mov_b32_e32 v7, v11
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v7
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB12_6
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4
-; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc
-; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[4:5]
+; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v4
+; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[10:11], v[4:5]
; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4
+; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[8:9], v4
-; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB12_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
-; GCN-IR-NEXT: v_lshr_b64 v[12:13], s[4:5], v8
-; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, -1, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, 47, v10
+; GCN-IR-NEXT: v_lshr_b64 v[12:13], s[4:5], v10
+; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, -1, v0
+; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, -1, v1, vcc
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6
; GCN-IR-NEXT: v_mov_b32_e32 v14, 0
-; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, 0, v11, vcc
; GCN-IR-NEXT: v_mov_b32_e32 v15, 0
+; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc
; GCN-IR-NEXT: BB12_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1
-; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5
-; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6
+; GCN-IR-NEXT: v_lshrrev_b32_e32 v8, 31, v5
+; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v8
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v8, v12
-; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v9, v13, vcc
+; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v10, v12
+; GCN-IR-NEXT: v_subb_u32_e32 v8, vcc, v11, v13, vcc
; GCN-IR-NEXT: v_or_b32_e32 v4, v14, v4
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v6
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v8
; GCN-IR-NEXT: v_and_b32_e32 v17, v14, v0
-; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v14
+; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v14
; GCN-IR-NEXT: v_and_b32_e32 v16, v14, v1
-; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v10
+; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v6
; GCN-IR-NEXT: v_or_b32_e32 v5, v15, v5
-; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v11, vcc
-; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v14
-; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v7, vcc
+; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[6:7]
+; GCN-IR-NEXT: v_mov_b32_e32 v6, v14
+; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v17
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v15
-; GCN-IR-NEXT: v_mov_b32_e32 v15, v7
+; GCN-IR-NEXT: v_mov_b32_e32 v7, v15
+; GCN-IR-NEXT: v_mov_b32_e32 v15, v9
; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v13, v16, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT: v_mov_b32_e32 v14, v6
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT: v_mov_b32_e32 v14, v8
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz BB12_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB12_5: ; %Flow3
; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB12_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1
-; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v1
-; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v0
+; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v1
+; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v0
; GCN-IR-NEXT: BB12_6: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT: v_xor_b32_e32 v0, v6, v2
-; GCN-IR-NEXT: v_xor_b32_e32 v1, v7, v3
+; GCN-IR-NEXT: v_xor_b32_e32 v0, v8, v2
+; GCN-IR-NEXT: v_xor_b32_e32 v1, v9, v3
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
; GCN-IR-NEXT: s_setpc_b64 s[30:31]
@@ -1820,10 +1820,10 @@
; GCN-IR-NEXT: v_sub_i32_e64 v3, s[4:5], 63, v3
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: v_lshl_b64 v[3:4], v[7:8], v3
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB13_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_lshr_b64 v[9:10], v[7:8], v9
@@ -1855,14 +1855,14 @@
; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
; GCN-IR-NEXT: v_sub_i32_e64 v9, s[4:5], v0, v9
; GCN-IR-NEXT: v_subb_u32_e64 v10, s[4:5], v10, v13, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz BB13_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB13_5: ; %Flow3
; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB13_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: v_lshl_b64 v[3:4], v[3:4], 1
; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4
; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -282,10 +282,10 @@
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v4, s6
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; GCN-NEXT: v_mov_b32_e32 v4, s4
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GCN-NEXT: v_mov_b32_e32 v6, s4
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1]
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
%shift = ashr i128 %lhs, %rhs
@@ -587,8 +587,9 @@
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v10, 16
; GCN-NEXT: v_mov_b32_e32 v8, 0
-; GCN-NEXT: v_mov_b32_e32 v9, 0
+; GCN-NEXT: v_mov_b32_e32 v11, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_sub_i32 s6, 64, s16
; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64
@@ -643,11 +644,10 @@
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
; GCN-NEXT: v_mov_b32_e32 v6, s4
; GCN-NEXT: v_mov_b32_e32 v7, s3
-; GCN-NEXT: v_mov_b32_e32 v10, s2
+; GCN-NEXT: v_mov_b32_e32 v12, s2
; GCN-NEXT: v_cndmask_b32_e64 v7, v6, v7, s[0:1]
-; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[0:1]
-; GCN-NEXT: v_mov_b32_e32 v10, 16
-; GCN-NEXT: v_mov_b32_e32 v11, 0
+; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[0:1]
+; GCN-NEXT: v_mov_b32_e32 v9, 0
; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -408,52 +408,52 @@
; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[9:10], v[7:8]
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[0:1], v3
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB1_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, -1, v5
; GCN-IR-NEXT: v_lshr_b64 v[16:17], v[0:1], v9
; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v6, vcc
; GCN-IR-NEXT: v_not_b32_e32 v10, v12
+; GCN-IR-NEXT: v_mov_b32_e32 v18, 0
; GCN-IR-NEXT: v_not_b32_e32 v11, v13
; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, v10, v14
+; GCN-IR-NEXT: v_mov_b32_e32 v19, 0
; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, v11, v15, vcc
-; GCN-IR-NEXT: v_mov_b32_e32 v14, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v15, 0
; GCN-IR-NEXT: BB1_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
+; GCN-IR-NEXT: v_lshl_b64 v[14:15], v[16:17], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v10, 31, v8
-; GCN-IR-NEXT: v_or_b32_e32 v16, v16, v10
+; GCN-IR-NEXT: v_or_b32_e32 v14, v14, v10
; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v3, v16
-; GCN-IR-NEXT: v_subb_u32_e32 v10, vcc, v9, v17, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v7, v14, v7
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v10
-; GCN-IR-NEXT: v_and_b32_e32 v19, v14, v5
-; GCN-IR-NEXT: v_and_b32_e32 v10, 1, v14
-; GCN-IR-NEXT: v_and_b32_e32 v18, v14, v6
-; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v12
-; GCN-IR-NEXT: v_or_b32_e32 v8, v15, v8
-; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v13, vcc
-; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[12:13]
-; GCN-IR-NEXT: v_mov_b32_e32 v12, v14
+; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v3, v14
+; GCN-IR-NEXT: v_subb_u32_e32 v10, vcc, v9, v15, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v7, v18, v7
+; GCN-IR-NEXT: v_add_i32_e32 v18, vcc, 1, v12
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v16, 31, v10
+; GCN-IR-NEXT: v_or_b32_e32 v8, v19, v8
+; GCN-IR-NEXT: v_addc_u32_e32 v19, vcc, 0, v13, vcc
+; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[18:19], v[12:13]
+; GCN-IR-NEXT: v_mov_b32_e32 v12, v18
; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
-; GCN-IR-NEXT: v_sub_i32_e64 v16, s[4:5], v16, v19
-; GCN-IR-NEXT: v_mov_b32_e32 v13, v15
-; GCN-IR-NEXT: v_mov_b32_e32 v15, v11
-; GCN-IR-NEXT: v_subb_u32_e64 v17, s[4:5], v17, v18, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT: v_mov_b32_e32 v14, v10
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: v_and_b32_e32 v10, 1, v16
+; GCN-IR-NEXT: v_and_b32_e32 v17, v16, v6
+; GCN-IR-NEXT: v_and_b32_e32 v16, v16, v5
+; GCN-IR-NEXT: v_sub_i32_e64 v16, s[4:5], v14, v16
+; GCN-IR-NEXT: v_mov_b32_e32 v13, v19
+; GCN-IR-NEXT: v_mov_b32_e32 v19, v11
+; GCN-IR-NEXT: v_subb_u32_e64 v17, s[4:5], v15, v17, s[4:5]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT: v_mov_b32_e32 v18, v10
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz BB1_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB1_5: ; %Flow3
; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB1_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1
; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v8
; GCN-IR-NEXT: v_or_b32_e32 v9, v10, v7
@@ -1682,10 +1682,10 @@
; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[3:4]
; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB11_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_lshr_b64 v[10:11], 24, v6
@@ -1693,8 +1693,8 @@
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc
; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 58, v8
; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc
; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
+; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc
; GCN-IR-NEXT: BB11_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
@@ -1718,14 +1718,14 @@
; GCN-IR-NEXT: v_mov_b32_e32 v9, v13
; GCN-IR-NEXT: v_mov_b32_e32 v13, v5
; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
; GCN-IR-NEXT: v_mov_b32_e32 v12, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz BB11_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB11_5: ; %Flow3
; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB11_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
; GCN-IR-NEXT: v_or_b32_e32 v2, v4, v2
@@ -1870,84 +1870,84 @@
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GCN-IR-NEXT: v_cndmask_b32_e32 v8, v3, v2, vcc
+; GCN-IR-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc
; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0
-; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v8
+; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v4
; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3]
; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT: v_mov_b32_e32 v4, s8
+; GCN-IR-NEXT: v_mov_b32_e32 v6, s8
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3]
-; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5]
+; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
+; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[4:5]
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
-; GCN-IR-NEXT: v_mov_b32_e32 v5, v9
+; GCN-IR-NEXT: v_mov_b32_e32 v7, v5
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB12_6
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
-; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[2:3]
+; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v2
+; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc
+; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[2:3]
; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
+; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2
-; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
+; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB12_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
-; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v6
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 47, v8
+; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v8
+; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, -1, v0
+; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 47, v4
; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc
; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
+; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, 0, v5, vcc
; GCN-IR-NEXT: BB12_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
-; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4
+; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v3
+; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v6, v10
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v7, v11, vcc
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v8, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v9, v11, vcc
; GCN-IR-NEXT: v_or_b32_e32 v2, v12, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v4
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6
; GCN-IR-NEXT: v_and_b32_e32 v15, v12, v0
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v12
+; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12
; GCN-IR-NEXT: v_and_b32_e32 v14, v12, v1
-; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v8
+; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v4
; GCN-IR-NEXT: v_or_b32_e32 v3, v13, v3
-; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc
-; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9]
-; GCN-IR-NEXT: v_mov_b32_e32 v8, v12
-; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
+; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v5, vcc
+; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[4:5]
+; GCN-IR-NEXT: v_mov_b32_e32 v4, v12
+; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v15
-; GCN-IR-NEXT: v_mov_b32_e32 v9, v13
-; GCN-IR-NEXT: v_mov_b32_e32 v13, v5
+; GCN-IR-NEXT: v_mov_b32_e32 v5, v13
+; GCN-IR-NEXT: v_mov_b32_e32 v13, v7
; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT: v_mov_b32_e32 v12, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT: v_mov_b32_e32 v12, v6
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz BB12_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB12_5: ; %Flow3
; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB12_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
-; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2
+; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v3
+; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2
; GCN-IR-NEXT: BB12_6: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v5
-; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v4
-; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v4
-; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v4
+; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v7
+; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v6
+; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v6
+; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v6
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0
@@ -2003,10 +2003,10 @@
; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4
; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB13_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
@@ -2038,14 +2038,14 @@
; GCN-IR-NEXT: v_mov_b32_e32 v14, 0
; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v15
; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
; GCN-IR-NEXT: v_mov_b32_e32 v12, v6
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz BB13_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB13_5: ; %Flow3
; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB13_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v5
; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -374,52 +374,52 @@
; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[12:13], v[6:7]
; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB1_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v12
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v2
; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc
; GCN-IR-NEXT: v_not_b32_e32 v6, v8
+; GCN-IR-NEXT: v_mov_b32_e32 v14, 0
; GCN-IR-NEXT: v_not_b32_e32 v7, v9
; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v6, v10
+; GCN-IR-NEXT: v_mov_b32_e32 v15, 0
; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, v7, v11, vcc
-; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
; GCN-IR-NEXT: BB1_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1
+; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[12:13], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5
-; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6
+; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v0, v12
-; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v1, v13, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6
-; GCN-IR-NEXT: v_and_b32_e32 v15, v10, v2
-; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10
-; GCN-IR-NEXT: v_and_b32_e32 v14, v10, v3
-; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v8
-; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5
-; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc
-; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[8:9]
-; GCN-IR-NEXT: v_mov_b32_e32 v8, v10
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v0, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v1, v11, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v4, v14, v4
+; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v8
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6
+; GCN-IR-NEXT: v_or_b32_e32 v5, v15, v5
+; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v9, vcc
+; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[8:9]
+; GCN-IR-NEXT: v_mov_b32_e32 v8, v14
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v15
-; GCN-IR-NEXT: v_mov_b32_e32 v9, v11
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v7
-; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v13, v14, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v6
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12
+; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v3
+; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v2
+; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v10, v12
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v15
+; GCN-IR-NEXT: v_mov_b32_e32 v15, v7
+; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v11, v13, s[4:5]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT: v_mov_b32_e32 v14, v6
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz BB1_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB1_5: ; %Flow3
; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB1_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1
; GCN-IR-NEXT: v_or_b32_e32 v4, v7, v1
; GCN-IR-NEXT: v_or_b32_e32 v5, v6, v0
@@ -1244,77 +1244,77 @@
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GCN-IR-NEXT: v_cndmask_b32_e32 v8, v3, v2, vcc
-; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 0xffffffd0, v8
-; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
+; GCN-IR-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, 0xffffffd0, v4
+; GCN-IR-NEXT: v_addc_u32_e64 v6, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5]
+; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[5:6]
; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: v_mov_b32_e32 v2, s8
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[5:6]
+; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5]
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
-; GCN-IR-NEXT: v_mov_b32_e32 v3, v9
+; GCN-IR-NEXT: v_mov_b32_e32 v3, v7
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB9_6
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v4
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
-; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4
-; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[4:5]
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2
-; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v5
+; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc
+; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v5
+; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[5:6]
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2
+; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB9_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
-; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v6
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 47, v8
-; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc
-; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
+; GCN-IR-NEXT: v_lshr_b64 v[12:13], s[4:5], v8
+; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, -1, v0
+; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc
+; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, 47, v4
+; GCN-IR-NEXT: v_mov_b32_e32 v14, 0
+; GCN-IR-NEXT: v_mov_b32_e32 v15, 0
+; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, 0, v7, vcc
; GCN-IR-NEXT: BB9_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
+; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4
+; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v6, v10
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v7, v11, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v2, v12, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v4
-; GCN-IR-NEXT: v_and_b32_e32 v15, v12, v0
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v12
-; GCN-IR-NEXT: v_and_b32_e32 v14, v12, v1
-; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v8
-; GCN-IR-NEXT: v_or_b32_e32 v3, v13, v3
-; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc
-; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9]
-; GCN-IR-NEXT: v_mov_b32_e32 v8, v12
-; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v15
-; GCN-IR-NEXT: v_mov_b32_e32 v9, v13
-; GCN-IR-NEXT: v_mov_b32_e32 v13, v5
-; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT: v_mov_b32_e32 v12, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, v8, v4
+; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, v9, v13, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v2, v14, v2
+; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v10
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v7, 31, v5
+; GCN-IR-NEXT: v_or_b32_e32 v3, v15, v3
+; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v11, vcc
+; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[10:11]
+; GCN-IR-NEXT: v_mov_b32_e32 v10, v14
+; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
+; GCN-IR-NEXT: v_and_b32_e32 v5, 1, v7
+; GCN-IR-NEXT: v_and_b32_e32 v16, v7, v1
+; GCN-IR-NEXT: v_and_b32_e32 v7, v7, v0
+; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v4, v7
+; GCN-IR-NEXT: v_mov_b32_e32 v11, v15
+; GCN-IR-NEXT: v_mov_b32_e32 v15, v6
+; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v13, v16, s[4:5]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT: v_mov_b32_e32 v14, v5
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz BB9_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB9_5: ; %Flow3
; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB9_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
-; GCN-IR-NEXT: v_or_b32_e32 v3, v5, v1
-; GCN-IR-NEXT: v_or_b32_e32 v2, v4, v0
+; GCN-IR-NEXT: v_or_b32_e32 v3, v6, v1
+; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v0
; GCN-IR-NEXT: BB9_6: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN-IR-NEXT: v_mov_b32_e32 v0, v2
@@ -1359,10 +1359,10 @@
; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[7:8], v[4:5]
; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB10_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_lshr_b64 v[7:8], v[0:1], v7
@@ -1394,14 +1394,14 @@
; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], v7, v11
; GCN-IR-NEXT: v_subb_u32_e64 v8, s[4:5], v8, v6, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
; GCN-IR-NEXT: v_mov_b32_e32 v9, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz BB10_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB10_5: ; %Flow3
; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB10_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1
; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0
@@ -1736,17 +1736,17 @@
; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[7:8], v[4:5]
; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB12_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_lshr_b64 v[7:8], v[0:1], v7
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc4, v6
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc
; GCN-IR-NEXT: BB12_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1
@@ -1769,14 +1769,14 @@
; GCN-IR-NEXT: v_mov_b32_e32 v1, v10
; GCN-IR-NEXT: v_mov_b32_e32 v10, v5
; GCN-IR-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v8, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
; GCN-IR-NEXT: v_mov_b32_e32 v9, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz BB12_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB12_5: ; %Flow3
; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB12_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1
; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -383,52 +383,52 @@
; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[7:8], v[5:6]
; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB1_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v2
; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[0:1], v7
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc
; GCN-IR-NEXT: v_not_b32_e32 v8, v10
+; GCN-IR-NEXT: v_mov_b32_e32 v16, 0
; GCN-IR-NEXT: v_not_b32_e32 v9, v11
; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, v8, v12
+; GCN-IR-NEXT: v_mov_b32_e32 v17, 0
; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, v9, v13, vcc
-; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
; GCN-IR-NEXT: BB1_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[14:15], v[14:15], 1
+; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[14:15], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v8, 31, v5
-; GCN-IR-NEXT: v_or_b32_e32 v14, v14, v8
+; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v8
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v6, v14
-; GCN-IR-NEXT: v_subb_u32_e32 v8, vcc, v7, v15, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v8
-; GCN-IR-NEXT: v_and_b32_e32 v17, v12, v2
-; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v12
-; GCN-IR-NEXT: v_and_b32_e32 v16, v12, v3
-; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v10
-; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5
-; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v11, vcc
-; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v12
+; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v6, v12
+; GCN-IR-NEXT: v_subb_u32_e32 v8, vcc, v7, v13, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v4, v16, v4
+; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, 1, v10
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v8
+; GCN-IR-NEXT: v_or_b32_e32 v5, v17, v5
+; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, 0, v11, vcc
+; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[10:11]
+; GCN-IR-NEXT: v_mov_b32_e32 v10, v16
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT: v_sub_i32_e64 v14, s[4:5], v14, v17
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v13
-; GCN-IR-NEXT: v_mov_b32_e32 v13, v9
-; GCN-IR-NEXT: v_subb_u32_e64 v15, s[4:5], v15, v16, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT: v_mov_b32_e32 v12, v8
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v14
+; GCN-IR-NEXT: v_and_b32_e32 v15, v14, v3
+; GCN-IR-NEXT: v_and_b32_e32 v14, v14, v2
+; GCN-IR-NEXT: v_sub_i32_e64 v14, s[4:5], v12, v14
+; GCN-IR-NEXT: v_mov_b32_e32 v11, v17
+; GCN-IR-NEXT: v_mov_b32_e32 v17, v9
+; GCN-IR-NEXT: v_subb_u32_e64 v15, s[4:5], v13, v15, s[4:5]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT: v_mov_b32_e32 v16, v8
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz BB1_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB1_5: ; %Flow3
; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB1_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
; GCN-IR-NEXT: v_or_b32_e32 v7, v9, v5
; GCN-IR-NEXT: v_or_b32_e32 v4, v8, v4
@@ -1263,84 +1263,84 @@
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GCN-IR-NEXT: v_cndmask_b32_e32 v8, v3, v2, vcc
+; GCN-IR-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc
; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0
-; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v8
+; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v4
; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3]
; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT: v_mov_b32_e32 v4, s8
+; GCN-IR-NEXT: v_mov_b32_e32 v6, s8
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3]
-; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5]
+; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
+; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[4:5]
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
-; GCN-IR-NEXT: v_mov_b32_e32 v5, v9
+; GCN-IR-NEXT: v_mov_b32_e32 v7, v5
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB8_6
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
-; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[2:3]
+; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v2
+; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc
+; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[2:3]
; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
+; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2
-; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
+; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB8_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
-; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v6
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 47, v8
+; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v8
+; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, -1, v0
+; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 47, v4
; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc
; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
+; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, 0, v5, vcc
; GCN-IR-NEXT: BB8_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
-; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4
+; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v3
+; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v6, v10
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v7, v11, vcc
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v8, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v9, v11, vcc
; GCN-IR-NEXT: v_or_b32_e32 v2, v12, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v4
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6
; GCN-IR-NEXT: v_and_b32_e32 v15, v12, v0
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v12
+; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12
; GCN-IR-NEXT: v_and_b32_e32 v14, v12, v1
-; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v8
+; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v4
; GCN-IR-NEXT: v_or_b32_e32 v3, v13, v3
-; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc
-; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9]
-; GCN-IR-NEXT: v_mov_b32_e32 v8, v12
-; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
+; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v5, vcc
+; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[4:5]
+; GCN-IR-NEXT: v_mov_b32_e32 v4, v12
+; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v15
-; GCN-IR-NEXT: v_mov_b32_e32 v9, v13
-; GCN-IR-NEXT: v_mov_b32_e32 v13, v5
+; GCN-IR-NEXT: v_mov_b32_e32 v5, v13
+; GCN-IR-NEXT: v_mov_b32_e32 v13, v7
; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT: v_mov_b32_e32 v12, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT: v_mov_b32_e32 v12, v6
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz BB8_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB8_5: ; %Flow3
; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB8_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
-; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2
+; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v3
+; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2
; GCN-IR-NEXT: BB8_6: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v5
-; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v4
-; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v4
-; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v4
+; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v7
+; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v6
+; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v6
+; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v6
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0
@@ -1385,10 +1385,10 @@
; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB9_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
@@ -1420,14 +1420,14 @@
; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v13
; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v12, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz BB9_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB9_5: ; %Flow3
; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB9_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2
diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -239,11 +239,11 @@
; VI-NEXT: s_add_i32 s1, s1, 12
; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; VI-NEXT: s_or_b32 s0, s1, 4
-; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
%load = load <2 x i8>, <2 x i8> addrspace(4)* %arg, align 4
@@ -283,10 +283,10 @@
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_add_u16_e32 v0, 0x3e7, v0
-; VI-NEXT:
v_or_b32_e32 v2, 4, v0 +; VI-NEXT: v_add_u16_e32 v2, 0x3e7, v0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_or_b32_e32 v2, 4, v2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x()