Index: llvm/include/llvm/Target/GlobalISel/Combine.td =================================================================== --- llvm/include/llvm/Target/GlobalISel/Combine.td +++ llvm/include/llvm/Target/GlobalISel/Combine.td @@ -357,6 +357,8 @@ binop_right_to_zero, p2i_to_i2p, i2p_to_p2i]>; +def known_bits_simplifications : GICombineGroup<[and_trivial_mask]>; + def width_reduction_combines : GICombineGroup<[reduce_shl_of_extend]>; def select_combines : GICombineGroup<[select_undef_cmp, select_constant_cmp]>; @@ -367,4 +369,5 @@ identity_combines, simplify_add_to_sub, hoist_logic_op_with_same_opcode_hands, shl_ashr_to_sext_inreg, sext_inreg_of_load, - width_reduction_combines, select_combines]>; + width_reduction_combines, select_combines, + known_bits_simplifications]>; Index: llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -159,11 +159,10 @@ ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_splat: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_mov_b32 s3, 0xffff +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_mov_b32 s1, 0xffc0 ; GFX8-NEXT: s_and_b32 s0, s0, s3 -; GFX8-NEXT: s_and_b32 s2, s2, s3 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s1 ; GFX8-NEXT: s_lshl_b32 s1, s2, 16 @@ -186,10 +185,9 @@ ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_lo: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s2 ; GFX8-NEXT: s_add_i32 s0, s0, 0xffc0 ; GFX8-NEXT: s_add_i32 s1, s1, 4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 @@ -212,10 +210,9 @@ ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_hi: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s2 ; GFX8-NEXT: s_add_i32 s0, s0, 4 ; GFX8-NEXT: s_add_i32 s1, s1, 0xffc0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 @@ -239,13 +236,11 @@ ; ; GFX8-LABEL: s_add_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_mov_b32 s3, 0xffff +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s3 ; GFX8-NEXT: s_and_b32 s1, s1, s3 -; GFX8-NEXT: s_and_b32 s2, s2, s3 -; GFX8-NEXT: s_and_b32 s4, s4, s3 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s4 ; GFX8-NEXT: s_lshl_b32 s1, s2, 16 @@ -271,13 +266,11 @@ ; GFX8-LABEL: s_add_v2i16_fneg_lhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000 -; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_mov_b32 s3, 0xffff +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s3 ; GFX8-NEXT: s_and_b32 s1, s1, s3 -; GFX8-NEXT: s_and_b32 s2, s2, s3 -; GFX8-NEXT: s_and_b32 s4, s4, s3 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s4 ; GFX8-NEXT: s_lshl_b32 s1, s2, 16 @@ -305,13 +298,11 @@ ; GFX8-LABEL: s_add_v2i16_fneg_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000 -; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_mov_b32 s3, 0xffff +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s3 ; GFX8-NEXT: s_and_b32 s1, s1, s3 -; GFX8-NEXT: s_and_b32 s2, s2, s3 -; GFX8-NEXT: s_and_b32 s4, s4, s3 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s4 ; GFX8-NEXT: s_lshl_b32 s1, s2, 16 @@ -343,13 +334,11 @@ ; GFX8-NEXT: s_mov_b32 s2, 0x80008000 ; GFX8-NEXT: s_xor_b32 s1, s1, s2 ; GFX8-NEXT: s_xor_b32 s0, s0, s2 -; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_mov_b32 s3, 0xffff +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s3 ; GFX8-NEXT: s_and_b32 s1, s1, s3 -; GFX8-NEXT: s_and_b32 s2, s2, s3 -; GFX8-NEXT: s_and_b32 s4, s4, s3 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s4 ; GFX8-NEXT: s_lshl_b32 s1, s2, 16 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -47,8 +47,8 @@ ; GFX9-LABEL: v_ashr_i8_7: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 7 -; GFX9-NEXT: v_ashrrev_i16_sdwa v0, s4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mov_b32_e32 v1, 7 +; GFX9-NEXT: v_ashrrev_i16_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = ashr i8 %value, 7 ret i8 %result Index: llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll @@ -514,7 +514,6 @@ ; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24 ; GFX7-NEXT: v_bfi_b32 v2, s4, v0, v2 ; GFX7-NEXT: v_lshr_b64 v[0:1], v[1:2], 16 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_bswap_i48: @@ -524,7 +523,6 @@ ; GFX8-NEXT: v_perm_b32 v1, 0, v1, s4 ; GFX8-NEXT: v_perm_b32 v2, 0, v0, s4 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[1:2] -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_bswap_i48: @@ -534,7 +532,6 @@ ; GFX9-NEXT: v_perm_b32 v1, 0, v1, s4 ; GFX9-NEXT: v_perm_b32 v2, 0, v0, s4 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[1:2] -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %trunc = trunc i64 %src to i48 %bswap = call i48 @llvm.bswap.i48(i48 %trunc) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -224,30 +224,28 @@ ; SI-LABEL: v_uitofp_v4i8_to_v4f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; SI-NEXT: s_movk_i32 s4, 0xff +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; SI-NEXT: v_and_b32_e32 v0, s4, v0 +; SI-NEXT: v_and_b32_e32 v3, s4, v0 ; SI-NEXT: v_and_b32_e32 v1, s4, v1 ; SI-NEXT: v_and_b32_e32 v2, s4, v2 -; SI-NEXT: v_and_b32_e32 v3, s4, v3 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v3 +; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3 +; SI-NEXT: v_mov_b32_e32 v0, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_uitofp_v4i8_to_v4f32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_movk_i32 s4, 0xff -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; VI-NEXT: v_and_b32_sdwa v2, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 -; VI-NEXT: v_and_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v0 +; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; VI-NEXT: v_mov_b32_e32 v0, v4 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll @@ -19,8 +19,7 @@ ; GCN-NEXT: s_and_b32 s1, s2, s5 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_and_b32 s1, s3, s5 -; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_lshl_b32 s1, s3, 24 ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_and_b32 s1, s4, 3 ; GCN-NEXT: s_lshl_b32 s1, s1, 3 @@ -40,11 +39,12 @@ ; GFX9-NEXT: s_and_b32 s2, s2, 3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v3, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v1 -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v3 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v2 ; GFX9-NEXT: s_lshl_b32 s0, s2, 3 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 @@ -61,11 +61,12 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog @@ -86,7 +87,6 @@ ; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -112,11 +112,12 @@ ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v4, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v4, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v2 -; GFX9-NEXT: v_or3_b32 v0, v0, v3, v4 +; GFX9-NEXT: v_or3_b32 v0, v0, v4, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -131,11 +132,12 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v5, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v3, v0, v3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v3, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v5 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -158,7 +160,6 @@ ; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 @@ -189,8 +190,7 @@ ; GFX9-NEXT: s_and_b32 s1, s2, s4 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s3, s4 -; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: s_lshl_b32 s1, s3, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 @@ -213,8 +213,7 @@ ; GFX8-NEXT: s_and_b32 s1, s2, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_lshl_b32 s1, s3, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 @@ -237,8 +236,7 @@ ; GFX7-NEXT: s_and_b32 s1, s2, s4 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s3, s4 -; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_lshl_b32 s1, s3, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: v_lshr_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 @@ -259,14 +257,13 @@ ; GCN-NEXT: s_lshr_b32 s3, s1, 16 ; GCN-NEXT: s_lshr_b32 s4, s1, 24 ; GCN-NEXT: s_and_b32 s1, s1, s0 +; GCN-NEXT: s_and_b32 s0, s3, s0 ; GCN-NEXT: s_lshl_b32 s2, s2, 8 ; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_and_b32 s2, s3, s0 -; GCN-NEXT: s_and_b32 s0, s4, s0 -; GCN-NEXT: s_lshl_b32 s2, s2, 16 -; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_lshl_b32 s0, s0, 24 +; GCN-NEXT: s_lshl_b32 s0, s0, 16 ; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_lshl_b32 s1, s4, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr %element = extractelement <4 x i8> %vector, i32 0 @@ -284,14 +281,13 @@ ; GCN-NEXT: s_lshr_b32 s3, s1, 16 ; GCN-NEXT: s_lshr_b32 s4, s1, 24 ; GCN-NEXT: s_and_b32 s1, s1, s0 +; GCN-NEXT: s_and_b32 s0, s3, s0 ; GCN-NEXT: s_lshl_b32 s2, s2, 8 ; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_and_b32 s2, s3, s0 -; GCN-NEXT: s_and_b32 s0, s4, s0 -; GCN-NEXT: s_lshl_b32 s2, s2, 16 -; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_lshl_b32 s0, s0, 24 +; GCN-NEXT: s_lshl_b32 s0, s0, 16 ; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_lshl_b32 s1, s4, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 8 ; GCN-NEXT: ; return to shader part epilog %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr @@ -310,14 +306,13 @@ ; GCN-NEXT: s_lshr_b32 s3, s1, 16 ; GCN-NEXT: s_lshr_b32 s4, s1, 24 ; GCN-NEXT: s_and_b32 s1, s1, s0 +; GCN-NEXT: s_and_b32 s0, s3, s0 ; GCN-NEXT: s_lshl_b32 s2, s2, 8 ; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_and_b32 s2, s3, s0 -; GCN-NEXT: s_and_b32 s0, s4, s0 -; GCN-NEXT: s_lshl_b32 s2, s2, 16 -; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_lshl_b32 s0, s0, 24 +; GCN-NEXT: s_lshl_b32 s0, s0, 16 ; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_lshl_b32 s1, s4, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: ; return to shader part epilog %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr @@ -336,14 +331,13 @@ ; GCN-NEXT: s_lshr_b32 s3, s1, 16 ; GCN-NEXT: s_lshr_b32 s4, s1, 24 ; GCN-NEXT: s_and_b32 s1, s1, s0 +; GCN-NEXT: s_and_b32 s0, s3, s0 ; GCN-NEXT: s_lshl_b32 s2, s2, 8 ; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_and_b32 s2, s3, s0 -; GCN-NEXT: s_and_b32 s0, s4, s0 -; GCN-NEXT: s_lshl_b32 s2, s2, 16 -; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_lshl_b32 s0, s0, 24 +; GCN-NEXT: s_lshl_b32 s0, s0, 16 ; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_lshl_b32 s1, s4, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 24 ; GCN-NEXT: ; return to shader part epilog %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr @@ -360,11 +354,12 @@ ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v4i8_idx0: @@ -377,11 +372,12 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v4i8_idx0: @@ -396,11 +392,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -421,11 +416,12 @@ ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -439,11 +435,12 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -459,11 +456,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -485,11 +481,12 @@ ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -503,11 +500,12 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -523,11 +521,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -549,11 +546,12 @@ ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -567,11 +565,12 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -587,11 +586,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -620,8 +618,7 @@ ; GCN-NEXT: s_and_b32 s2, s3, s9 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 ; GCN-NEXT: s_or_b32 s0, s0, s2 -; GCN-NEXT: s_and_b32 s2, s5, s9 -; GCN-NEXT: s_lshl_b32 s2, s2, 24 +; GCN-NEXT: s_lshl_b32 s2, s5, 24 ; GCN-NEXT: s_lshr_b32 s6, s1, 8 ; GCN-NEXT: s_or_b32 s0, s0, s2 ; GCN-NEXT: s_and_b32 s2, s6, s9 @@ -633,8 +630,7 @@ ; GCN-NEXT: s_and_b32 s2, s7, s9 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 ; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_and_b32 s2, s8, s9 -; GCN-NEXT: s_lshl_b32 s2, s2, 24 +; GCN-NEXT: s_lshl_b32 s2, s8, 24 ; GCN-NEXT: s_or_b32 s1, s1, s2 ; GCN-NEXT: s_lshr_b32 s2, s4, 2 ; GCN-NEXT: s_cmp_eq_u32 s2, 1 @@ -653,24 +649,25 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: v_mov_b32_e32 v2, 8 ; GFX9-NEXT: s_movk_i32 s1, 0xff ; GFX9-NEXT: s_lshr_b32 s3, s2, 2 ; GFX9-NEXT: s_and_b32 s2, s2, 3 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v5, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v6, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v6, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_sdwa v7, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v8, v1, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v1, s1, v2 -; GFX9-NEXT: v_or3_b32 v0, v0, v5, v6 -; GFX9-NEXT: v_or3_b32 v1, v1, v7, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_and_or_b32 v1, v1, s1, v4 +; GFX9-NEXT: v_or3_b32 v0, v0, v6, v3 +; GFX9-NEXT: v_or3_b32 v1, v1, v7, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: s_lshl_b32 s0, s2, 3 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0 @@ -690,19 +687,21 @@ ; GFX8-NEXT: s_lshl_b32 s0, s1, 3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v7, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v8, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v9, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v9, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v4, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v4, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v9 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v8 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 @@ -730,11 +729,9 @@ ; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX7-NEXT: v_and_b32_e32 v6, s0, v6 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX7-NEXT: v_and_b32_e32 v4, s0, v4 -; GFX7-NEXT: v_and_b32_e32 v7, s0, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 @@ -761,24 +758,25 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 8 ; GFX9-NEXT: s_movk_i32 s5, 0xff -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 2, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 2, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v7, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v8, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v8, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_sdwa v9, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v10, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v1, s5, v3 -; GFX9-NEXT: v_or3_b32 v0, v0, v7, v8 -; GFX9-NEXT: v_or3_b32 v1, v1, v9, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_and_or_b32 v1, v1, s5, v6 +; GFX9-NEXT: v_or3_b32 v0, v0, v8, v5 +; GFX9-NEXT: v_or3_b32 v1, v1, v9, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0 @@ -797,19 +795,21 @@ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v9, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v10, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v11, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v11, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v5, v1, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v5, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v11 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v8 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v10 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0 @@ -841,8 +841,6 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 -; GFX7-NEXT: v_and_b32_e32 v9, s4, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 @@ -881,8 +879,7 @@ ; GCN-NEXT: s_and_b32 s2, s3, s8 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 ; GCN-NEXT: s_or_b32 s0, s0, s2 -; GCN-NEXT: s_and_b32 s2, s4, s8 -; GCN-NEXT: s_lshl_b32 s2, s2, 24 +; GCN-NEXT: s_lshl_b32 s2, s4, 24 ; GCN-NEXT: s_lshr_b32 s5, s1, 8 ; GCN-NEXT: s_or_b32 s0, s0, s2 ; GCN-NEXT: s_and_b32 s2, s5, s8 @@ -894,8 +891,7 @@ ; GCN-NEXT: s_and_b32 s2, s6, s8 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 ; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_and_b32 s2, s7, s8 -; GCN-NEXT: s_lshl_b32 s2, s2, 24 +; GCN-NEXT: s_lshl_b32 s2, s7, 24 ; GCN-NEXT: s_or_b32 s1, s1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: v_mov_b32_e32 v3, s1 @@ -925,8 +921,7 @@ ; GCN-NEXT: s_and_b32 s1, s2, s4 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_and_b32 s1, s3, s4 -; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_lshl_b32 s1, s3, 24 ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr @@ -950,8 +945,7 @@ ; GCN-NEXT: s_and_b32 s1, s2, s4 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_and_b32 s1, s3, s4 -; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_lshl_b32 s1, s3, 24 ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 8 ; GCN-NEXT: ; return to shader part epilog @@ -976,8 +970,7 @@ ; GCN-NEXT: s_and_b32 s1, s2, s4 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_and_b32 s1, s3, s4 -; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_lshl_b32 s1, s3, 24 ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: ; return to shader part epilog @@ -1002,8 +995,7 @@ ; GCN-NEXT: s_and_b32 s1, s2, s4 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_and_b32 s1, s3, s4 -; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_lshl_b32 s1, s3, 24 ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 24 ; GCN-NEXT: ; return to shader part epilog @@ -1028,8 +1020,7 @@ ; GCN-NEXT: s_and_b32 s1, s2, s4 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_and_b32 s1, s3, s4 -; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_lshl_b32 s1, s3, 24 ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr @@ -1053,8 +1044,7 @@ ; GCN-NEXT: s_and_b32 s1, s2, s4 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_and_b32 s1, s3, s4 -; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_lshl_b32 s1, s3, 24 ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 8 ; GCN-NEXT: ; return to shader part epilog @@ -1079,8 +1069,7 @@ ; GCN-NEXT: s_and_b32 s1, s2, s4 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_and_b32 s1, s3, s4 -; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_lshl_b32 s1, s3, 24 ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: ; return to shader part epilog @@ -1105,8 +1094,7 @@ ; GCN-NEXT: s_and_b32 s1, s2, s4 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_and_b32 s1, s3, s4 -; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_lshl_b32 s1, s3, 24 ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 24 ; GCN-NEXT: ; return to shader part epilog @@ -1124,11 +1112,12 @@ ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v8i8_idx0: @@ -1141,11 +1130,12 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v8i8_idx0: @@ -1160,11 +1150,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1185,11 +1174,12 @@ ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1203,11 +1193,12 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1223,11 +1214,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1249,11 +1239,12 @@ ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1267,11 +1258,12 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1287,11 +1279,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1313,11 +1304,12 @@ ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1331,11 +1323,12 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1351,11 +1344,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1378,10 +1370,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v8i8_idx4: @@ -1394,10 +1387,11 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1413,11 +1407,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1439,10 +1432,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1456,10 +1450,11 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1476,11 +1471,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1503,10 +1497,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1520,10 +1515,11 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1540,11 +1536,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1567,10 +1562,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1584,10 +1580,11 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1604,11 +1601,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1637,8 +1633,7 @@ ; GCN-NEXT: s_and_b32 s5, s6, s17 ; GCN-NEXT: s_lshl_b32 s5, s5, 16 ; GCN-NEXT: s_or_b32 s0, s0, s5 -; GCN-NEXT: s_and_b32 s5, s7, s17 -; GCN-NEXT: s_lshl_b32 s5, s5, 24 +; GCN-NEXT: s_lshl_b32 s5, s7, 24 ; GCN-NEXT: s_lshr_b32 s8, s1, 8 ; GCN-NEXT: s_or_b32 s0, s0, s5 ; GCN-NEXT: s_and_b32 s5, s8, s17 @@ -1650,8 +1645,7 @@ ; GCN-NEXT: s_and_b32 s5, s9, s17 ; GCN-NEXT: s_lshl_b32 s5, s5, 16 ; GCN-NEXT: s_or_b32 s1, s1, s5 -; GCN-NEXT: s_and_b32 s5, s10, s17 -; GCN-NEXT: s_lshl_b32 s5, s5, 24 +; GCN-NEXT: s_lshl_b32 s5, s10, 24 ; GCN-NEXT: s_lshr_b32 s11, s2, 8 ; GCN-NEXT: s_or_b32 s1, s1, s5 ; GCN-NEXT: s_and_b32 s5, s11, s17 @@ -1663,8 +1657,7 @@ ; GCN-NEXT: s_and_b32 s5, s12, s17 ; GCN-NEXT: s_lshl_b32 s5, s5, 16 ; GCN-NEXT: s_or_b32 s2, s2, s5 -; GCN-NEXT: s_and_b32 s5, s13, s17 -; GCN-NEXT: s_lshl_b32 s5, s5, 24 +; GCN-NEXT: s_lshl_b32 s5, s13, 24 ; GCN-NEXT: s_lshr_b32 s14, s3, 8 ; GCN-NEXT: s_or_b32 s2, s2, s5 ; GCN-NEXT: s_and_b32 s5, s14, s17 @@ -1676,8 +1669,7 @@ ; GCN-NEXT: s_and_b32 s5, s15, s17 ; GCN-NEXT: s_lshl_b32 s5, s5, 16 ; GCN-NEXT: s_or_b32 s3, s3, s5 -; GCN-NEXT: s_and_b32 s5, s16, s17 -; GCN-NEXT: s_lshl_b32 s5, s5, 24 +; GCN-NEXT: s_lshl_b32 s5, s16, 24 ; GCN-NEXT: s_or_b32 s3, s3, s5 ; GCN-NEXT: s_lshr_b32 s5, s4, 2 ; GCN-NEXT: s_cmp_eq_u32 s5, 1 @@ -1702,38 +1694,42 @@ ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: s_movk_i32 s1, 0xff -; GFX9-NEXT: v_mov_b32_e32 v4, 0xff ; GFX9-NEXT: s_lshr_b32 s3, s2, 2 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xff ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1 ; GFX9-NEXT: s_and_b32 s2, s2, 3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v10, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v11, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX9-NEXT: v_and_b32_sdwa v14, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v15, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v6 -; GFX9-NEXT: v_and_b32_sdwa v12, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v13, v1, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v1, s1, v7 -; GFX9-NEXT: v_and_b32_sdwa v14, v2, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v15, v2, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v2, v2, s1, v8 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or3_b32 v0, v0, v10, v11 -; GFX9-NEXT: v_or3_b32 v1, v1, v12, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_and_or_b32 v1, v1, s1, v8 +; GFX9-NEXT: v_and_b32_sdwa v16, v2, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 24, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX9-NEXT: v_and_or_b32 v2, v2, s1, v10 +; GFX9-NEXT: v_or3_b32 v0, v0, v14, v7 +; GFX9-NEXT: v_or3_b32 v1, v1, v15, v9 +; GFX9-NEXT: v_and_b32_sdwa v17, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: v_and_b32_sdwa v16, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v17, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v13 ; GFX9-NEXT: v_and_or_b32 v3, v3, v4, v5 -; GFX9-NEXT: v_or3_b32 v2, v2, v14, v15 +; GFX9-NEXT: v_or3_b32 v2, v2, v16, v11 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_or3_b32 v3, v3, v16, v17 +; GFX9-NEXT: v_or3_b32 v3, v3, v17, v12 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: s_lshl_b32 s0, s2, 3 @@ -1754,37 +1750,41 @@ ; GFX8-NEXT: s_and_b32 s1, s2, 3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v12, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v13, v0, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v2 +; GFX8-NEXT: v_and_b32_sdwa v16, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v14, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v7, v1, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v14 -; GFX8-NEXT: v_and_b32_sdwa v15, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v16, v2, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v15 -; GFX8-NEXT: v_and_b32_sdwa v17, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v4, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v13 +; GFX8-NEXT: v_and_b32_sdwa v7, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v11 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v6, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v16 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2 +; GFX8-NEXT: v_and_b32_sdwa v17, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GFX8-NEXT: v_and_b32_sdwa v4, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v17 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v10 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX8-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 24, v15 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v12 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v13 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: s_lshl_b32 s0, s1, 3 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0 @@ -1816,27 +1816,24 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 8, v3 ; GFX7-NEXT: v_and_b32_e32 v6, s0, v6 ; GFX7-NEXT: v_and_b32_e32 v9, s0, v9 -; GFX7-NEXT: v_and_b32_e32 v11, v11, v4 +; GFX7-NEXT: v_and_b32_e32 v11, s0, v11 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 24, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v7, s0, v7 -; GFX7-NEXT: v_and_b32_e32 v10, s0, v10 ; GFX7-NEXT: v_and_b32_e32 v12, v12, v4 ; GFX7-NEXT: v_and_b32_e32 v14, v14, v4 ; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v8 -; GFX7-NEXT: v_and_b32_e32 v13, v13, v4 -; GFX7-NEXT: v_and_b32_e32 v15, v15, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 24, v3 ; GFX7-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_and_b32_e32 v4, v15, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 @@ -1844,20 +1841,19 @@ ; GFX7-NEXT: v_or_b32_e32 v1, v1, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v11 -; GFX7-NEXT: v_and_b32_e32 v4, v16, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v14 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v7 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v10 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 24, v16 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v13 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 2 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v15 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 3 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX7-NEXT: s_lshl_b32 s0, s2, 3 @@ -1877,38 +1873,42 @@ ; GFX9-NEXT: s_mov_b32 s4, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: s_movk_i32 s5, 0xff -; GFX9-NEXT: v_mov_b32_e32 v0, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 2, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xff ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 8, v6 ; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v12, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v13, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v14, v4, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v15, v4, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 24, v5 +; GFX9-NEXT: v_and_b32_sdwa v16, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v17, v4, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_or_b32 v3, v3, s5, v8 -; GFX9-NEXT: v_and_or_b32 v4, v4, s5, v9 -; GFX9-NEXT: v_and_b32_sdwa v16, v5, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v17, v5, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v18, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v19, v6, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v4, v4, s5, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v6 +; GFX9-NEXT: v_and_b32_sdwa v18, v5, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v19, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v6, v0, v1 -; GFX9-NEXT: v_or3_b32 v1, v3, v12, v13 -; GFX9-NEXT: v_or3_b32 v3, v4, v14, v15 -; GFX9-NEXT: v_and_or_b32 v5, v5, s5, v10 +; GFX9-NEXT: v_or3_b32 v1, v3, v16, v9 +; GFX9-NEXT: v_or3_b32 v3, v4, v17, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; GFX9-NEXT: v_and_or_b32 v5, v5, s5, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 24, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_or3_b32 v4, v5, v16, v17 +; GFX9-NEXT: v_or3_b32 v4, v5, v18, v13 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_or3_b32 v0, v0, v18, v19 +; GFX9-NEXT: v_or3_b32 v0, v0, v19, v14 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v2 @@ -1929,36 +1929,40 @@ ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v5 -; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 8, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v11 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v7, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v14, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v5 +; GFX8-NEXT: v_and_b32_sdwa v18, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v15, v3, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v16, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v14 -; GFX8-NEXT: v_and_b32_sdwa v8, v4, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v3, v3, v16 -; GFX8-NEXT: v_and_b32_sdwa v17, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v4, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v18, v5, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v4, v4, v17 -; GFX8-NEXT: v_and_b32_sdwa v19, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 8, v6 +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v7, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v8, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v18 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 24, v5 +; GFX8-NEXT: v_and_b32_sdwa v19, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 24, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 24, v15 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v19 +; GFX8-NEXT: v_and_b32_sdwa v0, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v12 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_and_b32_sdwa v0, v6, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v5, v5, v19 -; GFX8-NEXT: v_or_b32_e32 v4, v4, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 24, v17 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v14 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v9 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v15 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v2 @@ -1991,27 +1995,24 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 8, v6 ; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 ; GFX7-NEXT: v_and_b32_e32 v10, s4, v10 -; GFX7-NEXT: v_and_b32_e32 v12, v12, v0 +; GFX7-NEXT: v_and_b32_e32 v12, s4, v12 ; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 24, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 -; GFX7-NEXT: v_and_b32_e32 v11, s4, v11 ; GFX7-NEXT: v_and_b32_e32 v13, v13, v0 ; GFX7-NEXT: v_and_b32_e32 v15, v15, v0 ; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v6 ; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX7-NEXT: v_and_b32_e32 v14, v14, v0 -; GFX7-NEXT: v_and_b32_e32 v16, v16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v17, 24, v6 ; GFX7-NEXT: v_and_b32_e32 v6, v6, v0 +; GFX7-NEXT: v_and_b32_e32 v0, v16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 24, v11 @@ -2019,20 +2020,19 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v12 ; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX7-NEXT: v_and_b32_e32 v0, v17, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 24, v14 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v13 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_or_b32_e32 v5, v6, v15 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v11 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v16 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 24, v17 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v14 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v18 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v16 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v18 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v2 @@ -2062,8 +2062,7 @@ ; GCN-NEXT: s_and_b32 s4, s5, s16 ; GCN-NEXT: s_lshl_b32 s4, s4, 16 ; GCN-NEXT: s_or_b32 s0, s0, s4 -; GCN-NEXT: s_and_b32 s4, s6, s16 -; GCN-NEXT: s_lshl_b32 s4, s4, 24 +; GCN-NEXT: s_lshl_b32 s4, s6, 24 ; GCN-NEXT: s_lshr_b32 s7, s1, 8 ; GCN-NEXT: s_or_b32 s0, s0, s4 ; GCN-NEXT: s_and_b32 s4, s7, s16 @@ -2075,8 +2074,7 @@ ; GCN-NEXT: s_and_b32 s4, s8, s16 ; GCN-NEXT: s_lshl_b32 s4, s4, 16 ; GCN-NEXT: s_or_b32 s1, s1, s4 -; GCN-NEXT: s_and_b32 s4, s9, s16 -; GCN-NEXT: s_lshl_b32 s4, s4, 24 +; GCN-NEXT: s_lshl_b32 s4, s9, 24 ; GCN-NEXT: s_lshr_b32 s10, s2, 8 ; GCN-NEXT: s_or_b32 s1, s1, s4 ; GCN-NEXT: s_and_b32 s4, s10, s16 @@ -2088,8 +2086,7 @@ ; GCN-NEXT: s_and_b32 s4, s11, s16 ; GCN-NEXT: s_lshl_b32 s4, s4, 16 ; GCN-NEXT: s_or_b32 s2, s2, s4 -; GCN-NEXT: s_and_b32 s4, s12, s16 -; GCN-NEXT: s_lshl_b32 s4, s4, 24 +; GCN-NEXT: s_lshl_b32 s4, s12, 24 ; GCN-NEXT: s_lshr_b32 s13, s3, 8 ; GCN-NEXT: s_or_b32 s2, s2, s4 ; GCN-NEXT: s_and_b32 s4, s13, s16 @@ -2101,8 +2098,7 @@ ; GCN-NEXT: s_and_b32 s4, s14, s16 ; GCN-NEXT: s_lshl_b32 s4, s4, 16 ; GCN-NEXT: s_or_b32 s3, s3, s4 -; GCN-NEXT: s_and_b32 s4, s15, s16 -; GCN-NEXT: s_lshl_b32 s4, s4, 24 +; GCN-NEXT: s_lshl_b32 s4, s15, 24 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: s_or_b32 s3, s3, s4 @@ -2131,11 +2127,12 @@ ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v16i8_idx0: @@ -2148,11 +2145,12 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v16i8_idx0: @@ -2167,11 +2165,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -2192,11 +2189,12 @@ ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2210,11 +2208,12 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2230,11 +2229,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -2256,11 +2254,12 @@ ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2274,11 +2273,12 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2294,11 +2294,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -2320,11 +2319,12 @@ ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2338,11 +2338,12 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2358,11 +2359,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -2385,10 +2385,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v16i8_idx4: @@ -2401,10 +2402,11 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2420,11 +2422,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -2446,10 +2447,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2463,10 +2465,11 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2483,11 +2486,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -2510,10 +2512,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2527,10 +2530,11 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2547,11 +2551,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -2574,10 +2577,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2591,10 +2595,11 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2611,11 +2616,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -2637,11 +2641,12 @@ ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v3, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v2, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v2, v2, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v16i8_idx8: @@ -2654,10 +2659,11 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v2 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2673,11 +2679,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2698,11 +2703,12 @@ ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v3, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v2, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v2, v2, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2716,10 +2722,11 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v2 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2736,11 +2743,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2762,11 +2768,12 @@ ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v3, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v2, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v2, v2, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2780,10 +2787,11 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v2 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2800,11 +2808,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2826,11 +2833,12 @@ ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v3, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v2, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v2, v2, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2844,10 +2852,11 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v2 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2864,11 +2873,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2890,11 +2898,12 @@ ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v2, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v3, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v2, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v16i8_idx12: @@ -2907,10 +2916,11 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v3 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2926,11 +2936,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v3 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v3 ; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2951,11 +2960,12 @@ ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v2, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v3, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v2, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2969,10 +2979,11 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v3 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2989,11 +3000,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v3 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v3 ; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3015,11 +3025,12 @@ ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v2, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v3, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v2, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3033,10 +3044,11 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v3 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3053,11 +3065,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v3 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v3 ; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3079,11 +3090,12 @@ ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v2, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v3, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v2, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3097,10 +3109,11 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v3 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3117,11 +3130,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v3 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v3 ; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -596,18 +596,20 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v5, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v5, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v3 -; GFX9-NEXT: v_or3_b32 v0, v0, v4, v5 +; GFX9-NEXT: v_or3_b32 v0, v0, v5, v4 ; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v3, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v1 -; GFX9-NEXT: v_or3_b32 v2, v0, v3, v4 +; GFX9-NEXT: v_or3_b32 v2, v0, v4, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -618,8 +620,8 @@ ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: v_mov_b32_e32 v1, 8 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: s_and_b32 s1, s3, 3 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 3 ; GFX8-NEXT: s_and_b32 s2, s2, s0 ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 @@ -629,20 +631,22 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v5, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v6, v0, v3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v6, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v5 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v3, v0, v3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v3, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_or_b32_e32 v2, v0, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -670,7 +674,6 @@ ; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -685,7 +688,6 @@ ; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -717,8 +719,7 @@ ; GFX9-NEXT: s_and_b32 s2, s3, s6 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s2, s5, s6 -; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshl_b32 s2, s5, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s2, s4, 3 ; GFX9-NEXT: s_lshl_b32 s2, s2, 3 @@ -727,11 +728,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, s2, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_or_b32 v1, v0, s6, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v2, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX9-NEXT: v_or3_b32 v2, v1, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -741,7 +743,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX8-NEXT: s_movk_i32 s5, 0xff -; GFX8-NEXT: v_mov_b32_e32 v2, 8 +; GFX8-NEXT: v_mov_b32_e32 v3, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s1, s0, 8 ; GFX8-NEXT: s_and_b32 s1, s1, s5 @@ -753,8 +755,7 @@ ; GFX8-NEXT: s_and_b32 s1, s2, s5 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, s5 -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_lshl_b32 s1, s3, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s4, 3 ; GFX8-NEXT: s_lshl_b32 s1, s1, 3 @@ -764,13 +765,14 @@ ; GFX8-NEXT: s_andn2_b32 s0, s0, s1 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v1, v0 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -792,8 +794,7 @@ ; GFX7-NEXT: s_and_b32 s1, s2, s5 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s3, s5 -; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_lshl_b32 s1, s3, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_and_b32 s1, s4, 3 ; GFX7-NEXT: s_lshl_b32 s1, s1, 3 @@ -811,8 +812,7 @@ ; GFX7-NEXT: v_and_b32_e32 v1, s5, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, s5, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -844,8 +844,7 @@ ; GFX9-NEXT: s_and_b32 s2, s3, s6 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s2, s5, s6 -; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshl_b32 s2, s5, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s2, s4, s6 ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v0, s2 @@ -853,11 +852,12 @@ ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX9-NEXT: v_and_or_b32 v0, s1, v0, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_or_b32 v1, v0, s6, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v2, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX9-NEXT: v_or3_b32 v2, v1, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -869,7 +869,7 @@ ; GFX8-NEXT: s_movk_i32 s5, 0xff ; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 8 +; GFX8-NEXT: v_mov_b32_e32 v3, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s1, s0, 8 ; GFX8-NEXT: s_and_b32 s1, s1, s5 @@ -881,8 +881,7 @@ ; GFX8-NEXT: s_and_b32 s1, s2, s5 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, s5 -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_lshl_b32 s1, s3, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s4, s5 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v0, s1 @@ -891,13 +890,14 @@ ; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v1, v0 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -920,8 +920,7 @@ ; GFX7-NEXT: s_and_b32 s1, s2, s5 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s3, s5 -; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_lshl_b32 s1, s3, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_and_b32 s1, s4, s5 ; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v0 @@ -939,8 +938,7 @@ ; GFX7-NEXT: v_and_b32_e32 v1, s5, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, s5, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -971,20 +969,20 @@ ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s2, s3, s5 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 -; GFX9-NEXT: s_or_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s2, s4, s5 ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s5 -; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_lshl_b32 s2, s4, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX9-NEXT: v_and_or_b32 v0, s1, v1, v0 ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v2, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX9-NEXT: v_or3_b32 v2, v1, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -1007,23 +1005,23 @@ ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s2, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, s4 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_lshl_b32 s1, s3, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: v_and_b32_e32 v1, s0, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v1, v0 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1047,8 +1045,7 @@ ; GFX7-NEXT: s_and_b32 s1, s2, s4 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s3, s4 -; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_lshl_b32 s1, s3, 24 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, s4, v1 ; GFX7-NEXT: s_or_b32 s0, s0, s1 @@ -1065,8 +1062,7 @@ ; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -1094,18 +1090,20 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v5, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v6, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v6, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v4 -; GFX9-NEXT: v_or3_b32 v0, v0, v5, v6 +; GFX9-NEXT: v_or3_b32 v0, v0, v6, v5 ; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v3, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v1 -; GFX9-NEXT: v_or3_b32 v2, v0, v3, v4 +; GFX9-NEXT: v_or3_b32 v2, v0, v4, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -1116,8 +1114,8 @@ ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: v_mov_b32_e32 v1, 8 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: s_and_b32 s1, s2, s0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX8-NEXT: v_lshlrev_b32_e64 v5, v2, s1 @@ -1127,20 +1125,22 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v7, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v8, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v8, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v7 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX8-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v0, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1168,7 +1168,6 @@ ; GFX7-NEXT: v_and_b32_e32 v4, s0, v4 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX7-NEXT: v_and_b32_e32 v5, s0, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 @@ -1183,7 +1182,6 @@ ; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -1211,18 +1209,20 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v5, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v5, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v3 -; GFX9-NEXT: v_or3_b32 v0, v0, v4, v5 +; GFX9-NEXT: v_or3_b32 v0, v0, v5, v4 ; GFX9-NEXT: v_and_or_b32 v0, v0, s2, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v3, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v1 -; GFX9-NEXT: v_or3_b32 v2, v0, v3, v4 +; GFX9-NEXT: v_or3_b32 v2, v0, v4, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -1244,20 +1244,22 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v6, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v7, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v7, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v0, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1285,7 +1287,6 @@ ; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX7-NEXT: v_and_b32_e32 v4, s0, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 @@ -1300,7 +1301,6 @@ ; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -1329,18 +1329,20 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v6, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v7, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v7, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v5 -; GFX9-NEXT: v_or3_b32 v0, v0, v6, v7 +; GFX9-NEXT: v_or3_b32 v0, v0, v7, v6 ; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v5, v0, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX9-NEXT: v_and_or_b32 v0, v0, v1, v2 -; GFX9-NEXT: v_or3_b32 v2, v0, v3, v5 +; GFX9-NEXT: v_or3_b32 v2, v0, v5, v3 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -1351,8 +1353,8 @@ ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX8-NEXT: v_mov_b32_e32 v6, s0 +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v3, s0 @@ -1362,20 +1364,22 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v8, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v6, v0, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v6, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v8 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 ; GFX8-NEXT: v_and_b32_e32 v0, v0, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_or_b32_e32 v2, v0, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1403,7 +1407,6 @@ ; GFX7-NEXT: v_and_b32_e32 v5, s2, v5 ; GFX7-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX7-NEXT: v_and_b32_e32 v6, s2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 @@ -1412,18 +1415,17 @@ ; GFX7-NEXT: v_and_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v2, v2, v1 -; GFX7-NEXT: v_and_b32_e32 v3, v3, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX7-NEXT: v_and_b32_e32 v1, v4, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1451,8 +1453,7 @@ ; GFX9-NEXT: s_and_b32 s2, s3, s10 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s2, s6, s10 -; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshl_b32 s2, s6, 24 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s2 ; GFX9-NEXT: s_and_b32 s2, s7, s10 @@ -1464,8 +1465,7 @@ ; GFX9-NEXT: s_and_b32 s2, s8, s10 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s2, s9, s10 -; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshl_b32 s2, s9, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_lshr_b32 s2, s5, 2 ; GFX9-NEXT: s_cmp_eq_u32 s2, 1 @@ -1491,8 +1491,7 @@ ; GFX9-NEXT: s_and_b32 s2, s3, s10 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s2, s4, s10 -; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshl_b32 s2, s4, 24 ; GFX9-NEXT: s_lshr_b32 s5, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s2 ; GFX9-NEXT: s_and_b32 s2, s5, s10 @@ -1504,8 +1503,7 @@ ; GFX9-NEXT: s_and_b32 s2, s6, s10 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s2, s7, s10 -; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshl_b32 s2, s7, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -1529,8 +1527,7 @@ ; GFX8-NEXT: s_and_b32 s2, s3, s10 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s6, s10 -; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_lshl_b32 s2, s6, 24 ; GFX8-NEXT: s_lshr_b32 s7, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s2, s7, s10 @@ -1542,8 +1539,7 @@ ; GFX8-NEXT: s_and_b32 s2, s8, s10 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s2, s9, s10 -; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_lshl_b32 s2, s9, 24 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: s_lshr_b32 s2, s5, 2 ; GFX8-NEXT: s_cmp_eq_u32 s2, 1 @@ -1569,8 +1565,7 @@ ; GFX8-NEXT: s_and_b32 s2, s3, s10 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s4, s10 -; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_lshl_b32 s2, s4, 24 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s2, s5, s10 @@ -1582,8 +1577,7 @@ ; GFX8-NEXT: s_and_b32 s2, s6, s10 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s2, s7, s10 -; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_lshl_b32 s2, s7, 24 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1605,8 +1599,7 @@ ; GFX7-NEXT: s_and_b32 s2, s3, s10 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s2, s6, s10 -; GFX7-NEXT: s_lshl_b32 s2, s2, 24 +; GFX7-NEXT: s_lshl_b32 s2, s6, 24 ; GFX7-NEXT: s_lshr_b32 s7, s1, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s2 ; GFX7-NEXT: s_and_b32 s2, s7, s10 @@ -1618,8 +1611,7 @@ ; GFX7-NEXT: s_and_b32 s2, s8, s10 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 ; GFX7-NEXT: s_or_b32 s1, s1, s2 -; GFX7-NEXT: s_and_b32 s2, s9, s10 -; GFX7-NEXT: s_lshl_b32 s2, s2, 24 +; GFX7-NEXT: s_lshl_b32 s2, s9, 24 ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: s_lshr_b32 s2, s5, 2 ; GFX7-NEXT: s_cmp_eq_u32 s2, 1 @@ -1645,8 +1637,7 @@ ; GFX7-NEXT: s_and_b32 s4, s5, s10 ; GFX7-NEXT: s_lshl_b32 s4, s4, 16 ; GFX7-NEXT: s_or_b32 s2, s2, s4 -; GFX7-NEXT: s_and_b32 s4, s6, s10 -; GFX7-NEXT: s_lshl_b32 s4, s4, 24 +; GFX7-NEXT: s_lshl_b32 s4, s6, 24 ; GFX7-NEXT: s_lshr_b32 s7, s3, 8 ; GFX7-NEXT: s_or_b32 s2, s2, s4 ; GFX7-NEXT: s_and_b32 s4, s7, s10 @@ -1658,8 +1649,7 @@ ; GFX7-NEXT: s_and_b32 s4, s8, s10 ; GFX7-NEXT: s_lshl_b32 s4, s4, 16 ; GFX7-NEXT: s_or_b32 s3, s3, s4 -; GFX7-NEXT: s_and_b32 s4, s9, s10 -; GFX7-NEXT: s_lshl_b32 s4, s4, 24 +; GFX7-NEXT: s_lshl_b32 s4, s9, 24 ; GFX7-NEXT: s_or_b32 s3, s3, s4 ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 @@ -1679,7 +1669,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: v_mov_b32_e32 v2, 8 ; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: s_lshr_b32 s1, s3, 2 ; GFX9-NEXT: s_and_b32 s3, s3, 3 @@ -1690,37 +1679,42 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 ; GFX9-NEXT: s_not_b32 s3, s3 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, 8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v6, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v7, v0, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v8, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v9, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v4 -; GFX9-NEXT: v_and_b32_sdwa v8, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v9, v1, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v5 -; GFX9-NEXT: v_or3_b32 v0, v0, v6, v7 -; GFX9-NEXT: v_or3_b32 v1, v1, v8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v6 +; GFX9-NEXT: v_or3_b32 v0, v0, v8, v5 +; GFX9-NEXT: v_or3_b32 v1, v1, v9, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GFX9-NEXT: v_and_or_b32 v3, v4, s3, v3 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v5, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v6, v0, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v7, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v8, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v3 -; GFX9-NEXT: v_and_b32_sdwa v7, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v8, v1, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_or3_b32 v0, v0, v5, v6 -; GFX9-NEXT: v_or3_b32 v1, v1, v7, v8 +; GFX9-NEXT: v_or3_b32 v0, v0, v7, v4 +; GFX9-NEXT: v_or3_b32 v1, v1, v8, v5 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm @@ -1742,19 +1736,21 @@ ; GFX8-NEXT: s_lshl_b32 s2, s2, s3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v7, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v8, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v9, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v9, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v10, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v9 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_and_b32_sdwa v10, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v8 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 @@ -1762,20 +1758,22 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v6, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v7, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v8, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v8, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v4, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v4, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v7 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -1810,8 +1808,6 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: v_and_b32_e32 v1, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v4 -; GFX7-NEXT: v_and_b32_e32 v7, s6, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 @@ -1839,11 +1835,9 @@ ; GFX7-NEXT: v_and_b32_e32 v3, s6, v3 ; GFX7-NEXT: v_and_b32_e32 v6, s6, v6 ; GFX7-NEXT: v_and_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: v_and_b32_e32 v1, s6, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v4 -; GFX7-NEXT: v_and_b32_e32 v7, s6, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 @@ -1881,8 +1875,7 @@ ; GFX9-NEXT: s_and_b32 s2, s3, s10 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s2, s6, s10 -; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshl_b32 s2, s6, 24 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s2 ; GFX9-NEXT: s_and_b32 s2, s7, s10 @@ -1894,8 +1887,7 @@ ; GFX9-NEXT: s_and_b32 s2, s8, s10 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s2, s9, s10 -; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshl_b32 s2, s9, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_lshr_b32 s2, s4, 2 ; GFX9-NEXT: s_cmp_eq_u32 s2, 1 @@ -1913,17 +1905,19 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX9-NEXT: v_and_or_b32 v2, v0, s10, v2 -; GFX9-NEXT: v_and_b32_sdwa v4, v0, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s10 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v2, v4, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX9-NEXT: v_or3_b32 v0, v2, v0, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX9-NEXT: v_and_or_b32 v2, v1, s10, v2 -; GFX9-NEXT: v_and_b32_sdwa v3, v1, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s10 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v1, v2, v3, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v5 +; GFX9-NEXT: v_or3_b32 v1, v2, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off @@ -1933,7 +1927,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX8-NEXT: s_movk_i32 s9, 0xff -; GFX8-NEXT: v_mov_b32_e32 v4, 8 +; GFX8-NEXT: v_mov_b32_e32 v6, 8 +; GFX8-NEXT: v_mov_b32_e32 v7, s9 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 ; GFX8-NEXT: s_and_b32 s2, s2, s9 @@ -1945,8 +1940,7 @@ ; GFX8-NEXT: s_and_b32 s2, s3, s9 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s5, s9 -; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_lshl_b32 s2, s5, 24 ; GFX8-NEXT: s_lshr_b32 s6, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s2, s6, s9 @@ -1958,8 +1952,7 @@ ; GFX8-NEXT: s_and_b32 s2, s7, s9 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s2, s8, s9 -; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_lshl_b32 s2, s8, 24 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: s_lshr_b32 s2, s4, 2 ; GFX8-NEXT: s_cmp_eq_u32 s2, 1 @@ -1978,21 +1971,21 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_mov_b32_e32 v4, s9 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX8-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX8-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -2014,8 +2007,7 @@ ; GFX7-NEXT: s_and_b32 s2, s3, s9 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s2, s5, s9 -; GFX7-NEXT: s_lshl_b32 s2, s2, 24 +; GFX7-NEXT: s_lshl_b32 s2, s5, 24 ; GFX7-NEXT: s_lshr_b32 s6, s1, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s2 ; GFX7-NEXT: s_and_b32 s2, s6, s9 @@ -2027,8 +2019,7 @@ ; GFX7-NEXT: s_and_b32 s2, s7, s9 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 ; GFX7-NEXT: s_or_b32 s1, s1, s2 -; GFX7-NEXT: s_and_b32 s2, s8, s9 -; GFX7-NEXT: s_lshl_b32 s2, s2, 24 +; GFX7-NEXT: s_lshl_b32 s2, s8, 24 ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: s_lshr_b32 s2, s4, 2 ; GFX7-NEXT: s_cmp_eq_u32 s2, 1 @@ -2055,8 +2046,7 @@ ; GFX7-NEXT: v_and_b32_e32 v2, s9, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s9, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v2, s9, v5 @@ -2068,8 +2058,7 @@ ; GFX7-NEXT: v_and_b32_e32 v2, s9, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s9, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v7 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -2101,8 +2090,7 @@ ; GFX9-NEXT: s_and_b32 s2, s3, s10 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s2, s6, s10 -; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshl_b32 s2, s6, 24 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s2 ; GFX9-NEXT: s_and_b32 s2, s7, s10 @@ -2114,8 +2102,7 @@ ; GFX9-NEXT: s_and_b32 s2, s8, s10 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s2, s9, s10 -; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshl_b32 s2, s9, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -2133,17 +2120,19 @@ ; GFX9-NEXT: s_mov_b32 s5, 8 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX9-NEXT: v_and_or_b32 v2, v0, s10, v2 -; GFX9-NEXT: v_and_b32_sdwa v4, v0, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s10 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v2, v4, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX9-NEXT: v_or3_b32 v0, v2, v0, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX9-NEXT: v_and_or_b32 v2, v1, s10, v2 -; GFX9-NEXT: v_and_b32_sdwa v3, v1, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s10 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v1, v2, v3, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v5 +; GFX9-NEXT: v_or3_b32 v1, v2, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off @@ -2167,8 +2156,7 @@ ; GFX8-NEXT: s_and_b32 s2, s3, s9 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s5, s9 -; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_lshl_b32 s2, s5, 24 ; GFX8-NEXT: s_lshr_b32 s6, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s2, s6, s9 @@ -2180,8 +2168,7 @@ ; GFX8-NEXT: s_and_b32 s2, s7, s9 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s2, s8, s9 -; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_lshl_b32 s2, s8, 24 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 @@ -2198,23 +2185,24 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_mov_b32_e32 v4, s9 -; GFX8-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v6, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v7, s9 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX8-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX8-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -2238,8 +2226,7 @@ ; GFX7-NEXT: s_and_b32 s2, s3, s9 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s2, s5, s9 -; GFX7-NEXT: s_lshl_b32 s2, s2, 24 +; GFX7-NEXT: s_lshl_b32 s2, s5, 24 ; GFX7-NEXT: s_lshr_b32 s6, s1, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s2 ; GFX7-NEXT: s_and_b32 s2, s6, s9 @@ -2251,8 +2238,7 @@ ; GFX7-NEXT: s_and_b32 s2, s7, s9 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 ; GFX7-NEXT: s_or_b32 s1, s1, s2 -; GFX7-NEXT: s_and_b32 s2, s8, s9 -; GFX7-NEXT: s_lshl_b32 s2, s2, 24 +; GFX7-NEXT: s_lshl_b32 s2, s8, 24 ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 @@ -2279,8 +2265,7 @@ ; GFX7-NEXT: v_and_b32_e32 v2, s9, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s9, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v2, s9, v5 @@ -2292,8 +2277,7 @@ ; GFX7-NEXT: v_and_b32_e32 v2, s9, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s9, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v7 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -2325,8 +2309,7 @@ ; GFX9-NEXT: s_and_b32 s2, s3, s9 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s2, s5, s9 -; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshl_b32 s2, s5, 24 ; GFX9-NEXT: s_lshr_b32 s6, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s2 ; GFX9-NEXT: s_and_b32 s2, s6, s9 @@ -2338,8 +2321,7 @@ ; GFX9-NEXT: s_and_b32 s2, s7, s9 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s2, s8, s9 -; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshl_b32 s2, s8, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s9 @@ -2356,17 +2338,19 @@ ; GFX9-NEXT: s_mov_b32 s4, 8 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX9-NEXT: v_and_or_b32 v2, v0, s9, v2 -; GFX9-NEXT: v_and_b32_sdwa v4, v0, s9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s9 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v2, v4, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX9-NEXT: v_or3_b32 v0, v2, v0, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX9-NEXT: v_and_or_b32 v2, v1, s9, v2 -; GFX9-NEXT: v_and_b32_sdwa v3, v1, s9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s9 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v1, v2, v3, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v5 +; GFX9-NEXT: v_or3_b32 v1, v2, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off @@ -2390,8 +2374,7 @@ ; GFX8-NEXT: s_and_b32 s2, s3, s8 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s4, s8 -; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_lshl_b32 s2, s4, 24 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s2, s5, s8 @@ -2403,8 +2386,7 @@ ; GFX8-NEXT: s_and_b32 s2, s6, s8 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s2, s7, s8 -; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_lshl_b32 s2, s7, 24 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s8 @@ -2420,23 +2402,24 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v6, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v7, s8 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX8-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX8-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -2460,8 +2443,7 @@ ; GFX7-NEXT: s_and_b32 s2, s3, s8 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s2, s4, s8 -; GFX7-NEXT: s_lshl_b32 s2, s2, 24 +; GFX7-NEXT: s_lshl_b32 s2, s4, 24 ; GFX7-NEXT: s_lshr_b32 s5, s1, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s2 ; GFX7-NEXT: s_and_b32 s2, s5, s8 @@ -2473,8 +2455,7 @@ ; GFX7-NEXT: s_and_b32 s2, s6, s8 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 ; GFX7-NEXT: s_or_b32 s1, s1, s2 -; GFX7-NEXT: s_and_b32 s2, s7, s8 -; GFX7-NEXT: s_lshl_b32 s2, s2, 24 +; GFX7-NEXT: s_lshl_b32 s2, s7, 24 ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: v_and_b32_e32 v0, s8, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 @@ -2501,8 +2482,7 @@ ; GFX7-NEXT: v_and_b32_e32 v2, s8, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s8, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v2, s8, v5 @@ -2514,8 +2494,7 @@ ; GFX7-NEXT: v_and_b32_e32 v2, s8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s8, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v7 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -2533,7 +2512,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 8 ; GFX9-NEXT: s_movk_i32 s3, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 2, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 @@ -2543,37 +2521,42 @@ ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v8, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v9, v0, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v10, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v11, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v6 -; GFX9-NEXT: v_and_b32_sdwa v10, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v11, v1, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v7 -; GFX9-NEXT: v_or3_b32 v0, v0, v8, v9 -; GFX9-NEXT: v_or3_b32 v1, v1, v10, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v8 +; GFX9-NEXT: v_or3_b32 v0, v0, v10, v7 +; GFX9-NEXT: v_or3_b32 v1, v1, v11, v9 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc ; GFX9-NEXT: v_and_or_b32 v2, v6, v2, v5 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v5, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v6, v0, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v7, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v8, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v2 -; GFX9-NEXT: v_and_b32_sdwa v7, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v8, v1, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_or3_b32 v0, v0, v5, v6 -; GFX9-NEXT: v_or3_b32 v1, v1, v7, v8 +; GFX9-NEXT: v_or3_b32 v0, v0, v7, v4 +; GFX9-NEXT: v_or3_b32 v1, v1, v8, v5 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm @@ -2596,39 +2579,43 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v10, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v11, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v9 +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v12, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v12, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v13, v1, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v12 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 +; GFX8-NEXT: v_and_b32_sdwa v13, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v11 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v13 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v10 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v2, v3, v2 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v6, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v7, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v8, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v8, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v5, v1, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v5, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v7 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -2665,8 +2652,6 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_and_b32_e32 v1, s3, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GFX7-NEXT: v_and_b32_e32 v7, s3, v7 -; GFX7-NEXT: v_and_b32_e32 v10, s3, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -2696,8 +2681,6 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: v_and_b32_e32 v1, s3, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX7-NEXT: v_and_b32_e32 v4, s3, v4 -; GFX7-NEXT: v_and_b32_e32 v7, s3, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 @@ -2721,7 +2704,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 8 ; GFX9-NEXT: s_movk_i32 s3, 0xff ; GFX9-NEXT: s_lshr_b32 s1, s2, 2 ; GFX9-NEXT: s_and_b32 s2, s2, 3 @@ -2730,37 +2712,42 @@ ; GFX9-NEXT: s_lshl_b32 s2, s3, s2 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 ; GFX9-NEXT: s_not_b32 s2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v6, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v7, v0, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v8, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v9, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v4 -; GFX9-NEXT: v_and_b32_sdwa v8, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v9, v1, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v5 -; GFX9-NEXT: v_or3_b32 v0, v0, v6, v7 -; GFX9-NEXT: v_or3_b32 v1, v1, v8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v6 +; GFX9-NEXT: v_or3_b32 v0, v0, v8, v5 +; GFX9-NEXT: v_or3_b32 v1, v1, v9, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GFX9-NEXT: v_and_or_b32 v2, v4, s2, v2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v5, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v6, v0, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v7, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v8, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v2 -; GFX9-NEXT: v_and_b32_sdwa v7, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v8, v1, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_or3_b32 v0, v0, v5, v6 -; GFX9-NEXT: v_or3_b32 v1, v1, v7, v8 +; GFX9-NEXT: v_or3_b32 v0, v0, v7, v4 +; GFX9-NEXT: v_or3_b32 v1, v1, v8, v5 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm @@ -2782,19 +2769,21 @@ ; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v8, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v9, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v7 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v10, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v10, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v11, v1, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v10 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX8-NEXT: v_and_b32_sdwa v11, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v9 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v11 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 @@ -2802,20 +2791,22 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v6, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v7, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v8, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v8, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v5, v1, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v5, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v7 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -2851,8 +2842,6 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_and_b32_e32 v1, s3, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX7-NEXT: v_and_b32_e32 v5, s3, v5 -; GFX7-NEXT: v_and_b32_e32 v8, s3, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 @@ -2883,8 +2872,6 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: v_and_b32_e32 v1, s3, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX7-NEXT: v_and_b32_e32 v4, s3, v4 -; GFX7-NEXT: v_and_b32_e32 v7, s3, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 @@ -2908,7 +2895,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: s_movk_i32 s1, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 2, v3 ; GFX9-NEXT: v_and_b32_e32 v3, 3, v3 @@ -2918,37 +2904,42 @@ ; GFX9-NEXT: v_lshlrev_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v7, s0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v9, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v10, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v11, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v12, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v7 -; GFX9-NEXT: v_and_b32_sdwa v11, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v12, v1, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v1, s1, v8 -; GFX9-NEXT: v_or3_b32 v0, v0, v9, v10 -; GFX9-NEXT: v_or3_b32 v1, v1, v11, v12 +; GFX9-NEXT: v_and_or_b32 v1, v1, s1, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX9-NEXT: v_or3_b32 v0, v0, v11, v8 +; GFX9-NEXT: v_or3_b32 v1, v1, v12, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc ; GFX9-NEXT: v_and_or_b32 v2, v7, v3, v2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v6, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v7, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v8, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v2 -; GFX9-NEXT: v_and_b32_sdwa v8, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v9, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v3 +; GFX9-NEXT: v_and_b32_sdwa v9, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v8, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_or3_b32 v0, v0, v6, v7 -; GFX9-NEXT: v_or3_b32 v1, v1, v8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v7 +; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v5 +; GFX9-NEXT: v_or3_b32 v1, v1, v9, v6 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm @@ -2971,39 +2962,43 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v8 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v11, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v12, v0, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v10 +; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v6, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v13, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v13, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v7, v1, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v13 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v7, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v12 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v3, v5, v3 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v7, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v8, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v8, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v4, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v4, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v7 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -3037,11 +3032,9 @@ ; GFX7-NEXT: v_and_b32_e32 v7, s0, v7 ; GFX7-NEXT: v_and_b32_e32 v10, s0, v10 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX7-NEXT: v_and_b32_e32 v8, s0, v8 -; GFX7-NEXT: v_and_b32_e32 v11, s0, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 @@ -3061,29 +3054,27 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_and_b32_e32 v6, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_and_b32_e32 v7, v7, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v4, v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX7-NEXT: v_and_b32_e32 v5, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v4, v8, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v8 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr @@ -3110,8 +3101,7 @@ ; GFX9-NEXT: s_and_b32 s6, s7, s18 ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s6 -; GFX9-NEXT: s_and_b32 s6, s8, s18 -; GFX9-NEXT: s_lshl_b32 s6, s6, 24 +; GFX9-NEXT: s_lshl_b32 s6, s8, 24 ; GFX9-NEXT: s_lshr_b32 s9, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s6 ; GFX9-NEXT: s_and_b32 s6, s9, s18 @@ -3123,8 +3113,7 @@ ; GFX9-NEXT: s_and_b32 s6, s10, s18 ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_or_b32 s1, s1, s6 -; GFX9-NEXT: s_and_b32 s6, s11, s18 -; GFX9-NEXT: s_lshl_b32 s6, s6, 24 +; GFX9-NEXT: s_lshl_b32 s6, s11, 24 ; GFX9-NEXT: s_lshr_b32 s12, s2, 8 ; GFX9-NEXT: s_or_b32 s1, s1, s6 ; GFX9-NEXT: s_and_b32 s6, s12, s18 @@ -3136,8 +3125,7 @@ ; GFX9-NEXT: s_and_b32 s6, s13, s18 ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_or_b32 s2, s2, s6 -; GFX9-NEXT: s_and_b32 s6, s14, s18 -; GFX9-NEXT: s_lshl_b32 s6, s6, 24 +; GFX9-NEXT: s_lshl_b32 s6, s14, 24 ; GFX9-NEXT: s_lshr_b32 s15, s3, 8 ; GFX9-NEXT: s_or_b32 s2, s2, s6 ; GFX9-NEXT: s_and_b32 s6, s15, s18 @@ -3149,8 +3137,7 @@ ; GFX9-NEXT: s_and_b32 s6, s16, s18 ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_or_b32 s3, s3, s6 -; GFX9-NEXT: s_and_b32 s6, s17, s18 -; GFX9-NEXT: s_lshl_b32 s6, s6, 24 +; GFX9-NEXT: s_lshl_b32 s6, s17, 24 ; GFX9-NEXT: s_or_b32 s3, s3, s6 ; GFX9-NEXT: s_lshr_b32 s6, s5, 2 ; GFX9-NEXT: s_cmp_eq_u32 s6, 1 @@ -3184,8 +3171,7 @@ ; GFX9-NEXT: s_and_b32 s4, s5, s18 ; GFX9-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s4 -; GFX9-NEXT: s_and_b32 s4, s6, s18 -; GFX9-NEXT: s_lshl_b32 s4, s4, 24 +; GFX9-NEXT: s_lshl_b32 s4, s6, 24 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s4 ; GFX9-NEXT: s_and_b32 s4, s7, s18 @@ -3197,8 +3183,7 @@ ; GFX9-NEXT: s_and_b32 s4, s8, s18 ; GFX9-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-NEXT: s_or_b32 s1, s1, s4 -; GFX9-NEXT: s_and_b32 s4, s9, s18 -; GFX9-NEXT: s_lshl_b32 s4, s4, 24 +; GFX9-NEXT: s_lshl_b32 s4, s9, 24 ; GFX9-NEXT: s_lshr_b32 s10, s2, 8 ; GFX9-NEXT: s_or_b32 s1, s1, s4 ; GFX9-NEXT: s_and_b32 s4, s10, s18 @@ -3210,8 +3195,7 @@ ; GFX9-NEXT: s_and_b32 s4, s11, s18 ; GFX9-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-NEXT: s_or_b32 s2, s2, s4 -; GFX9-NEXT: s_and_b32 s4, s12, s18 -; GFX9-NEXT: s_lshl_b32 s4, s4, 24 +; GFX9-NEXT: s_lshl_b32 s4, s12, 24 ; GFX9-NEXT: s_lshr_b32 s13, s3, 8 ; GFX9-NEXT: s_or_b32 s2, s2, s4 ; GFX9-NEXT: s_and_b32 s4, s13, s18 @@ -3223,8 +3207,7 @@ ; GFX9-NEXT: s_and_b32 s4, s14, s18 ; GFX9-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-NEXT: s_or_b32 s3, s3, s4 -; GFX9-NEXT: s_and_b32 s4, s15, s18 -; GFX9-NEXT: s_lshl_b32 s4, s4, 24 +; GFX9-NEXT: s_lshl_b32 s4, s15, 24 ; GFX9-NEXT: s_or_b32 s3, s3, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -3250,8 +3233,7 @@ ; GFX8-NEXT: s_and_b32 s6, s7, s18 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s6 -; GFX8-NEXT: s_and_b32 s6, s8, s18 -; GFX8-NEXT: s_lshl_b32 s6, s6, 24 +; GFX8-NEXT: s_lshl_b32 s6, s8, 24 ; GFX8-NEXT: s_lshr_b32 s9, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s6 ; GFX8-NEXT: s_and_b32 s6, s9, s18 @@ -3263,8 +3245,7 @@ ; GFX8-NEXT: s_and_b32 s6, s10, s18 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s6 -; GFX8-NEXT: s_and_b32 s6, s11, s18 -; GFX8-NEXT: s_lshl_b32 s6, s6, 24 +; GFX8-NEXT: s_lshl_b32 s6, s11, 24 ; GFX8-NEXT: s_lshr_b32 s12, s2, 8 ; GFX8-NEXT: s_or_b32 s1, s1, s6 ; GFX8-NEXT: s_and_b32 s6, s12, s18 @@ -3276,8 +3257,7 @@ ; GFX8-NEXT: s_and_b32 s6, s13, s18 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: s_or_b32 s2, s2, s6 -; GFX8-NEXT: s_and_b32 s6, s14, s18 -; GFX8-NEXT: s_lshl_b32 s6, s6, 24 +; GFX8-NEXT: s_lshl_b32 s6, s14, 24 ; GFX8-NEXT: s_lshr_b32 s15, s3, 8 ; GFX8-NEXT: s_or_b32 s2, s2, s6 ; GFX8-NEXT: s_and_b32 s6, s15, s18 @@ -3289,8 +3269,7 @@ ; GFX8-NEXT: s_and_b32 s6, s16, s18 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: s_or_b32 s3, s3, s6 -; GFX8-NEXT: s_and_b32 s6, s17, s18 -; GFX8-NEXT: s_lshl_b32 s6, s6, 24 +; GFX8-NEXT: s_lshl_b32 s6, s17, 24 ; GFX8-NEXT: s_or_b32 s3, s3, s6 ; GFX8-NEXT: s_lshr_b32 s6, s5, 2 ; GFX8-NEXT: s_cmp_eq_u32 s6, 1 @@ -3324,8 +3303,7 @@ ; GFX8-NEXT: s_and_b32 s4, s5, s18 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s4 -; GFX8-NEXT: s_and_b32 s4, s6, s18 -; GFX8-NEXT: s_lshl_b32 s4, s4, 24 +; GFX8-NEXT: s_lshl_b32 s4, s6, 24 ; GFX8-NEXT: s_lshr_b32 s7, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s4 ; GFX8-NEXT: s_and_b32 s4, s7, s18 @@ -3337,8 +3315,7 @@ ; GFX8-NEXT: s_and_b32 s4, s8, s18 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NEXT: s_and_b32 s4, s9, s18 -; GFX8-NEXT: s_lshl_b32 s4, s4, 24 +; GFX8-NEXT: s_lshl_b32 s4, s9, 24 ; GFX8-NEXT: s_lshr_b32 s10, s2, 8 ; GFX8-NEXT: s_or_b32 s1, s1, s4 ; GFX8-NEXT: s_and_b32 s4, s10, s18 @@ -3350,8 +3327,7 @@ ; GFX8-NEXT: s_and_b32 s4, s11, s18 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: s_and_b32 s4, s12, s18 -; GFX8-NEXT: s_lshl_b32 s4, s4, 24 +; GFX8-NEXT: s_lshl_b32 s4, s12, 24 ; GFX8-NEXT: s_lshr_b32 s13, s3, 8 ; GFX8-NEXT: s_or_b32 s2, s2, s4 ; GFX8-NEXT: s_and_b32 s4, s13, s18 @@ -3363,8 +3339,7 @@ ; GFX8-NEXT: s_and_b32 s4, s14, s18 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_or_b32 s3, s3, s4 -; GFX8-NEXT: s_and_b32 s4, s15, s18 -; GFX8-NEXT: s_lshl_b32 s4, s4, 24 +; GFX8-NEXT: s_lshl_b32 s4, s15, 24 ; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -3388,8 +3363,7 @@ ; GFX7-NEXT: s_and_b32 s6, s7, s18 ; GFX7-NEXT: s_lshl_b32 s6, s6, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s6 -; GFX7-NEXT: s_and_b32 s6, s8, s18 -; GFX7-NEXT: s_lshl_b32 s6, s6, 24 +; GFX7-NEXT: s_lshl_b32 s6, s8, 24 ; GFX7-NEXT: s_lshr_b32 s9, s1, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s6 ; GFX7-NEXT: s_and_b32 s6, s9, s18 @@ -3401,8 +3375,7 @@ ; GFX7-NEXT: s_and_b32 s6, s10, s18 ; GFX7-NEXT: s_lshl_b32 s6, s6, 16 ; GFX7-NEXT: s_or_b32 s1, s1, s6 -; GFX7-NEXT: s_and_b32 s6, s11, s18 -; GFX7-NEXT: s_lshl_b32 s6, s6, 24 +; GFX7-NEXT: s_lshl_b32 s6, s11, 24 ; GFX7-NEXT: s_lshr_b32 s12, s2, 8 ; GFX7-NEXT: s_or_b32 s1, s1, s6 ; GFX7-NEXT: s_and_b32 s6, s12, s18 @@ -3414,8 +3387,7 @@ ; GFX7-NEXT: s_and_b32 s6, s13, s18 ; GFX7-NEXT: s_lshl_b32 s6, s6, 16 ; GFX7-NEXT: s_or_b32 s2, s2, s6 -; GFX7-NEXT: s_and_b32 s6, s14, s18 -; GFX7-NEXT: s_lshl_b32 s6, s6, 24 +; GFX7-NEXT: s_lshl_b32 s6, s14, 24 ; GFX7-NEXT: s_lshr_b32 s15, s3, 8 ; GFX7-NEXT: s_or_b32 s2, s2, s6 ; GFX7-NEXT: s_and_b32 s6, s15, s18 @@ -3427,8 +3399,7 @@ ; GFX7-NEXT: s_and_b32 s6, s16, s18 ; GFX7-NEXT: s_lshl_b32 s6, s6, 16 ; GFX7-NEXT: s_or_b32 s3, s3, s6 -; GFX7-NEXT: s_and_b32 s6, s17, s18 -; GFX7-NEXT: s_lshl_b32 s6, s6, 24 +; GFX7-NEXT: s_lshl_b32 s6, s17, 24 ; GFX7-NEXT: s_or_b32 s3, s3, s6 ; GFX7-NEXT: s_lshr_b32 s6, s5, 2 ; GFX7-NEXT: s_cmp_eq_u32 s6, 1 @@ -3460,12 +3431,11 @@ ; GFX7-NEXT: s_lshl_b32 s4, s4, 8 ; GFX7-NEXT: s_or_b32 s4, s5, s4 ; GFX7-NEXT: s_and_b32 s5, s6, s18 -; GFX7-NEXT: s_lshl_b32 s5, s5, 16 ; GFX7-NEXT: s_lshr_b32 s9, s7, 8 -; GFX7-NEXT: s_or_b32 s4, s4, s5 -; GFX7-NEXT: s_and_b32 s5, s8, s18 +; GFX7-NEXT: s_lshl_b32 s5, s5, 16 ; GFX7-NEXT: s_and_b32 s6, s9, s18 -; GFX7-NEXT: s_lshl_b32 s5, s5, 24 +; GFX7-NEXT: s_or_b32 s4, s4, s5 +; GFX7-NEXT: s_lshl_b32 s5, s8, 24 ; GFX7-NEXT: s_or_b32 s4, s4, s5 ; GFX7-NEXT: s_lshr_b32 s10, s7, 16 ; GFX7-NEXT: s_and_b32 s5, s7, s18 @@ -3475,8 +3445,7 @@ ; GFX7-NEXT: s_lshl_b32 s6, s6, 16 ; GFX7-NEXT: s_lshr_b32 s11, s7, 24 ; GFX7-NEXT: s_or_b32 s5, s5, s6 -; GFX7-NEXT: s_and_b32 s6, s11, s18 -; GFX7-NEXT: s_lshl_b32 s6, s6, 24 +; GFX7-NEXT: s_lshl_b32 s6, s11, 24 ; GFX7-NEXT: s_lshr_b32 s12, s2, 8 ; GFX7-NEXT: s_or_b32 s5, s5, s6 ; GFX7-NEXT: s_and_b32 s6, s12, s18 @@ -3488,8 +3457,7 @@ ; GFX7-NEXT: s_and_b32 s6, s13, s18 ; GFX7-NEXT: s_lshl_b32 s6, s6, 16 ; GFX7-NEXT: s_or_b32 s2, s2, s6 -; GFX7-NEXT: s_and_b32 s6, s14, s18 -; GFX7-NEXT: s_lshl_b32 s6, s6, 24 +; GFX7-NEXT: s_lshl_b32 s6, s14, 24 ; GFX7-NEXT: s_lshr_b32 s15, s3, 8 ; GFX7-NEXT: s_or_b32 s6, s2, s6 ; GFX7-NEXT: s_lshr_b32 s16, s3, 16 @@ -3501,8 +3469,7 @@ ; GFX7-NEXT: s_and_b32 s3, s16, s18 ; GFX7-NEXT: s_lshl_b32 s3, s3, 16 ; GFX7-NEXT: s_or_b32 s2, s2, s3 -; GFX7-NEXT: s_and_b32 s3, s17, s18 -; GFX7-NEXT: s_lshl_b32 s3, s3, 24 +; GFX7-NEXT: s_lshl_b32 s3, s17, 24 ; GFX7-NEXT: s_or_b32 s7, s2, s3 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 @@ -3538,31 +3505,35 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v3 -; GFX9-NEXT: v_and_b32_sdwa v10, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v11, v0, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v0, v0, s6, v6 -; GFX9-NEXT: v_and_b32_sdwa v12, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v13, v1, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v7 ; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v14, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v15, v2, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v8 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or3_b32 v0, v0, v10, v11 -; GFX9-NEXT: v_or3_b32 v1, v1, v12, v13 -; GFX9-NEXT: v_and_b32_sdwa v16, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v17, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v9 -; GFX9-NEXT: v_or3_b32 v2, v2, v14, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 8, v3 +; GFX9-NEXT: v_and_b32_sdwa v14, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v15, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_and_or_b32 v0, v0, s6, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v8 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 24, v3 +; GFX9-NEXT: v_and_b32_sdwa v16, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v10 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or3_b32 v0, v0, v14, v7 +; GFX9-NEXT: v_or3_b32 v1, v1, v15, v9 +; GFX9-NEXT: v_and_b32_sdwa v17, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v12 +; GFX9-NEXT: v_or3_b32 v2, v2, v16, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX9-NEXT: v_or3_b32 v3, v3, v16, v17 +; GFX9-NEXT: v_or3_b32 v3, v3, v17, v13 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[2:3] ; GFX9-NEXT: v_and_or_b32 v5, v6, s5, v5 @@ -3572,30 +3543,34 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v9, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v10, v0, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v13, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v14, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v15, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v16, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX9-NEXT: v_and_or_b32 v0, v0, s6, v5 -; GFX9-NEXT: v_and_b32_sdwa v11, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v12, v1, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v6 -; GFX9-NEXT: v_and_b32_sdwa v13, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v14, v2, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v7 -; GFX9-NEXT: v_and_b32_sdwa v15, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v16, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v12 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_or3_b32 v0, v0, v9, v10 -; GFX9-NEXT: v_or3_b32 v1, v1, v11, v12 -; GFX9-NEXT: v_or3_b32 v2, v2, v13, v14 -; GFX9-NEXT: v_or3_b32 v3, v3, v15, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v9 +; GFX9-NEXT: v_or3_b32 v0, v0, v13, v6 +; GFX9-NEXT: v_or3_b32 v1, v1, v14, v8 +; GFX9-NEXT: v_or3_b32 v2, v2, v15, v10 +; GFX9-NEXT: v_or3_b32 v3, v3, v16, v11 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm @@ -3619,34 +3594,38 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v11, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v12, v0, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v2 +; GFX8-NEXT: v_and_b32_sdwa v15, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v13, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v14, v1, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v13 -; GFX8-NEXT: v_and_b32_sdwa v15, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v16, v2, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v15 -; GFX8-NEXT: v_and_b32_sdwa v17, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v18, v3, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v14 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v17 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX8-NEXT: v_and_b32_sdwa v16, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v15 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v16 +; GFX8-NEXT: v_and_b32_sdwa v17, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v12 +; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 24, v3 +; GFX8-NEXT: v_and_b32_sdwa v18, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 24, v14 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v18 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX8-NEXT: v_or_b32_e32 v3, v3, v13 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3] ; GFX8-NEXT: v_and_b32_e32 v4, s6, v4 @@ -3657,34 +3636,38 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[2:3] ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v10, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v11, v0, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v3 +; GFX8-NEXT: v_and_b32_sdwa v14, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v12, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v13, v1, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v14, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v15, v2, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v16, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v6, v3, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v15, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v16, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v6, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v12 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v14 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v16 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v13 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v13 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v11 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v12 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -3728,8 +3711,6 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 24, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v6, s6, v6 -; GFX7-NEXT: v_and_b32_e32 v9, s6, v9 ; GFX7-NEXT: v_and_b32_e32 v11, s6, v11 ; GFX7-NEXT: v_and_b32_e32 v13, s6, v13 ; GFX7-NEXT: v_and_b32_e32 v2, s6, v2 @@ -3739,7 +3720,6 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 24, v3 -; GFX7-NEXT: v_and_b32_e32 v12, s6, v12 ; GFX7-NEXT: v_and_b32_e32 v14, s6, v14 ; GFX7-NEXT: v_and_b32_e32 v3, s6, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 @@ -3749,7 +3729,6 @@ ; GFX7-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v10 -; GFX7-NEXT: v_and_b32_e32 v15, s6, v15 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 24, v12 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 @@ -3774,16 +3753,10 @@ ; GFX7-NEXT: v_and_b32_e32 v4, s6, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s6, v7 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v1 @@ -3793,8 +3766,7 @@ ; GFX7-NEXT: v_and_b32_e32 v4, s6, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v9 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 8, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s6, v10 @@ -3806,8 +3778,7 @@ ; GFX7-NEXT: v_and_b32_e32 v4, s6, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v12 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 8, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s6, v13 @@ -3815,12 +3786,16 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 24, v3 ; GFX7-NEXT: v_and_b32_e32 v3, s6, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_and_b32_e32 v5, s6, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s6, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v15 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm @@ -3848,8 +3823,7 @@ ; GFX9-NEXT: s_and_b32 s6, s7, s18 ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s6 -; GFX9-NEXT: s_and_b32 s6, s8, s18 -; GFX9-NEXT: s_lshl_b32 s6, s6, 24 +; GFX9-NEXT: s_lshl_b32 s6, s8, 24 ; GFX9-NEXT: s_lshr_b32 s9, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s6 ; GFX9-NEXT: s_and_b32 s6, s9, s18 @@ -3861,8 +3835,7 @@ ; GFX9-NEXT: s_and_b32 s6, s10, s18 ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_or_b32 s1, s1, s6 -; GFX9-NEXT: s_and_b32 s6, s11, s18 -; GFX9-NEXT: s_lshl_b32 s6, s6, 24 +; GFX9-NEXT: s_lshl_b32 s6, s11, 24 ; GFX9-NEXT: s_lshr_b32 s12, s2, 8 ; GFX9-NEXT: s_or_b32 s1, s1, s6 ; GFX9-NEXT: s_and_b32 s6, s12, s18 @@ -3874,8 +3847,7 @@ ; GFX9-NEXT: s_and_b32 s6, s13, s18 ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_or_b32 s2, s2, s6 -; GFX9-NEXT: s_and_b32 s6, s14, s18 -; GFX9-NEXT: s_lshl_b32 s6, s6, 24 +; GFX9-NEXT: s_lshl_b32 s6, s14, 24 ; GFX9-NEXT: s_lshr_b32 s15, s3, 8 ; GFX9-NEXT: s_or_b32 s2, s2, s6 ; GFX9-NEXT: s_and_b32 s6, s15, s18 @@ -3887,8 +3859,7 @@ ; GFX9-NEXT: s_and_b32 s6, s16, s18 ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_or_b32 s3, s3, s6 -; GFX9-NEXT: s_and_b32 s6, s17, s18 -; GFX9-NEXT: s_lshl_b32 s6, s6, 24 +; GFX9-NEXT: s_lshl_b32 s6, s17, 24 ; GFX9-NEXT: s_or_b32 s3, s3, s6 ; GFX9-NEXT: s_lshr_b32 s6, s4, 2 ; GFX9-NEXT: s_cmp_eq_u32 s6, 1 @@ -3916,30 +3887,34 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s6, 3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_or_b32 v4, v0, s18, v4 -; GFX9-NEXT: v_and_b32_sdwa v8, v0, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v4, v8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_or3_b32 v0, v4, v0, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX9-NEXT: v_and_or_b32 v5, v1, s18, v5 -; GFX9-NEXT: v_and_b32_sdwa v8, v1, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v1, v5, v8, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_and_or_b32 v5, v1, s18, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v7 +; GFX9-NEXT: v_or3_b32 v1, v5, v1, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_or_b32 v5, v2, s18, v5 -; GFX9-NEXT: v_and_b32_sdwa v6, v2, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v2, v2, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v2, v5, v6, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v3 +; GFX9-NEXT: v_and_b32_sdwa v2, v2, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v9 +; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6 ; GFX9-NEXT: v_and_or_b32 v4, v3, s18, v4 -; GFX9-NEXT: v_and_b32_sdwa v5, v3, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v3, v3, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v3, v4, v5, v3 +; GFX9-NEXT: v_and_b32_sdwa v3, v3, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v11 +; GFX9-NEXT: v_or3_b32 v3, v4, v3, v5 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off @@ -3949,7 +3924,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX8-NEXT: s_movk_i32 s17, 0xff -; GFX8-NEXT: v_mov_b32_e32 v8, 8 +; GFX8-NEXT: v_mov_b32_e32 v12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s5, s0, 8 ; GFX8-NEXT: s_and_b32 s5, s5, s17 @@ -3961,8 +3936,7 @@ ; GFX8-NEXT: s_and_b32 s5, s6, s17 ; GFX8-NEXT: s_lshl_b32 s5, s5, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s5 -; GFX8-NEXT: s_and_b32 s5, s7, s17 -; GFX8-NEXT: s_lshl_b32 s5, s5, 24 +; GFX8-NEXT: s_lshl_b32 s5, s7, 24 ; GFX8-NEXT: s_lshr_b32 s8, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s5 ; GFX8-NEXT: s_and_b32 s5, s8, s17 @@ -3974,8 +3948,7 @@ ; GFX8-NEXT: s_and_b32 s5, s9, s17 ; GFX8-NEXT: s_lshl_b32 s5, s5, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s5 -; GFX8-NEXT: s_and_b32 s5, s10, s17 -; GFX8-NEXT: s_lshl_b32 s5, s5, 24 +; GFX8-NEXT: s_lshl_b32 s5, s10, 24 ; GFX8-NEXT: s_lshr_b32 s11, s2, 8 ; GFX8-NEXT: s_or_b32 s1, s1, s5 ; GFX8-NEXT: s_and_b32 s5, s11, s17 @@ -3987,8 +3960,7 @@ ; GFX8-NEXT: s_and_b32 s5, s12, s17 ; GFX8-NEXT: s_lshl_b32 s5, s5, 16 ; GFX8-NEXT: s_or_b32 s2, s2, s5 -; GFX8-NEXT: s_and_b32 s5, s13, s17 -; GFX8-NEXT: s_lshl_b32 s5, s5, 24 +; GFX8-NEXT: s_lshl_b32 s5, s13, 24 ; GFX8-NEXT: s_lshr_b32 s14, s3, 8 ; GFX8-NEXT: s_or_b32 s2, s2, s5 ; GFX8-NEXT: s_and_b32 s5, s14, s17 @@ -4000,8 +3972,7 @@ ; GFX8-NEXT: s_and_b32 s5, s15, s17 ; GFX8-NEXT: s_lshl_b32 s5, s5, 16 ; GFX8-NEXT: s_or_b32 s3, s3, s5 -; GFX8-NEXT: s_and_b32 s5, s16, s17 -; GFX8-NEXT: s_lshl_b32 s5, s5, 24 +; GFX8-NEXT: s_lshl_b32 s5, s16, 24 ; GFX8-NEXT: s_or_b32 s3, s3, s5 ; GFX8-NEXT: s_lshr_b32 s5, s4, 2 ; GFX8-NEXT: s_cmp_eq_u32 s5, 1 @@ -4030,35 +4001,39 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_mov_b32_e32 v8, s17 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v12, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v12, s17 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX8-NEXT: v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v9, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v4, v4, v9 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX8-NEXT: v_or_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v9, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v9 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX8-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v6, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX8-NEXT: v_and_b32_sdwa v2, v2, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_and_b32_sdwa v2, v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3 ; GFX8-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v5, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX8-NEXT: v_and_b32_sdwa v3, v3, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v3, v3, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v9 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v11 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -4080,8 +4055,7 @@ ; GFX7-NEXT: s_and_b32 s5, s6, s17 ; GFX7-NEXT: s_lshl_b32 s5, s5, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s5 -; GFX7-NEXT: s_and_b32 s5, s7, s17 -; GFX7-NEXT: s_lshl_b32 s5, s5, 24 +; GFX7-NEXT: s_lshl_b32 s5, s7, 24 ; GFX7-NEXT: s_lshr_b32 s8, s1, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s5 ; GFX7-NEXT: s_and_b32 s5, s8, s17 @@ -4093,8 +4067,7 @@ ; GFX7-NEXT: s_and_b32 s5, s9, s17 ; GFX7-NEXT: s_lshl_b32 s5, s5, 16 ; GFX7-NEXT: s_or_b32 s1, s1, s5 -; GFX7-NEXT: s_and_b32 s5, s10, s17 -; GFX7-NEXT: s_lshl_b32 s5, s5, 24 +; GFX7-NEXT: s_lshl_b32 s5, s10, 24 ; GFX7-NEXT: s_lshr_b32 s11, s2, 8 ; GFX7-NEXT: s_or_b32 s1, s1, s5 ; GFX7-NEXT: s_and_b32 s5, s11, s17 @@ -4106,8 +4079,7 @@ ; GFX7-NEXT: s_and_b32 s5, s12, s17 ; GFX7-NEXT: s_lshl_b32 s5, s5, 16 ; GFX7-NEXT: s_or_b32 s2, s2, s5 -; GFX7-NEXT: s_and_b32 s5, s13, s17 -; GFX7-NEXT: s_lshl_b32 s5, s5, 24 +; GFX7-NEXT: s_lshl_b32 s5, s13, 24 ; GFX7-NEXT: s_lshr_b32 s14, s3, 8 ; GFX7-NEXT: s_or_b32 s2, s2, s5 ; GFX7-NEXT: s_and_b32 s5, s14, s17 @@ -4119,8 +4091,7 @@ ; GFX7-NEXT: s_and_b32 s5, s15, s17 ; GFX7-NEXT: s_lshl_b32 s5, s5, 16 ; GFX7-NEXT: s_or_b32 s3, s3, s5 -; GFX7-NEXT: s_and_b32 s5, s16, s17 -; GFX7-NEXT: s_lshl_b32 s5, s5, 24 +; GFX7-NEXT: s_lshl_b32 s5, s16, 24 ; GFX7-NEXT: s_or_b32 s3, s3, s5 ; GFX7-NEXT: s_lshr_b32 s5, s4, 2 ; GFX7-NEXT: s_cmp_eq_u32 s5, 1 @@ -4157,8 +4128,7 @@ ; GFX7-NEXT: v_and_b32_e32 v4, s17, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s17, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s17, v7 @@ -4170,8 +4140,7 @@ ; GFX7-NEXT: v_and_b32_e32 v4, s17, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s17, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v9 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 8, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s17, v10 @@ -4183,8 +4152,7 @@ ; GFX7-NEXT: v_and_b32_e32 v4, s17, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s17, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v12 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 8, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s17, v13 @@ -4196,8 +4164,7 @@ ; GFX7-NEXT: v_and_b32_e32 v4, s17, v14 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s17, v15 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v15 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -4229,8 +4196,7 @@ ; GFX9-NEXT: s_and_b32 s5, s7, s18 ; GFX9-NEXT: s_lshl_b32 s5, s5, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s5 -; GFX9-NEXT: s_and_b32 s5, s8, s18 -; GFX9-NEXT: s_lshl_b32 s5, s5, 24 +; GFX9-NEXT: s_lshl_b32 s5, s8, 24 ; GFX9-NEXT: s_lshr_b32 s9, s1, 8 ; GFX9-NEXT: s_or_b32 s8, s0, s5 ; GFX9-NEXT: s_lshr_b32 s10, s1, 16 @@ -4242,8 +4208,7 @@ ; GFX9-NEXT: s_and_b32 s1, s10, s18 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s11, s18 -; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: s_lshl_b32 s1, s11, 24 ; GFX9-NEXT: s_lshr_b32 s12, s2, 8 ; GFX9-NEXT: s_or_b32 s9, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s12, s18 @@ -4255,8 +4220,7 @@ ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_lshr_b32 s14, s2, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s14, s18 -; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: s_lshl_b32 s1, s14, 24 ; GFX9-NEXT: s_lshr_b32 s15, s3, 8 ; GFX9-NEXT: s_or_b32 s10, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s15, s18 @@ -4268,8 +4232,7 @@ ; GFX9-NEXT: s_lshr_b32 s17, s3, 24 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s17, s18 -; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: s_lshl_b32 s1, s17, 24 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: v_mov_b32_e32 v2, s9 ; GFX9-NEXT: s_or_b32 s11, s0, s1 @@ -4289,38 +4252,42 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] -; GFX9-NEXT: s_mov_b32 s6, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: s_mov_b32 s6, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_and_or_b32 v4, v0, s18, v4 -; GFX9-NEXT: v_and_b32_sdwa v8, v0, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v4, v8, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v4, v0, s18, v4 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_or3_b32 v0, v4, v0, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX9-NEXT: v_and_or_b32 v5, v1, s18, v5 -; GFX9-NEXT: v_and_b32_sdwa v8, v1, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v1, v5, v8, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_and_or_b32 v5, v1, s18, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v7 +; GFX9-NEXT: v_or3_b32 v1, v5, v1, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_or_b32 v5, v2, s18, v5 -; GFX9-NEXT: v_and_b32_sdwa v6, v2, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v2, v2, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v2, v5, v6, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v3 +; GFX9-NEXT: v_and_b32_sdwa v2, v2, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v9 +; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6 ; GFX9-NEXT: v_and_or_b32 v4, v3, s18, v4 -; GFX9-NEXT: v_and_b32_sdwa v5, v3, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v3, v3, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v3, v4, v5, v3 +; GFX9-NEXT: v_and_b32_sdwa v3, v3, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v11 +; GFX9-NEXT: v_or3_b32 v3, v4, v3, v5 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off @@ -4344,8 +4311,7 @@ ; GFX8-NEXT: s_and_b32 s5, s6, s18 ; GFX8-NEXT: s_lshl_b32 s5, s5, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s5 -; GFX8-NEXT: s_and_b32 s5, s7, s18 -; GFX8-NEXT: s_lshl_b32 s5, s5, 24 +; GFX8-NEXT: s_lshl_b32 s5, s7, 24 ; GFX8-NEXT: s_lshr_b32 s9, s1, 8 ; GFX8-NEXT: s_or_b32 s8, s0, s5 ; GFX8-NEXT: s_lshr_b32 s10, s1, 16 @@ -4357,8 +4323,7 @@ ; GFX8-NEXT: s_and_b32 s1, s10, s18 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s11, s18 -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_lshl_b32 s1, s11, 24 ; GFX8-NEXT: s_lshr_b32 s12, s2, 8 ; GFX8-NEXT: s_or_b32 s9, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s12, s18 @@ -4370,8 +4335,7 @@ ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_lshr_b32 s14, s2, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s14, s18 -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_lshl_b32 s1, s14, 24 ; GFX8-NEXT: s_lshr_b32 s15, s3, 8 ; GFX8-NEXT: s_or_b32 s10, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s15, s18 @@ -4383,8 +4347,7 @@ ; GFX8-NEXT: s_lshr_b32 s17, s3, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s17, s18 -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_lshl_b32 s1, s17, 24 ; GFX8-NEXT: v_mov_b32_e32 v1, s8 ; GFX8-NEXT: v_mov_b32_e32 v2, s9 ; GFX8-NEXT: s_or_b32 s11, s0, s1 @@ -4406,42 +4369,46 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX8-NEXT: v_mov_b32_e32 v8, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_mov_b32_e32 v8, s18 +; GFX8-NEXT: v_mov_b32_e32 v12, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v12, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v9, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v2, s10 ; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: v_mov_b32_e32 v12, s18 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX8-NEXT: v_or_b32_e32 v4, v4, v9 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX8-NEXT: v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX8-NEXT: v_or_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v9, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v9 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX8-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v6, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX8-NEXT: v_and_b32_sdwa v2, v2, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_and_b32_sdwa v2, v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3 ; GFX8-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v5, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX8-NEXT: v_and_b32_sdwa v3, v3, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v3, v3, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v9 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v11 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -4465,8 +4432,7 @@ ; GFX7-NEXT: s_and_b32 s5, s6, s18 ; GFX7-NEXT: s_lshl_b32 s5, s5, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s5 -; GFX7-NEXT: s_and_b32 s5, s7, s18 -; GFX7-NEXT: s_lshl_b32 s5, s5, 24 +; GFX7-NEXT: s_lshl_b32 s5, s7, 24 ; GFX7-NEXT: s_lshr_b32 s9, s1, 8 ; GFX7-NEXT: s_or_b32 s8, s0, s5 ; GFX7-NEXT: s_lshr_b32 s10, s1, 16 @@ -4478,8 +4444,7 @@ ; GFX7-NEXT: s_and_b32 s1, s10, s18 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s11, s18 -; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_lshl_b32 s1, s11, 24 ; GFX7-NEXT: s_lshr_b32 s12, s2, 8 ; GFX7-NEXT: s_or_b32 s9, s0, s1 ; GFX7-NEXT: s_and_b32 s1, s12, s18 @@ -4491,8 +4456,7 @@ ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_lshr_b32 s14, s2, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s14, s18 -; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_lshl_b32 s1, s14, 24 ; GFX7-NEXT: s_lshr_b32 s15, s3, 8 ; GFX7-NEXT: s_or_b32 s10, s0, s1 ; GFX7-NEXT: s_and_b32 s1, s15, s18 @@ -4504,8 +4468,7 @@ ; GFX7-NEXT: s_lshr_b32 s17, s3, 24 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s17, s18 -; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_lshl_b32 s1, s17, 24 ; GFX7-NEXT: v_mov_b32_e32 v1, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s9 ; GFX7-NEXT: s_or_b32 s11, s0, s1 @@ -4542,8 +4505,7 @@ ; GFX7-NEXT: v_and_b32_e32 v4, s18, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s18, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s18, v7 @@ -4555,8 +4517,7 @@ ; GFX7-NEXT: v_and_b32_e32 v4, s18, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s18, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v9 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 8, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s18, v10 @@ -4568,8 +4529,7 @@ ; GFX7-NEXT: v_and_b32_e32 v4, s18, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s18, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v12 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 8, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s18, v13 @@ -4581,8 +4541,7 @@ ; GFX7-NEXT: v_and_b32_e32 v4, s18, v14 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s18, v15 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v15 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -4614,8 +4573,7 @@ ; GFX9-NEXT: s_and_b32 s4, s5, s17 ; GFX9-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s4 -; GFX9-NEXT: s_and_b32 s4, s6, s17 -; GFX9-NEXT: s_lshl_b32 s4, s4, 24 +; GFX9-NEXT: s_lshl_b32 s4, s6, 24 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8 ; GFX9-NEXT: s_or_b32 s4, s0, s4 ; GFX9-NEXT: s_lshr_b32 s9, s1, 16 @@ -4627,8 +4585,7 @@ ; GFX9-NEXT: s_and_b32 s1, s9, s17 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s10, s17 -; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: s_lshl_b32 s1, s10, 24 ; GFX9-NEXT: s_lshr_b32 s11, s2, 8 ; GFX9-NEXT: s_or_b32 s5, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s11, s17 @@ -4640,8 +4597,7 @@ ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_lshr_b32 s13, s2, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s13, s17 -; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: s_lshl_b32 s1, s13, 24 ; GFX9-NEXT: s_lshr_b32 s14, s3, 8 ; GFX9-NEXT: s_or_b32 s6, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s14, s17 @@ -4653,8 +4609,7 @@ ; GFX9-NEXT: s_lshr_b32 s16, s3, 24 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s16, s17 -; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: s_lshl_b32 s1, s16, 24 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: s_or_b32 s7, s0, s1 @@ -4678,33 +4633,37 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] ; GFX9-NEXT: s_mov_b32 s8, 8 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_and_or_b32 v4, v0, s17, v4 -; GFX9-NEXT: v_and_b32_sdwa v8, v0, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s17 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v4, v8, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v4, v0, s17, v4 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_or3_b32 v0, v4, v0, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX9-NEXT: v_and_or_b32 v5, v1, s17, v5 -; GFX9-NEXT: v_and_b32_sdwa v8, v1, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s17 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v1, v5, v8, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_and_or_b32 v5, v1, s17, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v7 +; GFX9-NEXT: v_or3_b32 v1, v5, v1, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_or_b32 v5, v2, s17, v5 -; GFX9-NEXT: v_and_b32_sdwa v6, v2, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v2, v2, s17 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v2, v5, v6, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v3 +; GFX9-NEXT: v_and_b32_sdwa v2, v2, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v9 +; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6 ; GFX9-NEXT: v_and_or_b32 v4, v3, s17, v4 -; GFX9-NEXT: v_and_b32_sdwa v5, v3, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v3, v3, s17 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v3, v4, v5, v3 +; GFX9-NEXT: v_and_b32_sdwa v3, v3, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v11 +; GFX9-NEXT: v_or3_b32 v3, v4, v3, v5 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off @@ -4728,8 +4687,7 @@ ; GFX8-NEXT: s_and_b32 s4, s5, s16 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s4 -; GFX8-NEXT: s_and_b32 s4, s6, s16 -; GFX8-NEXT: s_lshl_b32 s4, s4, 24 +; GFX8-NEXT: s_lshl_b32 s4, s6, 24 ; GFX8-NEXT: s_lshr_b32 s7, s1, 8 ; GFX8-NEXT: s_or_b32 s4, s0, s4 ; GFX8-NEXT: s_lshr_b32 s8, s1, 16 @@ -4741,8 +4699,7 @@ ; GFX8-NEXT: s_and_b32 s1, s8, s16 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s9, s16 -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_lshl_b32 s1, s9, 24 ; GFX8-NEXT: s_lshr_b32 s10, s2, 8 ; GFX8-NEXT: s_or_b32 s5, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s10, s16 @@ -4754,8 +4711,7 @@ ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_lshr_b32 s12, s2, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s12, s16 -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_lshl_b32 s1, s12, 24 ; GFX8-NEXT: s_lshr_b32 s13, s3, 8 ; GFX8-NEXT: s_or_b32 s6, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s13, s16 @@ -4767,8 +4723,7 @@ ; GFX8-NEXT: s_lshr_b32 s15, s3, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s15, s16 -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_lshl_b32 s1, s15, 24 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_or_b32 s7, s0, s1 @@ -4792,39 +4747,43 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX8-NEXT: v_mov_b32_e32 v8, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_mov_b32_e32 v8, s16 -; GFX8-NEXT: v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v9, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v12, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v12, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v12, s16 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX8-NEXT: v_or_b32_e32 v4, v4, v9 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX8-NEXT: v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX8-NEXT: v_or_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v9, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v9 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX8-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v6, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX8-NEXT: v_and_b32_sdwa v2, v2, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_and_b32_sdwa v2, v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3 ; GFX8-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v5, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX8-NEXT: v_and_b32_sdwa v3, v3, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v3, v3, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v9 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v11 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -4848,8 +4807,7 @@ ; GFX7-NEXT: s_and_b32 s4, s5, s16 ; GFX7-NEXT: s_lshl_b32 s4, s4, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s4 -; GFX7-NEXT: s_and_b32 s4, s6, s16 -; GFX7-NEXT: s_lshl_b32 s4, s4, 24 +; GFX7-NEXT: s_lshl_b32 s4, s6, 24 ; GFX7-NEXT: s_lshr_b32 s7, s1, 8 ; GFX7-NEXT: s_or_b32 s4, s0, s4 ; GFX7-NEXT: s_lshr_b32 s8, s1, 16 @@ -4861,8 +4819,7 @@ ; GFX7-NEXT: s_and_b32 s1, s8, s16 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s9, s16 -; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_lshl_b32 s1, s9, 24 ; GFX7-NEXT: s_lshr_b32 s10, s2, 8 ; GFX7-NEXT: s_or_b32 s5, s0, s1 ; GFX7-NEXT: s_and_b32 s1, s10, s16 @@ -4874,8 +4831,7 @@ ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_lshr_b32 s12, s2, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s12, s16 -; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_lshl_b32 s1, s12, 24 ; GFX7-NEXT: s_lshr_b32 s13, s3, 8 ; GFX7-NEXT: s_or_b32 s6, s0, s1 ; GFX7-NEXT: s_and_b32 s1, s13, s16 @@ -4887,8 +4843,7 @@ ; GFX7-NEXT: s_lshr_b32 s15, s3, 24 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s15, s16 -; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_lshl_b32 s1, s15, 24 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: s_or_b32 s7, s0, s1 @@ -4925,8 +4880,7 @@ ; GFX7-NEXT: v_and_b32_e32 v4, s16, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s16, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s16, v7 @@ -4938,8 +4892,7 @@ ; GFX7-NEXT: v_and_b32_e32 v4, s16, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s16, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v9 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 8, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s16, v10 @@ -4951,8 +4904,7 @@ ; GFX7-NEXT: v_and_b32_e32 v4, s16, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s16, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v12 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 8, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s16, v13 @@ -4964,8 +4916,7 @@ ; GFX7-NEXT: v_and_b32_e32 v4, s16, v14 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s16, v15 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v15 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -4997,31 +4948,35 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 8, v5 ; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v6 -; GFX9-NEXT: v_and_b32_sdwa v12, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v13, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v8 -; GFX9-NEXT: v_and_b32_sdwa v14, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v15, v4, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v9 ; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v16, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v17, v5, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v10 -; GFX9-NEXT: v_lshlrev_b32_sdwa v11, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or3_b32 v3, v3, v12, v13 -; GFX9-NEXT: v_or3_b32 v4, v4, v14, v15 -; GFX9-NEXT: v_and_b32_sdwa v18, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v19, v6, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v6, v6, s6, v11 -; GFX9-NEXT: v_or3_b32 v5, v5, v16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 24, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 8, v6 +; GFX9-NEXT: v_and_b32_sdwa v16, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v17, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v10 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v6 +; GFX9-NEXT: v_and_b32_sdwa v18, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v12 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or3_b32 v3, v3, v16, v9 +; GFX9-NEXT: v_or3_b32 v4, v4, v17, v11 +; GFX9-NEXT: v_and_b32_sdwa v19, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 24, v15 +; GFX9-NEXT: v_and_or_b32 v6, v6, s6, v14 +; GFX9-NEXT: v_or3_b32 v5, v5, v18, v13 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1 -; GFX9-NEXT: v_or3_b32 v6, v6, v18, v19 +; GFX9-NEXT: v_or3_b32 v6, v6, v19, v15 ; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v5, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[2:3] ; GFX9-NEXT: v_and_or_b32 v2, v8, v2, v7 @@ -5030,29 +4985,33 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v4, v5, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3] ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v9, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v10, v1, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v13, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v14, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v15, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v5 +; GFX9-NEXT: v_and_b32_sdwa v16, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX9-NEXT: v_and_or_b32 v5, v2, s6, v0 -; GFX9-NEXT: v_and_b32_sdwa v11, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v12, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v13, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v14, v4, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v15, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v16, v2, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v7 -; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v6 -; GFX9-NEXT: v_or3_b32 v0, v1, v9, v10 -; GFX9-NEXT: v_or3_b32 v1, v3, v11, v12 -; GFX9-NEXT: v_or3_b32 v2, v4, v13, v14 -; GFX9-NEXT: v_or3_b32 v3, v5, v15, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v12 +; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v7 +; GFX9-NEXT: v_or3_b32 v0, v1, v13, v6 +; GFX9-NEXT: v_or3_b32 v1, v3, v14, v8 +; GFX9-NEXT: v_or3_b32 v2, v4, v15, v10 +; GFX9-NEXT: v_or3_b32 v3, v5, v16, v11 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off @@ -5077,35 +5036,39 @@ ; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 8, v6 -; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v14, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v15, v3, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v16, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v5 ; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v14 -; GFX8-NEXT: v_and_b32_sdwa v17, v4, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v3, v3, v16 -; GFX8-NEXT: v_and_b32_sdwa v18, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v4, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v19, v5, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v10, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v5, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v4, v4, v18 -; GFX8-NEXT: v_and_b32_sdwa v6, v6, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v5, v5, v10 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v15 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 8, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v4 +; GFX8-NEXT: v_and_b32_sdwa v18, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v19, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 24, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v19 +; GFX8-NEXT: v_and_b32_sdwa v10, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 24, v6 +; GFX8-NEXT: v_or_b32_sdwa v5, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 24, v15 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v10 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v12 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX8-NEXT: v_or_b32_e32 v4, v4, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 24, v17 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v13 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v3, vcc +; GFX8-NEXT: v_or_b32_e32 v5, v5, v14 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v4, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[2:3] ; GFX8-NEXT: v_and_b32_e32 v2, v6, v2 @@ -5115,34 +5078,38 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[2:3] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v12, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v13, v3, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v14, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v15, v4, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v16, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2 +; GFX8-NEXT: v_and_b32_sdwa v15, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v16, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v14, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v7, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v10, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v11, v0, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v7, v2, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v3, v12 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v14 -; GFX8-NEXT: v_or_b32_e32 v4, v1, v16 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v13 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v15 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v7 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX8-NEXT: v_or_b32_e32 v4, v1, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v9 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v11 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v12 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v14 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -5187,8 +5154,6 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 24, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v6 -; GFX7-NEXT: v_and_b32_e32 v7, s6, v7 -; GFX7-NEXT: v_and_b32_e32 v10, s6, v10 ; GFX7-NEXT: v_and_b32_e32 v12, s6, v12 ; GFX7-NEXT: v_and_b32_e32 v14, s6, v14 ; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 @@ -5198,7 +5163,6 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 24, v6 -; GFX7-NEXT: v_and_b32_e32 v13, s6, v13 ; GFX7-NEXT: v_and_b32_e32 v15, s6, v15 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 @@ -5208,7 +5172,6 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX7-NEXT: v_and_b32_e32 v16, s6, v16 ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v12 ; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 @@ -5232,16 +5195,10 @@ ; GFX7-NEXT: v_and_b32_e32 v2, s6, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s6, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v2, s6, v7 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v1 @@ -5251,8 +5208,7 @@ ; GFX7-NEXT: v_and_b32_e32 v2, s6, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s6, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v9 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 8, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v3 @@ -5264,8 +5220,7 @@ ; GFX7-NEXT: v_and_b32_e32 v3, s6, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_and_b32_e32 v3, s6, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v12 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 8, v4 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v4 @@ -5273,12 +5228,16 @@ ; GFX7-NEXT: v_and_b32_e32 v3, s6, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s6, v13 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_and_b32_e32 v5, s6, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s6, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v15 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm @@ -5305,31 +5264,35 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v4 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v6 -; GFX9-NEXT: v_and_b32_sdwa v10, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v11, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX9-NEXT: v_and_b32_sdwa v12, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v13, v4, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v3, v4, s6, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v5 ; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v14, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v15, v5, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v4, v5, s6, v8 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or3_b32 v2, v2, v10, v11 -; GFX9-NEXT: v_or3_b32 v3, v3, v12, v13 -; GFX9-NEXT: v_and_b32_sdwa v16, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v17, v6, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v5, v6, s6, v9 -; GFX9-NEXT: v_or3_b32 v4, v4, v14, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 8, v6 +; GFX9-NEXT: v_and_b32_sdwa v14, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v2, v3, s6, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_and_b32_sdwa v15, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_and_or_b32 v3, v4, s6, v8 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 24, v6 +; GFX9-NEXT: v_and_b32_sdwa v16, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX9-NEXT: v_and_or_b32 v4, v5, s6, v10 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or3_b32 v2, v2, v14, v7 +; GFX9-NEXT: v_or3_b32 v3, v3, v15, v9 +; GFX9-NEXT: v_and_b32_sdwa v17, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v5, v6, s6, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; GFX9-NEXT: v_or3_b32 v4, v4, v16, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v3, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX9-NEXT: v_or3_b32 v5, v5, v16, v17 +; GFX9-NEXT: v_or3_b32 v5, v5, v17, v13 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v4, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[2:3] ; GFX9-NEXT: v_and_or_b32 v1, v6, s5, v1 @@ -5339,29 +5302,33 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[2:3] ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v9, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v10, v2, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v13, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v14, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v15, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v5 +; GFX9-NEXT: v_and_b32_sdwa v16, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX9-NEXT: v_and_or_b32 v5, v1, s6, v0 -; GFX9-NEXT: v_and_b32_sdwa v11, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v12, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v13, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v14, v4, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v15, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v16, v1, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v6 -; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v7 -; GFX9-NEXT: v_or3_b32 v0, v2, v9, v10 -; GFX9-NEXT: v_or3_b32 v1, v3, v11, v12 -; GFX9-NEXT: v_or3_b32 v2, v4, v13, v14 -; GFX9-NEXT: v_or3_b32 v3, v5, v15, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v12 +; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v9 +; GFX9-NEXT: v_or3_b32 v0, v2, v13, v6 +; GFX9-NEXT: v_or3_b32 v1, v3, v14, v8 +; GFX9-NEXT: v_or3_b32 v2, v4, v15, v10 +; GFX9-NEXT: v_or3_b32 v3, v5, v16, v11 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off @@ -5386,34 +5353,38 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v5 -; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v12, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v11 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v6 +; GFX8-NEXT: v_and_b32_sdwa v16, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v13, v3, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v14, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 -; GFX8-NEXT: v_and_b32_sdwa v15, v4, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v3, v3, v14 -; GFX8-NEXT: v_and_b32_sdwa v16, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v4, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v17, v5, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v4, v4, v16 -; GFX8-NEXT: v_and_b32_sdwa v18, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v5, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v13 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v15 -; GFX8-NEXT: v_and_b32_sdwa v19, v6, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v5, v5, v18 -; GFX8-NEXT: v_or_b32_e32 v4, v4, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v3, vcc +; GFX8-NEXT: v_and_b32_sdwa v17, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v16 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; GFX8-NEXT: v_and_b32_sdwa v18, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 24, v6 +; GFX8-NEXT: v_and_b32_sdwa v19, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 24, v15 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v19 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v12 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v3, vcc +; GFX8-NEXT: v_or_b32_e32 v5, v5, v14 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v4, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[2:3] ; GFX8-NEXT: v_and_b32_e32 v6, s5, v6 @@ -5424,34 +5395,38 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[2:3] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v12, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v13, v3, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v14, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v15, v4, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v16, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2 +; GFX8-NEXT: v_and_b32_sdwa v15, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v16, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v14, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v7, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v10, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v11, v0, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v7, v2, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v3, v12 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v14 -; GFX8-NEXT: v_or_b32_e32 v4, v1, v16 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v13 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v15 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v7 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX8-NEXT: v_or_b32_e32 v4, v1, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v9 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v11 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v12 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v14 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -5495,8 +5470,6 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 24, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v6 -; GFX7-NEXT: v_and_b32_e32 v7, s6, v7 -; GFX7-NEXT: v_and_b32_e32 v10, s6, v10 ; GFX7-NEXT: v_and_b32_e32 v12, s6, v12 ; GFX7-NEXT: v_and_b32_e32 v14, s6, v14 ; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 @@ -5506,7 +5479,6 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 24, v6 -; GFX7-NEXT: v_and_b32_e32 v13, s6, v13 ; GFX7-NEXT: v_and_b32_e32 v15, s6, v15 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 @@ -5516,7 +5488,6 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX7-NEXT: v_and_b32_e32 v16, s6, v16 ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v12 ; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 @@ -5541,27 +5512,20 @@ ; GFX7-NEXT: v_and_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v2 ; GFX7-NEXT: v_and_b32_e32 v1, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_and_b32_e32 v1, s6, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, s6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v1, s6, v7 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v2 -; GFX7-NEXT: v_and_b32_e32 v1, s6, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s6, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s6, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s6, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s6, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v9 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 8, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v3 @@ -5573,8 +5537,7 @@ ; GFX7-NEXT: v_and_b32_e32 v3, s6, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_and_b32_e32 v3, s6, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v12 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 8, v4 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v4 @@ -5582,12 +5545,16 @@ ; GFX7-NEXT: v_and_b32_e32 v3, s6, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s6, v13 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_and_b32_e32 v5, s6, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s6, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v15 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm @@ -5616,29 +5583,33 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v5 ; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v13, v4, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v14, v4, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v4, v4, s1, v9 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v15, v5, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v16, v5, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v5, v5, s1, v10 ; GFX9-NEXT: v_lshlrev_b32_sdwa v11, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v17, v6, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v18, v6, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v6, v6, s1, v11 -; GFX9-NEXT: v_and_b32_sdwa v19, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v9, v7, v0, v12 -; GFX9-NEXT: v_and_b32_sdwa v7, v7, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v4, v4, v13, v14 -; GFX9-NEXT: v_or3_b32 v5, v5, v15, v16 -; GFX9-NEXT: v_or3_b32 v7, v9, v19, v7 -; GFX9-NEXT: v_or3_b32 v6, v6, v17, v18 +; GFX9-NEXT: v_and_b32_sdwa v17, v4, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v18, v5, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v5, v5, s1, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 24, v7 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v4, v4, s1, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX9-NEXT: v_and_b32_sdwa v19, v6, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v6, v6, s1, v13 +; GFX9-NEXT: v_and_or_b32 v9, v7, v0, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v14 +; GFX9-NEXT: v_and_b32_sdwa v7, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 24, v16 +; GFX9-NEXT: v_or3_b32 v4, v4, v17, v10 +; GFX9-NEXT: v_or3_b32 v5, v5, v18, v11 +; GFX9-NEXT: v_or3_b32 v7, v9, v7, v13 +; GFX9-NEXT: v_or3_b32 v6, v6, v19, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v4, v5, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] @@ -5648,31 +5619,35 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v5, v6, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[2:3] -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 8, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX9-NEXT: v_and_b32_sdwa v15, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v16, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v12, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v13, v4, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v14, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v15, v5, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_or_b32 v4, v4, v0, v7 -; GFX9-NEXT: v_and_or_b32 v5, v5, v0, v8 -; GFX9-NEXT: v_and_b32_sdwa v10, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v11, v3, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX9-NEXT: v_and_or_b32 v4, v4, v0, v8 +; GFX9-NEXT: v_and_or_b32 v5, v5, v0, v10 +; GFX9-NEXT: v_and_b32_sdwa v14, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v3, v3, v0, v6 ; GFX9-NEXT: v_and_or_b32 v6, v2, v0, v1 -; GFX9-NEXT: v_and_b32_sdwa v16, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v17, v2, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v1, v4, v12, v13 -; GFX9-NEXT: v_or3_b32 v2, v5, v14, v15 +; GFX9-NEXT: v_and_b32_sdwa v17, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; GFX9-NEXT: v_or3_b32 v1, v4, v15, v9 +; GFX9-NEXT: v_or3_b32 v2, v5, v16, v11 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_or3_b32 v0, v3, v10, v11 -; GFX9-NEXT: v_or3_b32 v3, v6, v16, v17 +; GFX9-NEXT: v_or3_b32 v0, v3, v14, v7 +; GFX9-NEXT: v_or3_b32 v3, v6, v17, v12 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm @@ -5696,35 +5671,39 @@ ; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 8, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 8, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v7 -; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v8, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v15, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v16, v4, v9 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v17, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v4, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 8, v6 ; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v8, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v8, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 -; GFX8-NEXT: v_and_b32_sdwa v9, v5, v9 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v4, v4, v17 -; GFX8-NEXT: v_and_b32_sdwa v18, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v5, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v19, v6, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v11, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v6, v7, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v5, v5, v18 -; GFX8-NEXT: v_and_b32_sdwa v7, v7, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v6, v6, v11 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v16 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 8, v7 +; GFX8-NEXT: v_and_b32_sdwa v19, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 24, v5 +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v9, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v8, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 24, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 24, v14 ; GFX8-NEXT: v_or_b32_e32 v4, v4, v9 +; GFX8-NEXT: v_and_b32_sdwa v11, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v6, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v8, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v19 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 24, v7 +; GFX8-NEXT: v_or_b32_sdwa v6, v7, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v7, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 24, v16 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v11 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v12 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v13 ; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 24, v18 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v14 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; GFX8-NEXT: v_or_b32_e32 v6, v6, v15 ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3] ; GFX8-NEXT: v_and_b32_e32 v3, v7, v3 @@ -5734,34 +5713,38 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2 -; GFX8-NEXT: v_and_b32_sdwa v10, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v11, v1, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v12, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v13, v3, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v14, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v15, v4, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v16, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v17, v2, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v3, v3, v14 -; GFX8-NEXT: v_or_b32_e32 v4, v2, v16 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v12 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v15 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v8, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2 +; GFX8-NEXT: v_and_b32_sdwa v14, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v15, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v16, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v14 +; GFX8-NEXT: v_or_b32_e32 v5, v2, v0 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v16 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v15 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v6 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v9 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v11 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v12 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v13 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -5799,15 +5782,13 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 8, v7 ; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 ; GFX7-NEXT: v_and_b32_e32 v11, s0, v11 -; GFX7-NEXT: v_and_b32_e32 v13, v13, v8 +; GFX7-NEXT: v_and_b32_e32 v13, s0, v13 ; GFX7-NEXT: v_and_b32_e32 v4, s0, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX7-NEXT: v_and_b32_e32 v5, s0, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 24, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v9, s0, v9 -; GFX7-NEXT: v_and_b32_e32 v12, s0, v12 ; GFX7-NEXT: v_and_b32_e32 v14, v14, v8 ; GFX7-NEXT: v_and_b32_e32 v16, v16, v8 ; GFX7-NEXT: v_and_b32_e32 v6, s0, v6 @@ -5817,7 +5798,6 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v18, 24, v7 -; GFX7-NEXT: v_and_b32_e32 v15, v15, v8 ; GFX7-NEXT: v_and_b32_e32 v17, v17, v8 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 @@ -5827,7 +5807,6 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GFX7-NEXT: v_or_b32_e32 v5, v6, v13 ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX7-NEXT: v_and_b32_e32 v18, v18, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v15, 24, v15 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v14 ; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17 @@ -5852,16 +5831,10 @@ ; GFX7-NEXT: v_and_b32_e32 v2, v2, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX7-NEXT: v_and_b32_e32 v0, v0, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, v5, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, v6, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v2, v7, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 24, v1 @@ -5871,8 +5844,7 @@ ; GFX7-NEXT: v_and_b32_e32 v2, v9, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-NEXT: v_and_b32_e32 v2, v10, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v11, 8, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v3 @@ -5884,8 +5856,7 @@ ; GFX7-NEXT: v_and_b32_e32 v3, v12, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_and_b32_e32 v3, v13, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v13 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 8, v4 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v4 @@ -5893,12 +5864,16 @@ ; GFX7-NEXT: v_and_b32_e32 v3, v4, v8 ; GFX7-NEXT: v_and_b32_e32 v4, v14, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v5, v8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_and_b32_e32 v4, v15, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_and_b32_e32 v4, v16, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v16 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -46,8 +46,8 @@ ; GFX9-LABEL: v_lshr_i8_7: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 7 -; GFX9-NEXT: v_lshrrev_b16_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mov_b32_e32 v1, 7 +; GFX9-NEXT: v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = lshr i8 %value, 7 ret i8 %result @@ -557,13 +557,11 @@ ; ; GFX8-LABEL: s_lshr_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_mov_b32 s3, 0xffff +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s3 ; GFX8-NEXT: s_and_b32 s1, s1, s3 -; GFX8-NEXT: s_and_b32 s2, s2, s3 -; GFX8-NEXT: s_and_b32 s4, s4, s3 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 ; GFX8-NEXT: s_lshr_b32 s1, s2, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 @@ -740,21 +738,17 @@ ; ; GFX8-LABEL: s_lshr_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_mov_b32 s6, 0xffff +; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_lshr_b32 s7, s2, 16 -; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_lshr_b32 s8, s3, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s6 ; GFX8-NEXT: s_and_b32 s2, s2, s6 -; GFX8-NEXT: s_and_b32 s4, s4, s6 -; GFX8-NEXT: s_and_b32 s7, s7, s6 ; GFX8-NEXT: s_lshr_b32 s0, s0, s2 ; GFX8-NEXT: s_lshr_b32 s2, s4, s7 +; GFX8-NEXT: s_lshr_b32 s5, s1, 16 +; GFX8-NEXT: s_lshr_b32 s8, s3, 16 ; GFX8-NEXT: s_and_b32 s1, s1, s6 ; GFX8-NEXT: s_and_b32 s3, s3, s6 -; GFX8-NEXT: s_and_b32 s5, s5, s6 -; GFX8-NEXT: s_and_b32 s8, s8, s6 ; GFX8-NEXT: s_lshr_b32 s1, s1, s3 ; GFX8-NEXT: s_lshr_b32 s3, s5, s8 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 @@ -932,39 +926,31 @@ ; ; GFX8-LABEL: s_lshr_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s8, s0, 16 ; GFX8-NEXT: s_mov_b32 s12, 0xffff +; GFX8-NEXT: s_lshr_b32 s8, s0, 16 ; GFX8-NEXT: s_lshr_b32 s13, s4, 16 -; GFX8-NEXT: s_lshr_b32 s9, s1, 16 -; GFX8-NEXT: s_lshr_b32 s14, s5, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s12 ; GFX8-NEXT: s_and_b32 s4, s4, s12 -; GFX8-NEXT: s_and_b32 s8, s8, s12 -; GFX8-NEXT: s_and_b32 s13, s13, s12 -; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: s_lshr_b32 s15, s6, 16 ; GFX8-NEXT: s_lshr_b32 s0, s0, s4 ; GFX8-NEXT: s_lshr_b32 s4, s8, s13 +; GFX8-NEXT: s_lshr_b32 s9, s1, 16 +; GFX8-NEXT: s_lshr_b32 s14, s5, 16 ; GFX8-NEXT: s_and_b32 s1, s1, s12 ; GFX8-NEXT: s_and_b32 s5, s5, s12 -; GFX8-NEXT: s_and_b32 s9, s9, s12 -; GFX8-NEXT: s_and_b32 s14, s14, s12 -; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: s_lshr_b32 s16, s7, 16 ; GFX8-NEXT: s_lshr_b32 s1, s1, s5 +; GFX8-NEXT: s_lshr_b32 s10, s2, 16 +; GFX8-NEXT: s_lshr_b32 s15, s6, 16 ; GFX8-NEXT: s_and_b32 s2, s2, s12 ; GFX8-NEXT: s_and_b32 s6, s6, s12 -; GFX8-NEXT: s_and_b32 s10, s10, s12 -; GFX8-NEXT: s_and_b32 s15, s15, s12 ; GFX8-NEXT: s_lshr_b32 s5, s9, s14 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s12 ; GFX8-NEXT: s_lshr_b32 s2, s2, s6 +; GFX8-NEXT: s_lshr_b32 s11, s3, 16 +; GFX8-NEXT: s_lshr_b32 s16, s7, 16 ; GFX8-NEXT: s_or_b32 s0, s4, s0 ; GFX8-NEXT: s_and_b32 s3, s3, s12 ; GFX8-NEXT: s_and_b32 s7, s7, s12 -; GFX8-NEXT: s_and_b32 s11, s11, s12 -; GFX8-NEXT: s_and_b32 s16, s16, s12 ; GFX8-NEXT: s_lshr_b32 s6, s10, s15 ; GFX8-NEXT: s_lshl_b32 s4, s5, 16 ; GFX8-NEXT: s_and_b32 s1, s1, s12 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir @@ -0,0 +1,22 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: remove_and_255_zextload +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: remove_and_255_zextload + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK: %ptr:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK: %load:_(s32) = G_ZEXTLOAD %ptr(p1) :: (load 1, addrspace 1) + ; CHECK: $vgpr0 = COPY %load(s32) + %ptr:_(p1) = COPY $vgpr0_vgpr1 + %load:_(s32) = G_ZEXTLOAD %ptr :: (load 1, addrspace 1, align 1) + %mask:_(s32) = G_CONSTANT i32 255 + %and:_(s32) = G_AND %load, %mask + $vgpr0 = COPY %and + +... Index: llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -461,7 +461,6 @@ ; GFX7-NEXT: s_and_b32 s0, s0, 0x3fff3fff ; GFX7-NEXT: s_lshr_b32 s1, s0, 16 ; GFX7-NEXT: s_and_b32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s1, s1, s2 ; GFX7-NEXT: s_lshl_b32 s0, s0, 2 ; GFX7-NEXT: s_lshl_b32 s1, s1, 2 ; GFX7-NEXT: ; return to shader part epilog @@ -473,7 +472,6 @@ ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_mov_b32 s3, s2 ; GFX8-NEXT: s_and_b32 s0, s0, s4 -; GFX8-NEXT: s_and_b32 s1, s1, s4 ; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_mov_b32 s5, s4 ; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] @@ -485,9 +483,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s0, s0, 0x3fff3fff ; GFX9-NEXT: s_lshr_b32 s1, s0, 16 -; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: s_and_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: s_lshl_b32 s1, s1, 2 ; GFX9-NEXT: ; return to shader part epilog @@ -506,14 +502,13 @@ ; GFX7-LABEL: v_shl_v2i32_zext_v2i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0x3fff3fff, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX7-NEXT: s_mov_b32 s4, 0xffff -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -521,24 +516,19 @@ ; GFX8-LABEL: v_shl_v2i32_zext_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_and_b32_e32 v0, 0x3fff3fff, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v1, 0x3fff3fff, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_shl_v2i32_zext_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0x3fff3fff, v0 -; GFX9-NEXT: s_mov_b32 s5, 0xffff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: s_mov_b32 s4, 2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0x3fff3fff, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %and = and <2 x i16> %x, %ext = zext <2 x i16> %and to <2 x i32> Index: llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -551,13 +551,11 @@ ; ; GFX8-LABEL: s_shl_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_mov_b32 s3, 0xffff +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s3 ; GFX8-NEXT: s_and_b32 s1, s1, s3 -; GFX8-NEXT: s_and_b32 s2, s2, s3 -; GFX8-NEXT: s_and_b32 s4, s4, s3 ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s2, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 @@ -722,21 +720,17 @@ ; ; GFX8-LABEL: s_shl_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_mov_b32 s6, 0xffff +; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_lshr_b32 s7, s2, 16 -; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_lshr_b32 s8, s3, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s6 ; GFX8-NEXT: s_and_b32 s2, s2, s6 -; GFX8-NEXT: s_and_b32 s4, s4, s6 -; GFX8-NEXT: s_and_b32 s7, s7, s6 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_lshl_b32 s2, s4, s7 +; GFX8-NEXT: s_lshr_b32 s5, s1, 16 +; GFX8-NEXT: s_lshr_b32 s8, s3, 16 ; GFX8-NEXT: s_and_b32 s1, s1, s6 ; GFX8-NEXT: s_and_b32 s3, s3, s6 -; GFX8-NEXT: s_and_b32 s5, s5, s6 -; GFX8-NEXT: s_and_b32 s8, s8, s6 ; GFX8-NEXT: s_lshl_b32 s1, s1, s3 ; GFX8-NEXT: s_lshl_b32 s3, s5, s8 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 @@ -898,39 +892,31 @@ ; ; GFX8-LABEL: s_shl_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s8, s0, 16 ; GFX8-NEXT: s_mov_b32 s12, 0xffff +; GFX8-NEXT: s_lshr_b32 s8, s0, 16 ; GFX8-NEXT: s_lshr_b32 s13, s4, 16 -; GFX8-NEXT: s_lshr_b32 s9, s1, 16 -; GFX8-NEXT: s_lshr_b32 s14, s5, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s12 ; GFX8-NEXT: s_and_b32 s4, s4, s12 -; GFX8-NEXT: s_and_b32 s8, s8, s12 -; GFX8-NEXT: s_and_b32 s13, s13, s12 -; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: s_lshr_b32 s15, s6, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 ; GFX8-NEXT: s_lshl_b32 s4, s8, s13 +; GFX8-NEXT: s_lshr_b32 s9, s1, 16 +; GFX8-NEXT: s_lshr_b32 s14, s5, 16 ; GFX8-NEXT: s_and_b32 s1, s1, s12 ; GFX8-NEXT: s_and_b32 s5, s5, s12 -; GFX8-NEXT: s_and_b32 s9, s9, s12 -; GFX8-NEXT: s_and_b32 s14, s14, s12 -; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: s_lshr_b32 s16, s7, 16 ; GFX8-NEXT: s_lshl_b32 s1, s1, s5 +; GFX8-NEXT: s_lshr_b32 s10, s2, 16 +; GFX8-NEXT: s_lshr_b32 s15, s6, 16 ; GFX8-NEXT: s_and_b32 s2, s2, s12 ; GFX8-NEXT: s_and_b32 s6, s6, s12 -; GFX8-NEXT: s_and_b32 s10, s10, s12 -; GFX8-NEXT: s_and_b32 s15, s15, s12 ; GFX8-NEXT: s_lshl_b32 s5, s9, s14 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s12 ; GFX8-NEXT: s_lshl_b32 s2, s2, s6 +; GFX8-NEXT: s_lshr_b32 s11, s3, 16 +; GFX8-NEXT: s_lshr_b32 s16, s7, 16 ; GFX8-NEXT: s_or_b32 s0, s4, s0 ; GFX8-NEXT: s_and_b32 s3, s3, s12 ; GFX8-NEXT: s_and_b32 s7, s7, s12 -; GFX8-NEXT: s_and_b32 s11, s11, s12 -; GFX8-NEXT: s_and_b32 s16, s16, s12 ; GFX8-NEXT: s_lshl_b32 s6, s10, s15 ; GFX8-NEXT: s_lshl_b32 s4, s5, 16 ; GFX8-NEXT: s_and_b32 s1, s1, s12 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -205,10 +205,7 @@ ; GFX6-NEXT: v_min_u32_e32 v2, v3, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, 0xff ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: v_and_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -223,10 +220,9 @@ ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp ; GFX8-NEXT: v_add_u16_e64 v1, v3, v2 clamp -; GFX8-NEXT: v_mov_b32_e32 v2, 0xff -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v2i8: @@ -291,10 +287,7 @@ ; GFX6-NEXT: s_cmp_lt_u32 s3, s2 ; GFX6-NEXT: s_cselect_b32 s2, s3, s2 ; GFX6-NEXT: s_add_i32 s1, s1, s2 -; GFX6-NEXT: s_movk_i32 s2, 0xff ; GFX6-NEXT: s_lshr_b32 s1, s1, 24 -; GFX6-NEXT: s_and_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -311,11 +304,10 @@ ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp ; GFX8-NEXT: s_lshl_b32 s0, s2, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0xff ; GFX8-NEXT: v_add_u16_e64 v1, s0, v1 clamp -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -399,24 +391,19 @@ ; GFX6-NEXT: v_min_u32_e32 v3, v5, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: s_movk_i32 s4, 0xff -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX6-NEXT: v_xor_b32_e32 v5, -1, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_min_u32_e32 v4, v5, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -565,17 +552,12 @@ ; GFX6-NEXT: s_cmp_lt_u32 s5, s4 ; GFX6-NEXT: s_cselect_b32 s4, s5, s4 ; GFX6-NEXT: s_add_i32 s3, s3, s4 -; GFX6-NEXT: s_movk_i32 s4, 0xff -; GFX6-NEXT: s_and_b32 s1, s1, s4 -; GFX6-NEXT: s_and_b32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s4 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshl_b32 s1, s2, 16 ; GFX6-NEXT: s_lshr_b32 s3, s3, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s3, s4 -; GFX6-NEXT: s_lshl_b32 s1, s1, 24 +; GFX6-NEXT: s_lshl_b32 s1, s3, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; @@ -1892,10 +1874,7 @@ ; GFX6-NEXT: s_cmp_lt_u32 s3, s2 ; GFX6-NEXT: s_cselect_b32 s2, s3, s2 ; GFX6-NEXT: s_add_i32 s1, s1, s2 -; GFX6-NEXT: s_mov_b32 s2, 0xffff ; GFX6-NEXT: s_lshr_b32 s1, s1, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -1946,10 +1925,7 @@ ; GFX6-NEXT: v_min_u32_e32 v1, s1, v1 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s0, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s0, 0xffff ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -1994,10 +1970,7 @@ ; GFX6-NEXT: v_min_u32_e32 v2, s0, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s0, 0xffff ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -2063,19 +2036,14 @@ ; GFX6-NEXT: v_xor_b32_e32 v5, -1, v3 ; GFX6-NEXT: v_min_u32_e32 v4, v5, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_v4i16: @@ -2142,16 +2110,11 @@ ; GFX6-NEXT: s_cmp_lt_u32 s5, s4 ; GFX6-NEXT: s_cselect_b32 s4, s5, s4 ; GFX6-NEXT: s_add_i32 s3, s3, s4 -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s4 -; GFX6-NEXT: s_lshr_b32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshr_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s4 -; GFX6-NEXT: s_and_b32 s2, s3, s4 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_or_b32 s1, s1, s2 +; GFX6-NEXT: s_lshl_b32 s1, s3, 16 +; GFX6-NEXT: s_or_b32 s1, s2, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddsat_v4i16: @@ -2241,29 +2204,22 @@ ; GFX6-NEXT: v_min_u32_e32 v6, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 ; GFX6-NEXT: v_xor_b32_e32 v7, -1, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_u32_e32 v6, v7, v6 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_v6i16: @@ -2351,20 +2307,13 @@ ; GFX6-NEXT: s_cmp_lt_u32 s7, s6 ; GFX6-NEXT: s_cselect_b32 s6, s7, s6 ; GFX6-NEXT: s_add_i32 s5, s5, s6 -; GFX6-NEXT: s_mov_b32 s6, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s6 -; GFX6-NEXT: s_and_b32 s0, s0, s6 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s6 -; GFX6-NEXT: s_and_b32 s2, s3, s6 +; GFX6-NEXT: s_lshl_b32 s1, s3, 16 ; GFX6-NEXT: s_lshr_b32 s5, s5, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s6 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s6 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_lshl_b32 s2, s5, 16 +; GFX6-NEXT: s_or_b32 s2, s4, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddsat_v6i16: @@ -2466,36 +2415,27 @@ ; GFX6-NEXT: v_xor_b32_e32 v9, -1, v6 ; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 ; GFX6-NEXT: v_xor_b32_e32 v9, -1, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_v8i16: @@ -2603,24 +2543,15 @@ ; GFX6-NEXT: s_cmp_lt_u32 s9, s8 ; GFX6-NEXT: s_cselect_b32 s8, s9, s8 ; GFX6-NEXT: s_add_i32 s7, s7, s8 -; GFX6-NEXT: s_mov_b32 s8, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s8 -; GFX6-NEXT: s_and_b32 s0, s0, s8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s8 -; GFX6-NEXT: s_and_b32 s2, s3, s8 -; GFX6-NEXT: s_and_b32 s3, s5, s8 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_lshl_b32 s1, s3, 16 ; GFX6-NEXT: s_lshr_b32 s7, s7, 16 -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s8 -; GFX6-NEXT: s_and_b32 s4, s7, s8 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s6, s8 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_or_b32 s3, s3, s4 +; GFX6-NEXT: s_lshl_b32 s3, s7, 16 +; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_lshl_b32 s2, s5, 16 +; GFX6-NEXT: s_or_b32 s2, s4, s2 +; GFX6-NEXT: s_or_b32 s3, s6, s3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddsat_v8i16: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -199,10 +199,7 @@ ; GFX6-NEXT: v_min_u32_e32 v2, v1, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, 0xff ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: v_and_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -217,10 +214,9 @@ ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp ; GFX8-NEXT: v_sub_u16_e64 v1, v3, v2 clamp -; GFX8-NEXT: v_mov_b32_e32 v2, 0xff -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v2i8: @@ -283,10 +279,7 @@ ; GFX6-NEXT: s_cmp_lt_u32 s1, s2 ; GFX6-NEXT: s_cselect_b32 s2, s1, s2 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 -; GFX6-NEXT: s_movk_i32 s2, 0xff ; GFX6-NEXT: s_lshr_b32 s1, s1, 24 -; GFX6-NEXT: s_and_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -303,11 +296,10 @@ ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp ; GFX8-NEXT: s_lshl_b32 s0, s2, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0xff ; GFX8-NEXT: v_sub_u16_e64 v1, s0, v1 clamp -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -386,25 +378,20 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v6 ; GFX6-NEXT: v_min_u32_e32 v3, v2, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: s_movk_i32 s4, 0xff ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_min_u32_e32 v4, v3, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -549,17 +536,12 @@ ; GFX6-NEXT: s_cmp_lt_u32 s3, s4 ; GFX6-NEXT: s_cselect_b32 s4, s3, s4 ; GFX6-NEXT: s_sub_i32 s3, s3, s4 -; GFX6-NEXT: s_movk_i32 s4, 0xff -; GFX6-NEXT: s_and_b32 s1, s1, s4 -; GFX6-NEXT: s_and_b32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s4 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshl_b32 s1, s2, 16 ; GFX6-NEXT: s_lshr_b32 s3, s3, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s3, s4 -; GFX6-NEXT: s_lshl_b32 s1, s1, 24 +; GFX6-NEXT: s_lshl_b32 s1, s3, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; @@ -1802,10 +1784,7 @@ ; GFX6-NEXT: s_cmp_lt_u32 s1, s2 ; GFX6-NEXT: s_cselect_b32 s2, s1, s2 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 -; GFX6-NEXT: s_mov_b32 s2, 0xffff ; GFX6-NEXT: s_lshr_b32 s1, s1, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -1854,10 +1833,7 @@ ; GFX6-NEXT: v_min_u32_e32 v1, s0, v1 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s0, 0xffff ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -1900,10 +1876,7 @@ ; GFX6-NEXT: v_min_u32_e32 v2, s0, v1 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s0, 0xffff ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -1965,19 +1938,14 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX6-NEXT: v_min_u32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v4i16: @@ -2040,16 +2008,11 @@ ; GFX6-NEXT: s_cmp_lt_u32 s3, s4 ; GFX6-NEXT: s_cselect_b32 s4, s3, s4 ; GFX6-NEXT: s_sub_i32 s3, s3, s4 -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s4 -; GFX6-NEXT: s_lshr_b32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshr_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s4 -; GFX6-NEXT: s_and_b32 s2, s3, s4 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_or_b32 s1, s1, s2 +; GFX6-NEXT: s_lshl_b32 s1, s3, 16 +; GFX6-NEXT: s_or_b32 s1, s2, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_v4i16: @@ -2133,29 +2096,22 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 ; GFX6-NEXT: v_min_u32_e32 v6, v4, v6 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_u32_e32 v6, v5, v6 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v6i16: @@ -2237,20 +2193,13 @@ ; GFX6-NEXT: s_cmp_lt_u32 s5, s6 ; GFX6-NEXT: s_cselect_b32 s6, s5, s6 ; GFX6-NEXT: s_sub_i32 s5, s5, s6 -; GFX6-NEXT: s_mov_b32 s6, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s6 -; GFX6-NEXT: s_and_b32 s0, s0, s6 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s6 -; GFX6-NEXT: s_and_b32 s2, s3, s6 +; GFX6-NEXT: s_lshl_b32 s1, s3, 16 ; GFX6-NEXT: s_lshr_b32 s5, s5, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s6 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s6 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_lshl_b32 s2, s5, 16 +; GFX6-NEXT: s_or_b32 s2, s4, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_v6i16: @@ -2344,36 +2293,27 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14 ; GFX6-NEXT: v_min_u32_e32 v8, v6, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_u32_e32 v8, v7, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v8i16: @@ -2473,24 +2413,15 @@ ; GFX6-NEXT: s_cmp_lt_u32 s7, s8 ; GFX6-NEXT: s_cselect_b32 s8, s7, s8 ; GFX6-NEXT: s_sub_i32 s7, s7, s8 -; GFX6-NEXT: s_mov_b32 s8, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s8 -; GFX6-NEXT: s_and_b32 s0, s0, s8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s8 -; GFX6-NEXT: s_and_b32 s2, s3, s8 -; GFX6-NEXT: s_and_b32 s3, s5, s8 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_lshl_b32 s1, s3, 16 ; GFX6-NEXT: s_lshr_b32 s7, s7, 16 -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s8 -; GFX6-NEXT: s_and_b32 s4, s7, s8 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s6, s8 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_or_b32 s3, s3, s4 +; GFX6-NEXT: s_lshl_b32 s3, s7, 16 +; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_lshl_b32 s2, s5, 16 +; GFX6-NEXT: s_or_b32 s2, s4, s2 +; GFX6-NEXT: s_or_b32 s3, s6, s3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_v8i16: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll @@ -37,7 +37,6 @@ ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_mov_b32 s3, s2 ; GFX8-NEXT: s_and_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s2 ; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 @@ -118,21 +117,19 @@ ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_mov_b32 s4, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 -; GFX8-NEXT: s_lshr_b32 s5, s1, 16 ; GFX8-NEXT: s_and_b32 s2, s0, s4 -; GFX8-NEXT: s_and_b32 s0, s1, s4 -; GFX8-NEXT: s_and_b32 s1, s5, s4 ; GFX8-NEXT: s_mov_b32 s5, s4 -; GFX8-NEXT: s_and_b32 s3, s3, s4 -; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5] -; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_lshr_b32 s7, s1, 16 +; GFX8-NEXT: s_and_b32 s6, s1, s4 +; GFX8-NEXT: s_and_b64 s[0:1], s[2:3], s[4:5] +; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_and_b64 s[2:3], s[6:7], s[4:5] ; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] -; GFX8-NEXT: s_xor_b64 s[6:7], s[0:1], s[4:5] -; GFX8-NEXT: s_and_b32 s1, s2, s4 -; GFX8-NEXT: s_lshl_b32 s0, s3, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s7, 16 -; GFX8-NEXT: s_and_b32 s2, s6, s4 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_and_b32 s0, s0, s4 +; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_lshl_b32 s1, s3, 16 +; GFX8-NEXT: s_and_b32 s2, s2, s4 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog ;