diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -3327,9 +3327,9 @@ // FIXME: We should choose either a zext or a sext based on other constants // already around. def : Pat<(i32 (anyext i1:$in)), - (SELECT_I4 crbitrc:$in, (LI 1), (LI 0))>; + (SELECT_I4 $in, (LI 1), (LI 0))>; def : Pat<(i64 (anyext i1:$in)), - (SELECT_I8 crbitrc:$in, (LI8 1), (LI8 0))>; + (SELECT_I8 $in, (LI8 1), (LI8 0))>; // match setcc on i1 variables. // CRANDC is: @@ -3735,34 +3735,34 @@ multiclass FSetCCPat { defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETUGE)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_lt)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>; defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETGE)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_lt)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>; defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETULE)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_gt)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>; defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETLE)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_gt)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>; defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETUNE)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_eq)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>; defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETNE)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_eq)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>; defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETO)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_un)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_un)>; def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETOLT)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_lt)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>; def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETLT)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_lt)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>; def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETOGT)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_gt)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>; def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETGT)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_gt)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>; def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETOEQ)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_eq)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>; def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETEQ)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_eq)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>; def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETUO)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_un)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_un)>; } let Predicates = [HasFPU] in { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -669,10 +669,9 @@ ; GFX8-NEXT: v_mul_f32_e32 v3, v7, v5 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 ; GFX8-NEXT: v_div_fixup_f16 v1, v3, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fdiv_v2f16: @@ -762,10 +761,8 @@ ; GFX8-NEXT: v_rcp_f16_e32 v2, v1 ; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fdiv_v2f16_afn: @@ -894,10 +891,9 @@ ; GFX8-NEXT: v_mul_f32_e32 v3, v7, v5 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 ; GFX8-NEXT: v_div_fixup_f16 v1, v3, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fdiv_v2f16_ulp25: @@ -1051,9 +1047,8 @@ ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 ; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_rcp_v2f16: @@ -1202,9 +1197,8 @@ ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 ; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_rcp_v2f16_arcp: @@ -1285,11 +1279,9 @@ ; GFX8-LABEL: v_rcp_v2f16_arcp_afn: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_rcp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_rcp_f16_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_rcp_f16_e32 v1, v0 +; GFX8-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_rcp_v2f16_arcp_afn: @@ -1397,11 +1389,9 @@ ; GFX8-LABEL: v_rcp_v2f16_ulp25: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_rcp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_rcp_f16_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_rcp_f16_e32 v1, v0 +; GFX8-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_rcp_v2f16_ulp25: @@ -1457,10 +1447,8 @@ ; GFX8-NEXT: v_rcp_f16_e32 v2, v1 ; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25: @@ -1589,10 +1577,9 @@ ; GFX8-NEXT: v_mul_f32_e32 v3, v7, v5 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 ; GFX8-NEXT: v_div_fixup_f16 v1, v3, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fdiv_v2f16_arcp_ulp25: @@ -1682,10 +1669,8 @@ ; GFX8-NEXT: v_rcp_f16_e32 v2, v1 ; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll @@ -208,13 +208,11 @@ ; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_exp_f16_e32 v0, v0 +; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX8-NEXT: v_exp_f16_e32 v1, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_v2f16: @@ -226,14 +224,13 @@ ; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: v_exp_f16_e32 v1, v1 ; GFX9-NEXT: v_exp_f16_e32 v0, v0 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_v2f16: @@ -248,37 +245,39 @@ ; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX10-NEXT: v_exp_f16_e32 v1, v1 +; GFX10-NEXT: v_exp_f16_e32 v1, v2 ; GFX10-NEXT: v_exp_f16_e32 v0, v0 -; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: v_log_f16_e32 v0, v0 +; GFX11-NEXT: v_log_f16_e32 v2, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_log_f16_e32 v2, v2 +; GFX11-NEXT: v_log_f16_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v0, v0, v1 :: v_dual_mul_dx9_zero_f32 v1, v2, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3 ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: v_exp_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: v_exp_f16_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_exp_f16_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y) ret <2 x half> %pow @@ -319,13 +318,11 @@ ; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_exp_f16_e32 v0, v0 +; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX8-NEXT: v_exp_f16_e32 v1, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_v2f16_fneg_lhs: @@ -338,14 +335,13 @@ ; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: v_exp_f16_e32 v1, v1 ; GFX9-NEXT: v_exp_f16_e32 v0, v0 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_v2f16_fneg_lhs: @@ -361,11 +357,12 @@ ; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX10-NEXT: v_exp_f16_e32 v1, v1 +; GFX10-NEXT: v_exp_f16_e32 v1, v2 ; GFX10-NEXT: v_exp_f16_e32 v0, v0 -; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_v2f16_fneg_lhs: @@ -376,23 +373,25 @@ ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: v_log_f16_e32 v0, v0 +; GFX11-NEXT: v_log_f16_e32 v2, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_log_f16_e32 v2, v2 +; GFX11-NEXT: v_log_f16_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v0, v0, v1 :: v_dual_mul_dx9_zero_f32 v1, v2, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_exp_f16_e32 v0, v0 ; GFX11-NEXT: v_exp_f16_e32 v1, v1 +; GFX11-NEXT: v_exp_f16_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y) @@ -429,18 +428,16 @@ ; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_exp_f16_e32 v0, v0 -; GFX8-NEXT: v_exp_f16_e32 v1, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_exp_f16_e32 v1, v2 +; GFX8-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_v2f16_fneg_rhs: @@ -453,14 +450,13 @@ ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: v_exp_f16_e32 v1, v2 ; GFX9-NEXT: v_exp_f16_e32 v0, v0 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_v2f16_fneg_rhs: @@ -468,46 +464,48 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_log_f16_e32 v2, v0 -; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX10-NEXT: v_exp_f16_e32 v1, v1 +; GFX10-NEXT: v_exp_f16_e32 v1, v2 ; GFX10-NEXT: v_exp_f16_e32 v0, v0 -; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_v2f16_fneg_rhs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: v_log_f16_e32 v0, v0 +; GFX11-NEXT: v_log_f16_e32 v2, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_log_f16_e32 v2, v2 +; GFX11-NEXT: v_log_f16_e32 v0, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v0, v0, v1 :: v_dual_mul_dx9_zero_f32 v1, v2, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3 ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: v_exp_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: v_exp_f16_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_exp_f16_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %y.fneg = fneg <2 x half> %y %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y.fneg) @@ -550,18 +548,16 @@ ; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_exp_f16_e32 v0, v0 -; GFX8-NEXT: v_exp_f16_e32 v1, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_exp_f16_e32 v1, v2 +; GFX8-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_v2f16_fneg_lhs_rhs: @@ -575,14 +571,13 @@ ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: v_exp_f16_e32 v1, v2 ; GFX9-NEXT: v_exp_f16_e32 v0, v0 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_v2f16_fneg_lhs_rhs: @@ -599,11 +594,12 @@ ; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX10-NEXT: v_exp_f16_e32 v1, v1 +; GFX10-NEXT: v_exp_f16_e32 v1, v2 ; GFX10-NEXT: v_exp_f16_e32 v0, v0 -; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_v2f16_fneg_lhs_rhs: @@ -613,26 +609,27 @@ ; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: v_log_f16_e32 v0, v0 +; GFX11-NEXT: v_log_f16_e32 v2, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_log_f16_e32 v2, v2 +; GFX11-NEXT: v_log_f16_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v0, v0, v1 :: v_dual_mul_dx9_zero_f32 v1, v2, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3 ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: v_exp_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: v_exp_f16_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_exp_f16_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %y.fneg = fneg <2 x half> %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll @@ -532,12 +532,10 @@ ; CI-NEXT: v_fma_f32 v3, -v3, v6, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 -; CI-NEXT: v_bfe_u32 v0, v0, 0, 16 ; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v1 ; CI-NEXT: v_trunc_f32_e32 v3, v3 ; CI-NEXT: v_fma_f32 v1, -v3, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_bfe_u32 v1, v1, 0, 16 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v2, v0, v1 ; CI-NEXT: v_mov_b32_e32 v0, s4 @@ -573,9 +571,8 @@ ; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s1 ; VI-NEXT: v_trunc_f16_e32 v1, v1 ; VI-NEXT: v_fma_f16 v1, -v1, v2, s1 -; VI-NEXT: v_mov_b32_e32 v2, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_or_b32_e32 v2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -670,18 +667,14 @@ ; CI-NEXT: v_fma_f32 v5, -v5, v8, v6 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 -; CI-NEXT: v_bfe_u32 v1, v1, 0, 16 -; CI-NEXT: v_bfe_u32 v0, v0, 0, 16 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 -; CI-NEXT: v_bfe_u32 v1, v2, 0, 16 ; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v3 ; CI-NEXT: v_trunc_f32_e32 v5, v5 ; CI-NEXT: v_fma_f32 v3, -v5, v4, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_bfe_u32 v2, v3, 0, 16 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_or_b32_e32 v1, v1, v2 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; CI-NEXT: v_or_b32_e32 v1, v2, v1 ; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -723,6 +716,8 @@ ; VI-NEXT: v_trunc_f16_e32 v1, v1 ; VI-NEXT: v_fma_f16 v1, -v1, v2, s6 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: v_mul_f32_e32 v2, v2, v4 ; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; VI-NEXT: v_mov_b32_e32 v4, s9 @@ -735,11 +730,8 @@ ; VI-NEXT: v_div_fixup_f16 v3, v3, v4, s7 ; VI-NEXT: v_trunc_f16_e32 v3, v3 ; VI-NEXT: v_fma_f16 v3, -v3, v4, s7 -; VI-NEXT: v_mov_b32_e32 v4, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -41,7 +41,7 @@ ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_and_b32 s2, s2, 0x7f ; GFX8-NEXT: s_and_b32 s1, s1, 0x7f -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_lshr_b32 s1, s1, 1 @@ -72,7 +72,7 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7f ; GFX9-NEXT: s_and_b32 s1, s1, 0x7f -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_lshr_b32 s1, s1, 1 @@ -102,7 +102,7 @@ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX10-NEXT: s_and_b32 s2, s2, 0x7f ; GFX10-NEXT: s_and_b32 s1, s1, 0x7f -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -134,7 +134,7 @@ ; GFX11-NEXT: s_and_b32 s2, s2, 0x7f ; GFX11-NEXT: s_and_b32 s1, s1, 0x7f ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_lshr_b32 s1, s1, 1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -352,7 +352,7 @@ ; GFX8-LABEL: s_fshl_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_and_b32 s3, s2, 7 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 ; GFX8-NEXT: s_lshr_b32 s1, s1, 1 @@ -364,7 +364,7 @@ ; GFX9-LABEL: s_fshl_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_and_b32 s3, s2, 7 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 ; GFX9-NEXT: s_lshr_b32 s1, s1, 1 @@ -377,7 +377,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_and_b32 s3, s2, 7 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 ; GFX10-NEXT: s_lshl_b32 s0, s0, s3 @@ -389,7 +389,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_and_b32 s3, s2, 7 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 ; GFX11-NEXT: s_lshr_b32 s1, s1, 1 ; GFX11-NEXT: s_lshl_b32 s0, s0, s3 @@ -406,7 +406,7 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0 @@ -418,7 +418,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, 1 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 @@ -431,7 +431,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 1 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 @@ -444,7 +444,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX10-NEXT: v_not_b32_e32 v3, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 @@ -458,7 +458,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX11-NEXT: v_not_b32_e32 v3, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -485,7 +485,7 @@ ; GFX8-LABEL: s_fshl_i8_4: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, 4 ; GFX8-NEXT: s_lshr_b32 s1, s1, 4 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -494,7 +494,7 @@ ; GFX9-LABEL: s_fshl_i8_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshl_b32 s0, s0, 4 ; GFX9-NEXT: s_lshr_b32 s1, s1, 4 ; GFX9-NEXT: s_or_b32 s0, s0, s1 @@ -504,7 +504,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshr_b32 s1, s1, 4 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog @@ -513,7 +513,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_lshl_b32 s0, s0, 4 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b32 s1, s1, 4 ; GFX11-NEXT: s_or_b32 s0, s0, s1 @@ -584,7 +584,7 @@ ; GFX8-LABEL: s_fshl_i8_5: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, 5 ; GFX8-NEXT: s_lshr_b32 s1, s1, 3 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -593,7 +593,7 @@ ; GFX9-LABEL: s_fshl_i8_5: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshl_b32 s0, s0, 5 ; GFX9-NEXT: s_lshr_b32 s1, s1, 3 ; GFX9-NEXT: s_or_b32 s0, s0, s1 @@ -603,7 +603,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, 5 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshr_b32 s1, s1, 3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog @@ -612,7 +612,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b32 s1, s1, 3 ; GFX11-NEXT: s_or_b32 s0, s0, s1 @@ -700,7 +700,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s4, s1, 8 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshr_b32 s5, s2, 8 ; GFX8-NEXT: s_and_b32 s6, s2, 7 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 @@ -712,15 +712,14 @@ ; GFX8-NEXT: s_and_b32 s1, s5, 7 ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 ; GFX8-NEXT: s_and_b32 s3, s4, 0xff -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_andn2_b32 s2, 7, s5 ; GFX8-NEXT: s_lshr_b32 s3, s3, 1 ; GFX8-NEXT: s_lshr_b32 s2, s3, s2 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; @@ -728,7 +727,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s4, s1, 8 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshr_b32 s5, s2, 8 ; GFX9-NEXT: s_and_b32 s6, s2, 7 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 @@ -740,15 +739,14 @@ ; GFX9-NEXT: s_and_b32 s1, s5, 7 ; GFX9-NEXT: s_lshl_b32 s1, s3, s1 ; GFX9-NEXT: s_and_b32 s3, s4, 0xff -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_andn2_b32 s2, 7, s5 ; GFX9-NEXT: s_lshr_b32 s3, s3, 1 ; GFX9-NEXT: s_lshr_b32 s2, s3, s2 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff -; GFX9-NEXT: s_lshl_b32 s1, s1, s2 +; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; @@ -759,9 +757,9 @@ ; GFX10-NEXT: s_and_b32 s4, s4, 0xff ; GFX10-NEXT: s_and_b32 s6, s2, 7 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshl_b32 s0, s0, s6 ; GFX10-NEXT: s_and_b32 s6, s5, 7 ; GFX10-NEXT: s_andn2_b32 s5, 7, s5 @@ -774,9 +772,8 @@ ; GFX10-NEXT: s_or_b32 s2, s3, s4 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_and_b32 s1, s2, 0xff -; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 8 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; @@ -787,9 +784,9 @@ ; GFX11-NEXT: s_and_b32 s4, s4, 0xff ; GFX11-NEXT: s_and_b32 s6, s2, 7 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_lshl_b32 s0, s0, s6 ; GFX11-NEXT: s_and_b32 s6, s5, 7 ; GFX11-NEXT: s_and_not1_b32 s5, 7, s5 @@ -802,9 +799,8 @@ ; GFX11-NEXT: s_or_b32 s2, s3, s4 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_and_b32 s1, s2, 0xff -; GFX11-NEXT: s_bfe_u32 s2, 8, 0x100000 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s1, s2 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -822,7 +818,7 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 8, v2 ; GFX6-NEXT: v_and_b32_e32 v5, 7, v2 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v5, v0 @@ -830,7 +826,7 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v4 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX6-NEXT: v_not_b32_e32 v4, v4 ; GFX6-NEXT: v_bfe_u32 v1, v1, 8, 8 ; GFX6-NEXT: v_and_b32_e32 v4, 7, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 @@ -849,7 +845,7 @@ ; GFX8-NEXT: v_and_b32_e32 v6, 7, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v6, v0 ; GFX8-NEXT: v_mov_b32_e32 v6, 1 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 @@ -858,7 +854,7 @@ ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 7, v5 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v5 +; GFX8-NEXT: v_not_b32_e32 v2, v5 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, v1, v3 ; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -874,7 +870,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 @@ -884,7 +880,7 @@ ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 7, v5 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v5 +; GFX9-NEXT: v_not_b32_e32 v2, v5 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, v1, v3 ; GFX9-NEXT: v_lshrrev_b16_sdwa v3, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -902,9 +898,9 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX10-NEXT: v_not_b32_e32 v7, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 @@ -930,9 +926,9 @@ ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX11-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX11-NEXT: v_not_b32_e32 v7, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_xor_b32_e32 v6, -1, v3 +; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 @@ -1016,7 +1012,7 @@ ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_lshr_b32 s8, s1, 24 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshr_b32 s9, s2, 8 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 ; GFX8-NEXT: s_lshr_b32 s11, s2, 24 @@ -1032,7 +1028,7 @@ ; GFX8-NEXT: s_and_b32 s1, s9, 7 ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 ; GFX8-NEXT: s_and_b32 s3, s6, 0xff -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_andn2_b32 s2, 7, s9 ; GFX8-NEXT: s_lshr_b32 s3, s3, 1 ; GFX8-NEXT: s_lshr_b32 s2, s3, s2 @@ -1040,7 +1036,7 @@ ; GFX8-NEXT: s_and_b32 s2, s10, 7 ; GFX8-NEXT: s_lshl_b32 s2, s4, s2 ; GFX8-NEXT: s_and_b32 s4, s7, 0xff -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_andn2_b32 s3, 7, s10 ; GFX8-NEXT: s_lshr_b32 s4, s4, 1 ; GFX8-NEXT: s_lshr_b32 s3, s4, s3 @@ -1069,7 +1065,7 @@ ; GFX9-NEXT: s_lshr_b32 s7, s1, 16 ; GFX9-NEXT: s_lshr_b32 s8, s1, 24 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshr_b32 s9, s2, 8 ; GFX9-NEXT: s_lshr_b32 s10, s2, 16 ; GFX9-NEXT: s_lshr_b32 s11, s2, 24 @@ -1085,7 +1081,7 @@ ; GFX9-NEXT: s_and_b32 s1, s9, 7 ; GFX9-NEXT: s_lshl_b32 s1, s3, s1 ; GFX9-NEXT: s_and_b32 s3, s6, 0xff -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_andn2_b32 s2, 7, s9 ; GFX9-NEXT: s_lshr_b32 s3, s3, 1 ; GFX9-NEXT: s_lshr_b32 s2, s3, s2 @@ -1093,7 +1089,7 @@ ; GFX9-NEXT: s_and_b32 s2, s10, 7 ; GFX9-NEXT: s_lshl_b32 s2, s4, s2 ; GFX9-NEXT: s_and_b32 s4, s7, 0xff -; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX9-NEXT: s_andn2_b32 s3, 7, s10 ; GFX9-NEXT: s_lshr_b32 s4, s4, 1 ; GFX9-NEXT: s_lshr_b32 s3, s4, s3 @@ -1123,7 +1119,7 @@ ; GFX10-NEXT: s_lshr_b32 s8, s1, 24 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_lshr_b32 s9, s2, 8 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshr_b32 s10, s2, 16 ; GFX10-NEXT: s_lshr_b32 s11, s2, 24 ; GFX10-NEXT: s_and_b32 s12, s2, 7 @@ -1133,7 +1129,7 @@ ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_and_b32 s2, s6, 0xff ; GFX10-NEXT: s_and_b32 s6, s9, 7 -; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX10-NEXT: s_andn2_b32 s9, 7, s9 ; GFX10-NEXT: s_lshr_b32 s2, s2, 1 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 @@ -1145,7 +1141,7 @@ ; GFX10-NEXT: s_or_b32 s1, s3, s2 ; GFX10-NEXT: s_and_b32 s2, s7, 0xff ; GFX10-NEXT: s_and_b32 s3, s10, 7 -; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX10-NEXT: s_andn2_b32 s6, 7, s10 ; GFX10-NEXT: s_lshr_b32 s2, s2, 1 ; GFX10-NEXT: s_lshl_b32 s3, s4, s3 @@ -1176,7 +1172,7 @@ ; GFX11-NEXT: s_lshr_b32 s8, s1, 24 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_lshr_b32 s9, s2, 8 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_lshr_b32 s10, s2, 16 ; GFX11-NEXT: s_lshr_b32 s11, s2, 24 ; GFX11-NEXT: s_and_b32 s12, s2, 7 @@ -1186,7 +1182,7 @@ ; GFX11-NEXT: s_lshr_b32 s1, s1, s2 ; GFX11-NEXT: s_and_b32 s2, s6, 0xff ; GFX11-NEXT: s_and_b32 s6, s9, 7 -; GFX11-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_and_not1_b32 s9, 7, s9 ; GFX11-NEXT: s_lshr_b32 s2, s2, 1 ; GFX11-NEXT: s_lshr_b32 s4, s0, 16 @@ -1198,7 +1194,7 @@ ; GFX11-NEXT: s_or_b32 s1, s3, s2 ; GFX11-NEXT: s_and_b32 s2, s7, 0xff ; GFX11-NEXT: s_and_b32 s3, s10, 7 -; GFX11-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_and_not1_b32 s6, 7, s10 ; GFX11-NEXT: s_lshr_b32 s2, s2, 1 ; GFX11-NEXT: s_lshl_b32 s3, s4, s3 @@ -1238,7 +1234,7 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v8, 24, v2 ; GFX6-NEXT: v_and_b32_e32 v9, 7, v2 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 24, v0 @@ -1248,7 +1244,7 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v9 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v6 -; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX6-NEXT: v_not_b32_e32 v6, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_bfe_u32 v3, v1, 8, 8 ; GFX6-NEXT: v_and_b32_e32 v6, 7, v6 @@ -1256,13 +1252,13 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 7, v7 -; GFX6-NEXT: v_xor_b32_e32 v6, -1, v7 +; GFX6-NEXT: v_not_b32_e32 v6, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_bfe_u32 v4, v1, 16, 8 ; GFX6-NEXT: v_and_b32_e32 v6, 7, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4 -; GFX6-NEXT: v_xor_b32_e32 v6, -1, v8 +; GFX6-NEXT: v_not_b32_e32 v6, v8 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 7, v8 ; GFX6-NEXT: v_and_b32_e32 v6, 7, v6 @@ -1289,7 +1285,7 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX8-NEXT: v_and_b32_e32 v8, 7, v2 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_mov_b32_e32 v10, 1 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshrrev_b16_sdwa v11, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1298,7 +1294,7 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v8, v2 ; GFX8-NEXT: v_and_b32_e32 v8, 7, v5 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1307,7 +1303,7 @@ ; GFX8-NEXT: v_lshrrev_b16_e32 v4, v5, v4 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 7, v6 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v6 +; GFX8-NEXT: v_not_b32_e32 v5, v6 ; GFX8-NEXT: v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v6 @@ -1315,7 +1311,7 @@ ; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v6 ; GFX8-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX8-NEXT: v_and_b32_e32 v5, 7, v7 -; GFX8-NEXT: v_xor_b32_e32 v6, -1, v7 +; GFX8-NEXT: v_not_b32_e32 v6, v7 ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_mov_b32_e32 v5, 1 ; GFX8-NEXT: v_and_b32_e32 v6, 7, v6 @@ -1340,7 +1336,7 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX9-NEXT: v_and_b32_e32 v8, 7, v2 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: s_mov_b32 s5, 1 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshrrev_b16_sdwa v10, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1349,7 +1345,7 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX9-NEXT: v_or_b32_e32 v2, v8, v2 ; GFX9-NEXT: v_and_b32_e32 v8, 7, v5 -; GFX9-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX9-NEXT: v_not_b32_e32 v5, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX9-NEXT: v_lshrrev_b16_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1358,7 +1354,7 @@ ; GFX9-NEXT: v_lshrrev_b16_e32 v4, v5, v4 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_and_b32_e32 v4, 7, v6 -; GFX9-NEXT: v_xor_b32_e32 v5, -1, v6 +; GFX9-NEXT: v_not_b32_e32 v5, v6 ; GFX9-NEXT: v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX9-NEXT: v_lshrrev_b16_e32 v6, 1, v6 @@ -1366,7 +1362,7 @@ ; GFX9-NEXT: v_lshrrev_b16_e32 v5, v5, v6 ; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX9-NEXT: v_and_b32_e32 v5, 7, v7 -; GFX9-NEXT: v_xor_b32_e32 v6, -1, v7 +; GFX9-NEXT: v_not_b32_e32 v6, v7 ; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_mov_b32_e32 v5, 1 ; GFX9-NEXT: v_and_b32_e32 v6, 7, v6 @@ -1394,20 +1390,20 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v2 +; GFX10-NEXT: v_not_b32_e32 v9, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_lshlrev_b16 v0, v10, v0 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v8 +; GFX10-NEXT: v_not_b32_e32 v10, v8 ; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_mov_b32_e32 v13, 0xff ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX10-NEXT: v_and_b32_e32 v12, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX10-NEXT: v_lshlrev_b16 v3, v8, v3 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v11 +; GFX10-NEXT: v_not_b32_e32 v8, v11 ; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_xor_b32_e32 v13, -1, v2 +; GFX10-NEXT: v_not_b32_e32 v13, v2 ; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX10-NEXT: v_lshrrev_b16 v6, 1, v6 ; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 @@ -1448,7 +1444,7 @@ ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-NEXT: v_xor_b32_e32 v13, -1, v9 +; GFX11-NEXT: v_not_b32_e32 v13, v9 ; GFX11-NEXT: v_lshrrev_b32_e32 v11, 24, v2 ; GFX11-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 24, v1 @@ -1456,13 +1452,13 @@ ; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 ; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX11-NEXT: v_lshlrev_b16 v3, v9, v3 -; GFX11-NEXT: v_xor_b32_e32 v9, -1, v10 +; GFX11-NEXT: v_not_b32_e32 v9, v10 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: v_lshrrev_b16 v6, v13, v6 -; GFX11-NEXT: v_xor_b32_e32 v13, -1, v11 +; GFX11-NEXT: v_not_b32_e32 v13, v11 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX11-NEXT: v_and_b32_e32 v12, 7, v2 -; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX11-NEXT: v_not_b32_e32 v2, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX11-NEXT: v_and_b32_e32 v9, 7, v9 @@ -1834,10 +1830,10 @@ ; GFX6-NEXT: s_lshl_b32 s0, s0, 8 ; GFX6-NEXT: s_and_b32 s6, s6, 0xff ; GFX6-NEXT: s_or_b32 s0, s9, s0 -; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: s_lshr_b32 s8, s1, 8 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -1845,8 +1841,8 @@ ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_and_b32 s6, s8, 0xff ; GFX6-NEXT: s_or_b32 s1, s7, s1 -; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX6-NEXT: s_or_b32 s1, s1, s6 @@ -1858,9 +1854,9 @@ ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 ; GFX6-NEXT: s_and_b32 s6, s6, 0xff ; GFX6-NEXT: s_or_b32 s2, s9, s2 -; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX6-NEXT: s_lshr_b32 s8, s3, 8 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: s_and_b32 s3, s3, 0xff ; GFX6-NEXT: s_or_b32 s2, s2, s6 @@ -1868,8 +1864,8 @@ ; GFX6-NEXT: s_and_b32 s6, s8, 0xff ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: s_or_b32 s3, s7, s3 -; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX6-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s6 ; GFX6-NEXT: s_lshr_b32 s6, s4, 16 @@ -1881,9 +1877,9 @@ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 ; GFX6-NEXT: s_or_b32 s4, s9, s4 -; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: s_or_b32 s4, s4, s6 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 @@ -1901,9 +1897,9 @@ ; GFX6-NEXT: s_and_b32 s6, s8, 0xff ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: s_or_b32 s5, s7, s5 -; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 -; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX6-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: s_or_b32 s5, s5, s6 @@ -1954,26 +1950,25 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s6, s0, 8 ; GFX8-NEXT: s_and_b32 s6, s6, 0xff -; GFX8-NEXT: s_bfe_u32 s10, 8, 0x100000 ; GFX8-NEXT: s_lshr_b32 s7, s0, 16 ; GFX8-NEXT: s_lshr_b32 s8, s0, 24 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s6, s6, s10 +; GFX8-NEXT: s_lshl_b32 s6, s6, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s6 ; GFX8-NEXT: s_and_b32 s6, s7, 0xff ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_lshr_b32 s9, s1, 8 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s6 -; GFX8-NEXT: s_lshl_b32 s1, s1, s10 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_and_b32 s6, s9, 0xff ; GFX8-NEXT: s_or_b32 s1, s8, s1 -; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: s_or_b32 s1, s1, s6 @@ -1983,22 +1978,22 @@ ; GFX8-NEXT: s_lshr_b32 s7, s2, 16 ; GFX8-NEXT: s_lshr_b32 s8, s2, 24 ; GFX8-NEXT: s_and_b32 s2, s2, 0xff -; GFX8-NEXT: s_lshl_b32 s6, s6, s10 +; GFX8-NEXT: s_lshl_b32 s6, s6, 8 ; GFX8-NEXT: s_or_b32 s2, s2, s6 ; GFX8-NEXT: s_and_b32 s6, s7, 0xff ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 -; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1 ; GFX8-NEXT: s_lshr_b32 s9, s3, 8 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: s_and_b32 s3, s3, 0xff ; GFX8-NEXT: s_or_b32 s2, s2, s6 -; GFX8-NEXT: s_lshl_b32 s3, s3, s10 +; GFX8-NEXT: s_lshl_b32 s3, s3, 8 ; GFX8-NEXT: s_and_b32 s6, s9, 0xff ; GFX8-NEXT: s_or_b32 s3, s8, s3 -; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: s_or_b32 s3, s3, s6 @@ -2007,14 +2002,14 @@ ; GFX8-NEXT: s_lshr_b32 s7, s4, 16 ; GFX8-NEXT: s_lshr_b32 s8, s4, 24 ; GFX8-NEXT: s_and_b32 s4, s4, 0xff -; GFX8-NEXT: s_lshl_b32 s6, s6, s10 +; GFX8-NEXT: s_lshl_b32 s6, s6, 8 ; GFX8-NEXT: s_or_b32 s4, s4, s6 ; GFX8-NEXT: s_and_b32 s6, s7, 0xff ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 -; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: s_or_b32 s4, s4, s6 ; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 @@ -2024,7 +2019,7 @@ ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX8-NEXT: s_and_b32 s5, s5, 0xff ; GFX8-NEXT: v_mul_lo_u32 v1, v2, v1 -; GFX8-NEXT: s_lshl_b32 s5, s5, s10 +; GFX8-NEXT: s_lshl_b32 s5, s5, 8 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 @@ -2032,9 +2027,9 @@ ; GFX8-NEXT: s_and_b32 s6, s9, 0xff ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: s_or_b32 s5, s8, s5 -; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 -; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: s_or_b32 s5, s5, s6 @@ -2084,28 +2079,27 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_lshr_b32 s7, s0, 8 ; GFX9-NEXT: s_and_b32 s7, s7, 0xff -; GFX9-NEXT: s_bfe_u32 s12, 8, 0x100000 +; GFX9-NEXT: s_lshr_b32 s9, s0, 16 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_lshr_b32 s9, s0, 16 ; GFX9-NEXT: s_lshr_b32 s10, s0, 24 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s7, s12 +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s7 ; GFX9-NEXT: s_and_b32 s7, s9, 0xff -; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX9-NEXT: s_lshr_b32 s11, s1, 8 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: v_mul_lo_u32 v2, v0, v1 ; GFX9-NEXT: s_or_b32 s0, s0, s7 -; GFX9-NEXT: s_lshl_b32 s1, s1, s12 +; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_and_b32 s7, s11, 0xff ; GFX9-NEXT: s_or_b32 s1, s10, s1 -; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s1, s1, s7 ; GFX9-NEXT: s_lshr_b32 s7, s2, 8 @@ -2114,23 +2108,23 @@ ; GFX9-NEXT: s_lshr_b32 s9, s2, 16 ; GFX9-NEXT: s_lshr_b32 s10, s2, 24 ; GFX9-NEXT: s_and_b32 s2, s2, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s7, s12 +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 ; GFX9-NEXT: s_or_b32 s2, s2, s7 ; GFX9-NEXT: s_and_b32 s7, s9, 0xff -; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 ; GFX9-NEXT: s_lshr_b32 s11, s3, 8 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_and_b32 s3, s3, 0xff ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX9-NEXT: s_or_b32 s2, s2, s7 -; GFX9-NEXT: s_lshl_b32 s3, s3, s12 +; GFX9-NEXT: s_lshl_b32 s3, s3, 8 ; GFX9-NEXT: s_and_b32 s7, s11, 0xff ; GFX9-NEXT: s_or_b32 s3, s10, s3 -; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s3, s3, s7 ; GFX9-NEXT: s_lshr_b32 s7, s4, 8 @@ -2140,11 +2134,11 @@ ; GFX9-NEXT: s_lshr_b32 s9, s4, 16 ; GFX9-NEXT: s_lshr_b32 s10, s4, 24 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s7, s12 +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s7 ; GFX9-NEXT: s_and_b32 s7, s9, 0xff -; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX9-NEXT: s_or_b32 s4, s4, s7 @@ -2152,12 +2146,12 @@ ; GFX9-NEXT: s_lshr_b32 s11, s5, 8 ; GFX9-NEXT: s_and_b32 s5, s5, 0xff ; GFX9-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX9-NEXT: s_lshl_b32 s5, s5, s12 +; GFX9-NEXT: s_lshl_b32 s5, s5, 8 ; GFX9-NEXT: s_and_b32 s7, s11, 0xff ; GFX9-NEXT: s_or_b32 s5, s10, s5 -; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX9-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX9-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s5, s5, s7 ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 @@ -2214,61 +2208,61 @@ ; GFX10-NEXT: s_and_b32 s6, s6, 0xff ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX10-NEXT: s_bfe_u32 s10, 8, 0x100000 ; GFX10-NEXT: s_lshr_b32 s8, s0, 24 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: s_lshl_b32 s6, s6, s10 +; GFX10-NEXT: s_lshl_b32 s6, s6, 8 ; GFX10-NEXT: s_and_b32 s7, s7, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s6 -; GFX10-NEXT: s_bfe_u32 s6, s7, 0x100000 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s7 ; GFX10-NEXT: s_lshr_b32 s7, s4, 8 +; GFX10-NEXT: s_lshr_b32 s10, s4, 16 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: s_and_b32 s7, s7, 0xff -; GFX10-NEXT: s_lshr_b32 s11, s4, 16 -; GFX10-NEXT: s_lshr_b32 s12, s4, 24 +; GFX10-NEXT: s_lshr_b32 s11, s4, 24 +; GFX10-NEXT: s_and_b32 s4, s4, 0xff ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: s_and_b32 s4, s4, 0xff -; GFX10-NEXT: s_lshl_b32 s7, s7, s10 -; GFX10-NEXT: s_lshr_b32 s13, s5, 8 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 +; GFX10-NEXT: s_lshr_b32 s12, s5, 8 +; GFX10-NEXT: s_or_b32 s4, s4, s7 ; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 -; GFX10-NEXT: s_or_b32 s4, s4, s7 -; GFX10-NEXT: s_and_b32 s7, s11, 0xff -; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX10-NEXT: s_and_b32 s7, s10, 0xff +; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX10-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX10-NEXT: s_and_b32 s5, s5, 0xff ; GFX10-NEXT: s_lshl_b32 s7, s7, 16 +; GFX10-NEXT: s_lshl_b32 s5, s5, 8 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_lshl_b32 s5, s5, s10 ; GFX10-NEXT: s_or_b32 s4, s4, s7 -; GFX10-NEXT: s_and_b32 s7, s13, 0xff -; GFX10-NEXT: s_or_b32 s5, s12, s5 -; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX10-NEXT: s_and_b32 s7, s12, 0xff +; GFX10-NEXT: s_or_b32 s5, s11, s5 +; GFX10-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX10-NEXT: s_lshl_b32 s7, s7, 16 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: s_lshl_b32 s7, s7, 16 -; GFX10-NEXT: s_lshr_b32 s9, s1, 8 ; GFX10-NEXT: s_or_b32 s5, s5, s7 +; GFX10-NEXT: s_lshr_b32 s9, s1, 8 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_lshl_b32 s1, s1, 8 ; GFX10-NEXT: s_and_b32 s7, s9, 0xff -; GFX10-NEXT: s_lshl_b32 s1, s1, s10 -; GFX10-NEXT: s_lshr_b32 s9, s2, 16 ; GFX10-NEXT: s_or_b32 s1, s8, s1 ; GFX10-NEXT: s_lshr_b32 s8, s2, 8 +; GFX10-NEXT: s_lshr_b32 s9, s2, 16 +; GFX10-NEXT: s_and_b32 s8, s8, 0xff ; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX10-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX10-NEXT: s_and_b32 s8, s8, 0xff -; GFX10-NEXT: s_lshr_b32 s11, s2, 24 +; GFX10-NEXT: s_lshr_b32 s10, s2, 24 ; GFX10-NEXT: s_and_b32 s2, s2, 0xff -; GFX10-NEXT: s_lshl_b32 s8, s8, s10 -; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX10-NEXT: s_lshl_b32 s8, s8, 8 +; GFX10-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX10-NEXT: s_or_b32 s2, s2, s8 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s5, v1 ; GFX10-NEXT: s_lshr_b32 s4, s3, 8 @@ -2277,20 +2271,20 @@ ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 -; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX10-NEXT: s_lshl_b32 s3, s3, s10 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX10-NEXT: s_lshl_b32 s5, s5, 16 -; GFX10-NEXT: s_or_b32 s3, s11, s3 +; GFX10-NEXT: s_or_b32 s3, s10, s3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX10-NEXT: s_or_b32 s2, s2, s5 -; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 @@ -2300,17 +2294,16 @@ ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 23, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX10-NEXT: s_lshl_b32 s6, s6, 16 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_lshl_b32 s7, s7, 16 +; GFX10-NEXT: s_or_b32 s0, s0, s6 ; GFX10-NEXT: v_lshrrev_b32_e64 v2, v2, s2 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX10-NEXT: s_lshr_b32 s2, s3, 1 -; GFX10-NEXT: s_or_b32 s0, s0, s6 ; GFX10-NEXT: s_or_b32 s1, s1, s7 ; GFX10-NEXT: v_lshl_or_b32 v0, s0, v0, v2 ; GFX10-NEXT: v_lshrrev_b32_e64 v3, v3, s2 @@ -2335,95 +2328,94 @@ ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 ; GFX11-NEXT: s_lshr_b32 s6, s0, 8 -; GFX11-NEXT: s_bfe_u32 s9, 8, 0x100000 +; GFX11-NEXT: s_lshr_b32 s7, s0, 16 ; GFX11-NEXT: s_and_b32 s6, s6, 0xff ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX11-NEXT: s_lshr_b32 s7, s0, 16 ; GFX11-NEXT: s_lshr_b32 s8, s0, 24 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_lshl_b32 s6, s6, s9 -; GFX11-NEXT: s_lshr_b32 s11, s4, 24 +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-NEXT: s_lshr_b32 s10, s4, 24 ; GFX11-NEXT: s_or_b32 s0, s0, s6 ; GFX11-NEXT: s_and_b32 s6, s7, 0xff -; GFX11-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_dual_mul_f32 v0, 0x4f7ffffe, v0 :: v_dual_mul_f32 v1, 0x4f7ffffe, v1 -; GFX11-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX11-NEXT: s_lshr_b32 s7, s4, 16 ; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_lshr_b32 s7, s4, 16 +; GFX11-NEXT: s_or_b32 s0, s0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX11-NEXT: s_or_b32 s0, s0, s6 ; GFX11-NEXT: s_lshr_b32 s6, s4, 8 ; GFX11-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-NEXT: s_and_b32 s6, s6, 0xff ; GFX11-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 ; GFX11-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 -; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 ; GFX11-NEXT: s_and_b32 s7, s7, 0xff -; GFX11-NEXT: s_lshl_b32 s6, s6, s9 -; GFX11-NEXT: s_lshr_b32 s12, s5, 8 ; GFX11-NEXT: s_or_b32 s4, s4, s6 -; GFX11-NEXT: s_bfe_u32 s6, s7, 0x100000 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s7 +; GFX11-NEXT: s_lshr_b32 s11, s5, 8 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX11-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX11-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX11-NEXT: s_lshl_b32 s6, s6, 16 ; GFX11-NEXT: s_and_b32 s5, s5, 0xff ; GFX11-NEXT: s_or_b32 s4, s4, s6 -; GFX11-NEXT: s_lshl_b32 s5, s5, s9 -; GFX11-NEXT: s_and_b32 s6, s12, 0xff -; GFX11-NEXT: s_or_b32 s5, s11, s5 +; GFX11-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-NEXT: s_and_b32 s6, s11, 0xff +; GFX11-NEXT: s_or_b32 s5, s10, s5 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX11-NEXT: v_mul_hi_u32 v2, v1, v3 -; GFX11-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX11-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_lshr_b32 s9, s1, 8 ; GFX11-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX11-NEXT: s_or_b32 s5, s5, s6 -; GFX11-NEXT: s_lshr_b32 s10, s1, 8 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_and_b32 s7, s9, 0xff ; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX11-NEXT: s_and_b32 s7, s10, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s1, s9 -; GFX11-NEXT: s_bfe_u32 s6, s7, 0x100000 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s7 +; GFX11-NEXT: s_lshr_b32 s7, s2, 8 ; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX11-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX11-NEXT: s_lshr_b32 s7, s2, 8 ; GFX11-NEXT: s_or_b32 s1, s8, s1 ; GFX11-NEXT: s_lshr_b32 s8, s2, 16 ; GFX11-NEXT: s_and_b32 s7, s7, 0xff -; GFX11-NEXT: s_lshr_b32 s10, s3, 8 -; GFX11-NEXT: s_lshl_b32 s7, s7, s9 +; GFX11-NEXT: s_lshr_b32 s9, s3, 8 +; GFX11-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff ; GFX11-NEXT: v_sub_nc_u32_e32 v0, s4, v0 ; GFX11-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX11-NEXT: s_lshr_b32 s4, s2, 24 ; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 ; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX11-NEXT: s_or_b32 s2, s2, s7 -; GFX11-NEXT: s_lshl_b32 s3, s3, s9 +; GFX11-NEXT: s_or_b32 s3, s4, s3 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, s5, v1 ; GFX11-NEXT: s_and_b32 s5, s8, 0xff ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX11-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX11-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 ; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 ; GFX11-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-NEXT: s_or_b32 s3, s4, s3 +; GFX11-NEXT: s_and_b32 s4, s9, 0xff ; GFX11-NEXT: s_or_b32 s2, s2, s5 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX11-NEXT: s_and_b32 s4, s10, 0xff ; GFX11-NEXT: s_lshr_b32 s2, s2, 1 -; GFX11-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX11-NEXT: s_lshl_b32 s6, s6, 16 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v0 @@ -2433,7 +2425,7 @@ ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v1 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX11-NEXT: v_lshrrev_b32_e64 v2, v2, s2 -; GFX11-NEXT: s_bfe_u32 s2, s3, 0x100000 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s3 ; GFX11-NEXT: s_lshl_b32 s3, s4, 16 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX11-NEXT: s_or_b32 s2, s2, s3 @@ -2892,7 +2884,7 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_alignbit_b32 v1, v0, v1, 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -2901,7 +2893,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_alignbit_b32 v1, v0, v1, 1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2910,7 +2902,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_alignbit_b32 v1, v0, v1, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: v_alignbit_b32 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2920,7 +2912,7 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_alignbit_b32 v1, v0, v1, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2930,7 +2922,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_alignbit_b32 v1, v0, v1, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX11-NEXT: v_not_b32_e32 v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_alignbit_b32 v0, v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3016,7 +3008,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_alignbit_b32 v1, s0, v1, 1 ; GFX6-NEXT: s_lshr_b32 s0, s0, 1 -; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX6-NEXT: v_not_b32_e32 v0, v0 ; GFX6-NEXT: v_alignbit_b32 v0, s0, v1, v0 ; GFX6-NEXT: ; return to shader part epilog ; @@ -3025,7 +3017,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_alignbit_b32 v1, s0, v1, 1 ; GFX8-NEXT: s_lshr_b32 s0, s0, 1 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_alignbit_b32 v0, s0, v1, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -3034,14 +3026,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, 1 ; GFX9-NEXT: s_lshr_b32 s0, s0, 1 -; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: v_alignbit_b32 v0, s0, v1, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshl_i32_ssv: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_alignbit_b32 v1, s0, s1, 1 -; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX10-NEXT: v_not_b32_e32 v0, v0 ; GFX10-NEXT: s_lshr_b32 s0, s0, 1 ; GFX10-NEXT: v_alignbit_b32 v0, s0, v1, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -3049,7 +3041,7 @@ ; GFX11-LABEL: v_fshl_i32_ssv: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_alignbit_b32 v1, s0, s1, 1 -; GFX11-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX11-NEXT: v_not_b32_e32 v0, v0 ; GFX11-NEXT: s_lshr_b32 s0, s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_alignbit_b32 v0, s0, v1, v0 @@ -3166,11 +3158,11 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_alignbit_b32 v2, v0, v2, 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX6-NEXT: v_not_b32_e32 v4, v4 ; GFX6-NEXT: v_alignbit_b32 v0, v0, v2, v4 ; GFX6-NEXT: v_alignbit_b32 v2, v1, v3, 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX6-NEXT: v_not_b32_e32 v3, v5 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v2, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -3179,11 +3171,11 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_alignbit_b32 v2, v0, v2, 1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, v4 ; GFX8-NEXT: v_alignbit_b32 v2, v1, v3, 1 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX8-NEXT: v_not_b32_e32 v3, v5 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -3192,11 +3184,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_alignbit_b32 v2, v0, v2, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v4 ; GFX9-NEXT: v_alignbit_b32 v0, v0, v2, v4 ; GFX9-NEXT: v_alignbit_b32 v2, v1, v3, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX9-NEXT: v_not_b32_e32 v3, v5 ; GFX9-NEXT: v_alignbit_b32 v1, v1, v2, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3206,10 +3198,10 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_alignbit_b32 v2, v0, v2, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: v_alignbit_b32 v3, v1, v3, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -3220,10 +3212,10 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_alignbit_b32 v2, v0, v2, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: v_alignbit_b32 v3, v1, v3, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX11-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_alignbit_b32 v0, v0, v2, v4 ; GFX11-NEXT: v_alignbit_b32 v1, v1, v3, v5 @@ -3238,15 +3230,15 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_alignbit_b32 v3, v0, v3, 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX6-NEXT: v_not_b32_e32 v6, v6 ; GFX6-NEXT: v_alignbit_b32 v0, v0, v3, v6 ; GFX6-NEXT: v_alignbit_b32 v3, v1, v4, 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v7 +; GFX6-NEXT: v_not_b32_e32 v4, v7 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v3, v4 ; GFX6-NEXT: v_alignbit_b32 v3, v2, v5, 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v8 +; GFX6-NEXT: v_not_b32_e32 v4, v8 ; GFX6-NEXT: v_alignbit_b32 v2, v2, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -3255,15 +3247,15 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_alignbit_b32 v3, v0, v3, 1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX8-NEXT: v_not_b32_e32 v6, v6 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, v6 ; GFX8-NEXT: v_alignbit_b32 v3, v1, v4, 1 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v7 +; GFX8-NEXT: v_not_b32_e32 v4, v7 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v3, v4 ; GFX8-NEXT: v_alignbit_b32 v3, v2, v5, 1 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v8 +; GFX8-NEXT: v_not_b32_e32 v4, v8 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v3, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -3272,15 +3264,15 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_alignbit_b32 v3, v0, v3, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX9-NEXT: v_not_b32_e32 v6, v6 ; GFX9-NEXT: v_alignbit_b32 v0, v0, v3, v6 ; GFX9-NEXT: v_alignbit_b32 v3, v1, v4, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v7 +; GFX9-NEXT: v_not_b32_e32 v4, v7 ; GFX9-NEXT: v_alignbit_b32 v1, v1, v3, v4 ; GFX9-NEXT: v_alignbit_b32 v3, v2, v5, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v8 +; GFX9-NEXT: v_not_b32_e32 v4, v8 ; GFX9-NEXT: v_alignbit_b32 v2, v2, v3, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3290,13 +3282,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_alignbit_b32 v3, v0, v3, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX10-NEXT: v_not_b32_e32 v6, v6 ; GFX10-NEXT: v_alignbit_b32 v4, v1, v4, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX10-NEXT: v_not_b32_e32 v7, v7 ; GFX10-NEXT: v_alignbit_b32 v5, v2, v5, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX10-NEXT: v_not_b32_e32 v8, v8 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v3, v6 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v4, v7 ; GFX10-NEXT: v_alignbit_b32 v2, v2, v5, v8 @@ -3308,13 +3300,13 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_alignbit_b32 v3, v0, v3, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX11-NEXT: v_not_b32_e32 v6, v6 ; GFX11-NEXT: v_alignbit_b32 v4, v1, v4, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX11-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX11-NEXT: v_not_b32_e32 v7, v7 ; GFX11-NEXT: v_alignbit_b32 v5, v2, v5, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX11-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX11-NEXT: v_not_b32_e32 v8, v8 ; GFX11-NEXT: v_alignbit_b32 v0, v0, v3, v6 ; GFX11-NEXT: v_alignbit_b32 v1, v1, v4, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) @@ -3330,19 +3322,19 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_alignbit_b32 v4, v0, v4, 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX6-NEXT: v_not_b32_e32 v8, v8 ; GFX6-NEXT: v_alignbit_b32 v0, v0, v4, v8 ; GFX6-NEXT: v_alignbit_b32 v4, v1, v5, 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_xor_b32_e32 v5, -1, v9 +; GFX6-NEXT: v_not_b32_e32 v5, v9 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v4, v5 ; GFX6-NEXT: v_alignbit_b32 v4, v2, v6, 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX6-NEXT: v_xor_b32_e32 v5, -1, v10 +; GFX6-NEXT: v_not_b32_e32 v5, v10 ; GFX6-NEXT: v_alignbit_b32 v2, v2, v4, v5 ; GFX6-NEXT: v_alignbit_b32 v4, v3, v7, 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX6-NEXT: v_xor_b32_e32 v5, -1, v11 +; GFX6-NEXT: v_not_b32_e32 v5, v11 ; GFX6-NEXT: v_alignbit_b32 v3, v3, v4, v5 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -3351,19 +3343,19 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_alignbit_b32 v4, v0, v4, 1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX8-NEXT: v_not_b32_e32 v8, v8 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v4, v8 ; GFX8-NEXT: v_alignbit_b32 v4, v1, v5, 1 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v9 +; GFX8-NEXT: v_not_b32_e32 v5, v9 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, v5 ; GFX8-NEXT: v_alignbit_b32 v4, v2, v6, 1 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v10 +; GFX8-NEXT: v_not_b32_e32 v5, v10 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v4, v5 ; GFX8-NEXT: v_alignbit_b32 v4, v3, v7, 1 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v11 +; GFX8-NEXT: v_not_b32_e32 v5, v11 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v4, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -3372,19 +3364,19 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_alignbit_b32 v4, v0, v4, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX9-NEXT: v_not_b32_e32 v8, v8 ; GFX9-NEXT: v_alignbit_b32 v0, v0, v4, v8 ; GFX9-NEXT: v_alignbit_b32 v4, v1, v5, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_xor_b32_e32 v5, -1, v9 +; GFX9-NEXT: v_not_b32_e32 v5, v9 ; GFX9-NEXT: v_alignbit_b32 v1, v1, v4, v5 ; GFX9-NEXT: v_alignbit_b32 v4, v2, v6, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_xor_b32_e32 v5, -1, v10 +; GFX9-NEXT: v_not_b32_e32 v5, v10 ; GFX9-NEXT: v_alignbit_b32 v2, v2, v4, v5 ; GFX9-NEXT: v_alignbit_b32 v4, v3, v7, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_xor_b32_e32 v5, -1, v11 +; GFX9-NEXT: v_not_b32_e32 v5, v11 ; GFX9-NEXT: v_alignbit_b32 v3, v3, v4, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3394,16 +3386,16 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_alignbit_b32 v4, v0, v4, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX10-NEXT: v_not_b32_e32 v8, v8 ; GFX10-NEXT: v_alignbit_b32 v5, v1, v5, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v9 +; GFX10-NEXT: v_not_b32_e32 v9, v9 ; GFX10-NEXT: v_alignbit_b32 v6, v2, v6, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v10 +; GFX10-NEXT: v_not_b32_e32 v10, v10 ; GFX10-NEXT: v_alignbit_b32 v7, v3, v7, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v11 +; GFX10-NEXT: v_not_b32_e32 v11, v11 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v4, v8 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v5, v9 ; GFX10-NEXT: v_alignbit_b32 v2, v2, v6, v10 @@ -3416,16 +3408,16 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_alignbit_b32 v4, v0, v4, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX11-NEXT: v_not_b32_e32 v8, v8 ; GFX11-NEXT: v_alignbit_b32 v5, v1, v5, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX11-NEXT: v_xor_b32_e32 v9, -1, v9 +; GFX11-NEXT: v_not_b32_e32 v9, v9 ; GFX11-NEXT: v_alignbit_b32 v6, v2, v6, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX11-NEXT: v_xor_b32_e32 v10, -1, v10 +; GFX11-NEXT: v_not_b32_e32 v10, v10 ; GFX11-NEXT: v_alignbit_b32 v7, v3, v7, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX11-NEXT: v_xor_b32_e32 v11, -1, v11 +; GFX11-NEXT: v_not_b32_e32 v11, v11 ; GFX11-NEXT: v_alignbit_b32 v0, v0, v4, v8 ; GFX11-NEXT: v_alignbit_b32 v1, v1, v5, v9 ; GFX11-NEXT: v_alignbit_b32 v2, v2, v6, v10 @@ -3441,9 +3433,9 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s3, s2, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s2 -; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX6-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX6-NEXT: s_bfe_u32 s1, s1, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: s_lshl_b32 s0, s0, s3 ; GFX6-NEXT: s_lshr_b32 s1, s1, s2 ; GFX6-NEXT: s_or_b32 s0, s0, s1 @@ -3452,13 +3444,12 @@ ; GFX8-LABEL: s_fshl_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s3, s2, 15 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_andn2_b32 s2, 15, s2 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, s3 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, 1, 0x100000 -; GFX8-NEXT: s_lshr_b32 s1, s1, s3 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -3466,13 +3457,12 @@ ; GFX9-LABEL: s_fshl_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s3, s2, 15 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX9-NEXT: s_andn2_b32 s2, 15, s2 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, s3 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_bfe_u32 s3, 1, 0x100000 -; GFX9-NEXT: s_lshr_b32 s1, s1, s3 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog @@ -3481,11 +3471,10 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s3, s2, 15 ; GFX10-NEXT: s_andn2_b32 s2, 15, s2 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s4, 1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX10-NEXT: s_lshr_b32 s1, s1, s4 -; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX10-NEXT: s_lshl_b32 s0, s0, s3 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -3495,11 +3484,10 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s3, s2, 15 ; GFX11-NEXT: s_and_not1_b32 s2, 15, s2 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX11-NEXT: s_bfe_u32 s4, 1, 0x100000 -; GFX11-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX11-NEXT: s_lshr_b32 s1, s1, s4 -; GFX11-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_lshl_b32 s0, s0, s3 ; GFX11-NEXT: s_lshr_b32 s1, s1, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -3519,41 +3507,33 @@ ; ; GFX8-LABEL: s_fshl_i16_4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s2, 4, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, 12, 0x100000 -; GFX8-NEXT: s_lshr_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-NEXT: s_lshr_b32 s1, s1, 12 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_i16_4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_bfe_u32 s2, 4, 0x100000 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_bfe_u32 s2, 12, 0x100000 -; GFX9-NEXT: s_lshr_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, 4 +; GFX9-NEXT: s_lshr_b32 s1, s1, 12 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshl_i16_4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_u32 s2, 4, 0x100000 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s3, 12, 0x100000 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_lshr_b32 s1, s1, s3 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_lshl_b32 s0, s0, 4 +; GFX10-NEXT: s_lshr_b32 s1, s1, 12 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i16_4: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_bfe_u32 s2, 4, 0x100000 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX11-NEXT: s_bfe_u32 s3, 12, 0x100000 -; GFX11-NEXT: s_lshl_b32 s0, s0, s2 -; GFX11-NEXT: s_lshr_b32 s1, s1, s3 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_lshl_b32 s0, s0, 4 +; GFX11-NEXT: s_lshr_b32 s1, s1, 12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -3571,41 +3551,33 @@ ; ; GFX8-LABEL: s_fshl_i16_5: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s2, 5, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, 11, 0x100000 -; GFX8-NEXT: s_lshr_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 5 +; GFX8-NEXT: s_lshr_b32 s1, s1, 11 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_i16_5: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_bfe_u32 s2, 5, 0x100000 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_bfe_u32 s2, 11, 0x100000 -; GFX9-NEXT: s_lshr_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, 5 +; GFX9-NEXT: s_lshr_b32 s1, s1, 11 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshl_i16_5: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_u32 s2, 5, 0x100000 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s3, 11, 0x100000 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_lshr_b32 s1, s1, s3 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_lshl_b32 s0, s0, 5 +; GFX10-NEXT: s_lshr_b32 s1, s1, 11 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i16_5: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_bfe_u32 s2, 5, 0x100000 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX11-NEXT: s_bfe_u32 s3, 11, 0x100000 -; GFX11-NEXT: s_lshl_b32 s0, s0, s2 -; GFX11-NEXT: s_lshr_b32 s1, s1, s3 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-NEXT: s_lshr_b32 s1, s1, 11 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -3620,9 +3592,9 @@ ; GFX6-NEXT: v_and_b32_e32 v3, 15, v2 ; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX6-NEXT: v_bfe_u32 v3, v3, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 15 -; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3783,10 +3755,10 @@ ; GFX6-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshl_b32_e32 v1, s0, v1 ; GFX6-NEXT: s_bfe_u32 s0, s1, 0xf0001 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: ; return to shader part epilog @@ -3796,10 +3768,9 @@ ; GFX8-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: v_lshlrev_b16_e64 v1, v1, s0 -; GFX8-NEXT: s_bfe_u32 s0, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, 1, 0x100000 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s1 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX8-NEXT: s_lshr_b32 s0, s0, s1 +; GFX8-NEXT: s_lshr_b32 s0, s0, 1 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: ; return to shader part epilog @@ -3809,10 +3780,9 @@ ; GFX9-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX9-NEXT: v_lshlrev_b16_e64 v1, v1, s0 -; GFX9-NEXT: s_bfe_u32 s0, s1, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, 1, 0x100000 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s1 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-NEXT: s_lshr_b32 s0, s0, s1 +; GFX9-NEXT: s_lshr_b32 s0, s0, 1 ; GFX9-NEXT: v_lshrrev_b16_e64 v0, v0, s0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -3821,9 +3791,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s2, 1, 0x100000 -; GFX10-NEXT: s_lshr_b32 s1, s1, s2 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_lshr_b32 s1, s1, 1 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v0, s0 ; GFX10-NEXT: v_lshrrev_b16 v1, v1, s1 @@ -3834,11 +3803,11 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: v_xor_b32_e32 v1, -1, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX11-NEXT: s_bfe_u32 s2, 1, 0x100000 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_lshr_b32 s1, s1, s2 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_lshr_b32 s1, s1, 1 ; GFX11-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b16 v0, v0, s0 ; GFX11-NEXT: v_lshrrev_b16 v1, v1, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3854,9 +3823,9 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s2, s1, 15 ; GFX6-NEXT: s_andn2_b32 s1, 15, s1 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: v_bfe_u32 v0, v0, 1, 15 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: s_lshl_b32 s0, s0, s2 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s1, v0 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 @@ -3866,7 +3835,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s2, s1, 15 ; GFX8-NEXT: s_andn2_b32 s1, 15, s1 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 1, v0 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, s1, v0 @@ -3877,7 +3846,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s2, s1, 15 ; GFX9-NEXT: s_andn2_b32 s1, 15, s1 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 1, v0 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, s1, v0 @@ -3889,7 +3858,7 @@ ; GFX10-NEXT: v_lshrrev_b16 v0, 1, v0 ; GFX10-NEXT: s_andn2_b32 s2, 15, s1 ; GFX10-NEXT: s_and_b32 s1, s1, 15 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: v_lshrrev_b16 v0, s2, v0 ; GFX10-NEXT: s_lshl_b32 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 @@ -3901,7 +3870,7 @@ ; GFX11-NEXT: s_and_not1_b32 s2, 15, s1 ; GFX11-NEXT: s_and_b32 s1, s1, 15 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: v_lshrrev_b16 v0, s2, v0 ; GFX11-NEXT: s_lshl_b32 s0, s0, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -3917,9 +3886,9 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s2, s1, 15 ; GFX6-NEXT: s_andn2_b32 s1, 15, s1 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s2, v0 ; GFX6-NEXT: s_lshr_b32 s0, s0, s1 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 @@ -3929,11 +3898,10 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s2, s1, 15 ; GFX8-NEXT: s_andn2_b32 s1, 15, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshr_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, s2, v0 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, 1, 0x100000 -; GFX8-NEXT: s_lshr_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: ; return to shader part epilog @@ -3942,11 +3910,10 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s2, s1, 15 ; GFX9-NEXT: s_andn2_b32 s1, 15, s1 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX9-NEXT: s_lshr_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, s2, v0 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: s_bfe_u32 s2, 1, 0x100000 -; GFX9-NEXT: s_lshr_b32 s0, s0, s2 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX9-NEXT: s_lshr_b32 s0, s0, s1 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -3955,11 +3922,10 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s2, s1, 15 ; GFX10-NEXT: s_andn2_b32 s1, 15, s1 -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX10-NEXT: s_bfe_u32 s3, 1, 0x100000 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10-NEXT: v_lshlrev_b16 v0, s2, v0 -; GFX10-NEXT: s_lshr_b32 s0, s0, s3 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_lshr_b32 s0, s0, 1 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshr_b32 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -3968,11 +3934,10 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s2, s1, 15 ; GFX11-NEXT: s_and_not1_b32 s1, 15, s1 -; GFX11-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX11-NEXT: s_bfe_u32 s3, 1, 0x100000 +; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX11-NEXT: v_lshlrev_b16 v0, s2, v0 -; GFX11-NEXT: s_lshr_b32 s0, s0, s3 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_lshr_b32 s0, s0, 1 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b32 s0, s0, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -3988,51 +3953,50 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s6, s4, 15 ; GFX6-NEXT: s_andn2_b32 s4, 15, s4 -; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_lshl_b32 s0, s0, s6 ; GFX6-NEXT: s_lshr_b32 s2, s2, s4 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s5, 15 ; GFX6-NEXT: s_andn2_b32 s4, 15, s5 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, s2 ; GFX6-NEXT: s_bfe_u32 s2, s3, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s3, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s3, 0xffff, s4 ; GFX6-NEXT: s_lshr_b32 s2, s2, s3 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshl_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s6, s2, 15 -; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_lshr_b32 s5, s2, 16 +; GFX8-NEXT: s_and_b32 s6, s2, 15 ; GFX8-NEXT: s_andn2_b32 s2, 15, s2 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s6 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s6, 1, 0x100000 -; GFX8-NEXT: s_lshr_b32 s1, s1, s6 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s5, 15 ; GFX8-NEXT: s_andn2_b32 s2, 15, s5 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_lshr_b32 s3, s4, s6 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_lshr_b32 s3, s4, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshr_b32 s2, s3, s2 ; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -4118,19 +4082,19 @@ ; GFX6-NEXT: v_and_b32_e32 v6, 15, v4 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 15 -; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v5 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 15 -; GFX6-NEXT: v_bfe_u32 v3, v4, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4154,8 +4118,8 @@ ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v3, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4206,17 +4170,13 @@ ; GFX6-LABEL: v_fshl_v2i16_4_8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s4, 4, 0x100000 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 15 -; GFX6-NEXT: s_bfe_u32 s4, 11, 0x100000 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2 -; GFX6-NEXT: s_bfe_u32 s4, 8, 0x100000 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 11, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 15 -; GFX6-NEXT: s_bfe_u32 s4, 7, 0x100000 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 7, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -4231,8 +4191,8 @@ ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4274,23 +4234,23 @@ ; GFX6-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 ; GFX6-NEXT: s_bfe_u32 s0, s2, 0xf0001 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v1 ; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: s_bfe_u32 s0, s3, 0xf0001 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshl_b32_e32 v2, s1, v2 ; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -4299,25 +4259,24 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s0 -; GFX8-NEXT: s_bfe_u32 s0, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, 1, 0x100000 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s1 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX8-NEXT: s_lshr_b32 s0, s0, s1 +; GFX8-NEXT: s_lshr_b32 s0, s0, 1 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v1 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX8-NEXT: s_lshr_b32 s0, s3, s1 +; GFX8-NEXT: s_lshr_b32 s0, s3, 1 ; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s0 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog ; @@ -4377,22 +4336,22 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s4, s2, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s2 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: v_bfe_u32 v0, v0, 1, 15 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: s_lshl_b32 s0, s0, s4 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s3, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s3 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: s_lshl_b32 s0, s1, s0 ; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 15 -; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s2 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -4402,7 +4361,7 @@ ; GFX8-NEXT: s_and_b32 s4, s1, 15 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_andn2_b32 s1, 15, s1 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 @@ -4411,13 +4370,13 @@ ; GFX8-NEXT: s_and_b32 s0, s3, 15 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-NEXT: s_andn2_b32 s1, 15, s3 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: s_lshl_b32 s0, s2, s0 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, s1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog ; @@ -4474,49 +4433,48 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s4, s2, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s2 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX6-NEXT: s_lshr_b32 s0, s0, s2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s3, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s3 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1 ; GFX6-NEXT: s_bfe_u32 s0, s1, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s2 ; GFX6-NEXT: s_lshr_b32 s0, s0, s1 ; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshl_v2i16_vss: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s4, s1, 15 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_and_b32 s4, s1, 15 ; GFX8-NEXT: s_andn2_b32 s1, 15, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshr_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, s4, v0 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, 1, 0x100000 -; GFX8-NEXT: s_lshr_b32 s0, s0, s4 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 ; GFX8-NEXT: v_or_b32_e32 v1, s0, v1 ; GFX8-NEXT: s_and_b32 s0, s3, 15 ; GFX8-NEXT: s_andn2_b32 s1, 15, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: s_lshr_b32 s0, s2, s4 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_lshr_b32 s0, s2, 1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog ; @@ -4588,72 +4546,71 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s9, s6, 15 ; GFX6-NEXT: s_andn2_b32 s6, 15, s6 -; GFX6-NEXT: s_bfe_u32 s9, s9, 0x100000 +; GFX6-NEXT: s_and_b32 s9, 0xffff, s9 ; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX6-NEXT: s_lshl_b32 s0, s0, s9 ; GFX6-NEXT: s_lshr_b32 s3, s3, s6 ; GFX6-NEXT: s_or_b32 s0, s0, s3 ; GFX6-NEXT: s_and_b32 s3, s7, 15 ; GFX6-NEXT: s_andn2_b32 s6, 15, s7 -; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX6-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX6-NEXT: s_lshl_b32 s1, s1, s3 ; GFX6-NEXT: s_bfe_u32 s3, s4, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s4, s6, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s6 ; GFX6-NEXT: s_lshr_b32 s3, s3, s4 ; GFX6-NEXT: s_or_b32 s1, s1, s3 ; GFX6-NEXT: s_and_b32 s3, s8, 15 ; GFX6-NEXT: s_andn2_b32 s4, 15, s8 -; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX6-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX6-NEXT: s_lshl_b32 s2, s2, s3 ; GFX6-NEXT: s_bfe_u32 s3, s5, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_lshr_b32 s3, s3, s4 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshl_v3i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s9, s4, 15 -; GFX8-NEXT: s_bfe_u32 s9, s9, 0x100000 -; GFX8-NEXT: s_lshr_b32 s6, s0, 16 ; GFX8-NEXT: s_lshr_b32 s7, s2, 16 ; GFX8-NEXT: s_lshr_b32 s8, s4, 16 +; GFX8-NEXT: s_and_b32 s9, s4, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s4 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX8-NEXT: s_lshr_b32 s2, s2, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_lshr_b32 s6, s0, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s9 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s9, 1, 0x100000 -; GFX8-NEXT: s_lshr_b32 s2, s2, s9 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX8-NEXT: s_lshr_b32 s2, s2, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s2, s8, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s8 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s2, s6, s2 -; GFX8-NEXT: s_lshr_b32 s6, s7, s9 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_lshr_b32 s6, s7, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshr_b32 s4, s6, s4 ; GFX8-NEXT: s_or_b32 s2, s2, s4 ; GFX8-NEXT: s_and_b32 s4, s5, 15 ; GFX8-NEXT: s_andn2_b32 s5, 15, s5 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 -; GFX8-NEXT: s_lshr_b32 s3, s3, s9 -; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000 +; GFX8-NEXT: s_lshr_b32 s3, s3, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s5 ; GFX8-NEXT: s_lshr_b32 s3, s3, s4 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_or_b32 s1, s1, s3 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_v3i16: @@ -4808,28 +4765,28 @@ ; GFX6-NEXT: v_and_b32_e32 v9, 15, v6 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 ; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_bfe_u32 v9, v9, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX6-NEXT: v_bfe_u32 v3, v3, 1, 15 -; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v9, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 15, v7 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v7 ; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_bfe_u32 v3, v3, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v3, v1 ; GFX6-NEXT: v_bfe_u32 v3, v4, 1, 15 -; GFX6-NEXT: v_bfe_u32 v4, v6, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 15, v8 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v8 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_bfe_u32 v3, v3, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_bfe_u32 v3, v5, 1, 15 -; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4859,11 +4816,11 @@ ; GFX8-NEXT: v_lshlrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v3 ; GFX8-NEXT: v_lshrrev_b16_e32 v2, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshl_v3i16: @@ -4936,95 +4893,94 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s12, s8, 15 ; GFX6-NEXT: s_andn2_b32 s8, 15, s8 -; GFX6-NEXT: s_bfe_u32 s12, s12, 0x100000 +; GFX6-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX6-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX6-NEXT: s_lshl_b32 s0, s0, s12 ; GFX6-NEXT: s_lshr_b32 s4, s4, s8 ; GFX6-NEXT: s_or_b32 s0, s0, s4 ; GFX6-NEXT: s_and_b32 s4, s9, 15 ; GFX6-NEXT: s_andn2_b32 s8, 15, s9 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, s4 ; GFX6-NEXT: s_bfe_u32 s4, s5, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s5, s8, 0x100000 +; GFX6-NEXT: s_and_b32 s5, 0xffff, s8 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 ; GFX6-NEXT: s_or_b32 s1, s1, s4 ; GFX6-NEXT: s_and_b32 s4, s10, 15 ; GFX6-NEXT: s_andn2_b32 s5, 15, s10 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_lshl_b32 s2, s2, s4 ; GFX6-NEXT: s_bfe_u32 s4, s6, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX6-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 ; GFX6-NEXT: s_or_b32 s2, s2, s4 ; GFX6-NEXT: s_and_b32 s4, s11, 15 ; GFX6-NEXT: s_andn2_b32 s5, 15, s11 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_lshl_b32 s3, s3, s4 ; GFX6-NEXT: s_bfe_u32 s4, s7, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX6-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: s_or_b32 s3, s3, s4 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 -; GFX6-NEXT: s_bfe_u32 s2, s3, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s2 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s3 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshl_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s12, s4, 15 -; GFX8-NEXT: s_bfe_u32 s12, s12, 0x100000 -; GFX8-NEXT: s_lshr_b32 s6, s0, 16 ; GFX8-NEXT: s_lshr_b32 s8, s2, 16 ; GFX8-NEXT: s_lshr_b32 s10, s4, 16 +; GFX8-NEXT: s_and_b32 s12, s4, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s4 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s12, 0xffff, s12 +; GFX8-NEXT: s_lshr_b32 s2, s2, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_lshr_b32 s6, s0, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s12 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s12, 1, 0x100000 -; GFX8-NEXT: s_lshr_b32 s2, s2, s12 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX8-NEXT: s_lshr_b32 s2, s2, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s2, s10, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s10 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s2, s6, s2 -; GFX8-NEXT: s_lshr_b32 s6, s8, s12 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_lshr_b32 s6, s8, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshr_b32 s4, s6, s4 ; GFX8-NEXT: s_or_b32 s2, s2, s4 ; GFX8-NEXT: s_and_b32 s4, s5, 15 ; GFX8-NEXT: s_lshr_b32 s9, s3, 16 ; GFX8-NEXT: s_lshr_b32 s11, s5, 16 ; GFX8-NEXT: s_andn2_b32 s5, 15, s5 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 -; GFX8-NEXT: s_lshr_b32 s3, s3, s12 -; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000 +; GFX8-NEXT: s_lshr_b32 s3, s3, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s5 ; GFX8-NEXT: s_lshr_b32 s3, s3, s4 ; GFX8-NEXT: s_or_b32 s1, s1, s3 ; GFX8-NEXT: s_and_b32 s3, s11, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s11 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_lshr_b32 s5, s9, s12 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_lshr_b32 s5, s9, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshl_b32 s3, s7, s3 ; GFX8-NEXT: s_lshr_b32 s4, s5, s4 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_or_b32 s3, s3, s4 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s2, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog @@ -5166,37 +5122,37 @@ ; GFX6-NEXT: v_and_b32_e32 v12, 15, v8 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 -; GFX6-NEXT: v_bfe_u32 v12, v12, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX6-NEXT: v_bfe_u32 v4, v4, 1, 15 -; GFX6-NEXT: v_bfe_u32 v8, v8, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v12, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v9 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v9 ; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 -; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 ; GFX6-NEXT: v_bfe_u32 v4, v5, 1, 15 -; GFX6-NEXT: v_bfe_u32 v5, v8, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v10 ; GFX6-NEXT: v_xor_b32_e32 v5, -1, v10 ; GFX6-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_bfe_u32 v4, v6, 1, 15 -; GFX6-NEXT: v_bfe_u32 v5, v5, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v11 ; GFX6-NEXT: v_xor_b32_e32 v5, -1, v11 ; GFX6-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_bfe_u32 v4, v7, 1, 15 -; GFX6-NEXT: v_bfe_u32 v5, v5, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -5236,9 +5192,10 @@ ; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b16_e32 v3, v6, v3 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -353,7 +353,7 @@ ; GFX8-NEXT: s_and_b32 s3, s2, 7 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_lshr_b32 s1, s1, s3 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -365,7 +365,7 @@ ; GFX9-NEXT: s_and_b32 s3, s2, 7 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_lshr_b32 s1, s1, s3 ; GFX9-NEXT: s_or_b32 s0, s0, s1 @@ -377,7 +377,7 @@ ; GFX10-NEXT: s_and_b32 s3, s2, 7 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_lshr_b32 s1, s1, s3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -389,7 +389,7 @@ ; GFX11-NEXT: s_and_b32 s3, s2, 7 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 ; GFX11-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_lshl_b32 s0, s0, s2 ; GFX11-NEXT: s_lshr_b32 s1, s1, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -404,7 +404,7 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 @@ -417,7 +417,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 @@ -429,7 +429,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0 @@ -441,7 +441,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX10-NEXT: v_not_b32_e32 v3, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 @@ -455,7 +455,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX11-NEXT: v_not_b32_e32 v3, v2 ; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 @@ -481,7 +481,7 @@ ; GFX8-LABEL: s_fshr_i8_4: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, 4 ; GFX8-NEXT: s_lshr_b32 s1, s1, 4 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -490,7 +490,7 @@ ; GFX9-LABEL: s_fshr_i8_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshl_b32 s0, s0, 4 ; GFX9-NEXT: s_lshr_b32 s1, s1, 4 ; GFX9-NEXT: s_or_b32 s0, s0, s1 @@ -500,7 +500,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshr_b32 s1, s1, 4 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog @@ -509,7 +509,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_lshl_b32 s0, s0, 4 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b32 s1, s1, 4 ; GFX11-NEXT: s_or_b32 s0, s0, s1 @@ -580,7 +580,7 @@ ; GFX8-LABEL: s_fshr_i8_5: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, 3 ; GFX8-NEXT: s_lshr_b32 s1, s1, 5 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -589,7 +589,7 @@ ; GFX9-LABEL: s_fshr_i8_5: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshl_b32 s0, s0, 3 ; GFX9-NEXT: s_lshr_b32 s1, s1, 5 ; GFX9-NEXT: s_or_b32 s0, s0, s1 @@ -599,7 +599,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, 3 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshr_b32 s1, s1, 5 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog @@ -608,7 +608,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_lshl_b32 s0, s0, 3 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b32 s1, s1, 5 ; GFX11-NEXT: s_or_b32 s0, s0, s1 @@ -703,7 +703,7 @@ ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_andn2_b32 s2, 7, s5 ; GFX8-NEXT: s_lshl_b32 s3, s3, 1 ; GFX8-NEXT: s_lshr_b32 s1, s1, s6 @@ -711,13 +711,12 @@ ; GFX8-NEXT: s_and_b32 s3, s4, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s5, 7 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshr_b32 s1, s3, s1 ; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; @@ -731,7 +730,7 @@ ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_andn2_b32 s2, 7, s5 ; GFX9-NEXT: s_lshl_b32 s3, s3, 1 ; GFX9-NEXT: s_lshr_b32 s1, s1, s6 @@ -739,13 +738,12 @@ ; GFX9-NEXT: s_and_b32 s3, s4, 0xff ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s5, 7 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_lshr_b32 s1, s3, s1 ; GFX9-NEXT: s_or_b32 s1, s2, s1 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff -; GFX9-NEXT: s_lshl_b32 s1, s1, s2 +; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; @@ -763,17 +761,16 @@ ; GFX10-NEXT: s_and_b32 s2, s5, 7 ; GFX10-NEXT: s_andn2_b32 s5, 7, s5 ; GFX10-NEXT: s_lshl_b32 s3, s3, 1 -; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshl_b32 s3, s3, s5 ; GFX10-NEXT: s_lshr_b32 s2, s4, s2 ; GFX10-NEXT: s_lshr_b32 s1, s1, s6 ; GFX10-NEXT: s_or_b32 s2, s3, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_and_b32 s1, s2, 0xff -; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 8 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; @@ -791,17 +788,16 @@ ; GFX11-NEXT: s_and_b32 s2, s5, 7 ; GFX11-NEXT: s_and_not1_b32 s5, 7, s5 ; GFX11-NEXT: s_lshl_b32 s3, s3, 1 -; GFX11-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_lshl_b32 s3, s3, s5 ; GFX11-NEXT: s_lshr_b32 s2, s4, s2 ; GFX11-NEXT: s_lshr_b32 s1, s1, s6 ; GFX11-NEXT: s_or_b32 s2, s3, s2 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_and_b32 s1, s2, 0xff -; GFX11-NEXT: s_bfe_u32 s2, 8, 0x100000 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s1, s2 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -819,7 +815,7 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 8, v2 ; GFX6-NEXT: v_and_b32_e32 v5, 7, v2 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -828,7 +824,7 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v4 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX6-NEXT: v_not_b32_e32 v4, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_bfe_u32 v1, v1, 8, 8 @@ -846,14 +842,14 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX8-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v5 +; GFX8-NEXT: v_not_b32_e32 v2, v5 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 7, v5 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 @@ -871,14 +867,14 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0 ; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v5 +; GFX9-NEXT: v_not_b32_e32 v2, v5 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 7, v5 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 @@ -899,8 +895,8 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX10-NEXT: v_and_b32_e32 v7, 7, v2 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 +; GFX10-NEXT: v_not_b32_e32 v2, v2 +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 ; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v5 @@ -927,8 +923,8 @@ ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX11-NEXT: v_and_b32_e32 v7, 7, v2 -; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX11-NEXT: v_xor_b32_e32 v6, -1, v3 +; GFX11-NEXT: v_not_b32_e32 v2, v2 +; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX11-NEXT: v_lshlrev_b16 v4, 1, v4 ; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 @@ -1025,7 +1021,7 @@ ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_andn2_b32 s2, 7, s9 ; GFX8-NEXT: s_lshl_b32 s3, s3, 1 ; GFX8-NEXT: s_lshr_b32 s1, s1, s12 @@ -1033,7 +1029,7 @@ ; GFX8-NEXT: s_and_b32 s3, s6, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s9, 7 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshr_b32 s1, s3, s1 ; GFX8-NEXT: s_andn2_b32 s3, 7, s10 ; GFX8-NEXT: s_lshl_b32 s4, s4, 1 @@ -1041,7 +1037,7 @@ ; GFX8-NEXT: s_and_b32 s4, s7, 0xff ; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: s_and_b32 s2, s10, 7 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshr_b32 s2, s4, s2 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_or_b32 s2, s3, s2 @@ -1078,7 +1074,7 @@ ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_andn2_b32 s2, 7, s9 ; GFX9-NEXT: s_lshl_b32 s3, s3, 1 ; GFX9-NEXT: s_lshr_b32 s1, s1, s12 @@ -1086,7 +1082,7 @@ ; GFX9-NEXT: s_and_b32 s3, s6, 0xff ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s9, 7 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_lshr_b32 s1, s3, s1 ; GFX9-NEXT: s_andn2_b32 s3, 7, s10 ; GFX9-NEXT: s_lshl_b32 s4, s4, 1 @@ -1094,7 +1090,7 @@ ; GFX9-NEXT: s_and_b32 s4, s7, 0xff ; GFX9-NEXT: s_or_b32 s1, s2, s1 ; GFX9-NEXT: s_and_b32 s2, s10, 7 -; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX9-NEXT: s_lshr_b32 s2, s4, s2 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_or_b32 s2, s3, s2 @@ -1131,12 +1127,12 @@ ; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 ; GFX10-NEXT: s_and_b32 s6, s6, 0xff -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_and_b32 s2, s9, 7 ; GFX10-NEXT: s_andn2_b32 s9, 7, s9 ; GFX10-NEXT: s_lshl_b32 s3, s3, 1 -; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX10-NEXT: s_lshr_b32 s1, s1, s12 ; GFX10-NEXT: s_lshl_b32 s3, s3, s9 ; GFX10-NEXT: s_lshr_b32 s2, s6, s2 @@ -1146,7 +1142,7 @@ ; GFX10-NEXT: s_and_b32 s2, s10, 7 ; GFX10-NEXT: s_andn2_b32 s3, 7, s10 ; GFX10-NEXT: s_lshl_b32 s4, s4, 1 -; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX10-NEXT: s_lshl_b32 s3, s4, s3 ; GFX10-NEXT: s_lshr_b32 s2, s6, s2 ; GFX10-NEXT: s_andn2_b32 s4, 7, s11 @@ -1184,12 +1180,12 @@ ; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-NEXT: s_and_b32 s6, s6, 0xff -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_lshl_b32 s0, s0, s2 ; GFX11-NEXT: s_and_b32 s2, s9, 7 ; GFX11-NEXT: s_and_not1_b32 s9, 7, s9 ; GFX11-NEXT: s_lshl_b32 s3, s3, 1 -; GFX11-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX11-NEXT: s_lshr_b32 s1, s1, s12 ; GFX11-NEXT: s_lshl_b32 s3, s3, s9 ; GFX11-NEXT: s_lshr_b32 s2, s6, s2 @@ -1199,7 +1195,7 @@ ; GFX11-NEXT: s_and_b32 s2, s10, 7 ; GFX11-NEXT: s_and_not1_b32 s3, 7, s10 ; GFX11-NEXT: s_lshl_b32 s4, s4, 1 -; GFX11-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX11-NEXT: s_lshl_b32 s3, s4, s3 ; GFX11-NEXT: s_lshr_b32 s2, s6, s2 ; GFX11-NEXT: s_and_not1_b32 s4, 7, s11 @@ -1237,7 +1233,7 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX6-NEXT: v_and_b32_e32 v10, 7, v2 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 24, v0 @@ -1248,13 +1244,13 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v10, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v7 -; GFX6-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX6-NEXT: v_not_b32_e32 v7, v7 ; GFX6-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_bfe_u32 v7, v1, 8, 8 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v7 -; GFX6-NEXT: v_xor_b32_e32 v7, -1, v8 +; GFX6-NEXT: v_not_b32_e32 v7, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 24, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 7, v8 @@ -1264,7 +1260,7 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v3, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v9 +; GFX6-NEXT: v_not_b32_e32 v4, v9 ; GFX6-NEXT: v_and_b32_e32 v3, 7, v9 ; GFX6-NEXT: v_and_b32_e32 v4, 7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5 @@ -1290,7 +1286,7 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX8-NEXT: v_and_b32_e32 v8, 7, v2 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v9, 1, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v9 @@ -1298,7 +1294,7 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v8 ; GFX8-NEXT: v_and_b32_e32 v8, 7, v5 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v3 @@ -1306,17 +1302,17 @@ ; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 7, v6 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v6 +; GFX8-NEXT: v_not_b32_e32 v5, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, 1 -; GFX8-NEXT: v_mov_b32_e32 v9, 0xff ; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX8-NEXT: v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshlrev_b16_e32 v5, v5, v8 -; GFX8-NEXT: v_and_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v8, 0xff +; GFX8-NEXT: v_and_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v8 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX8-NEXT: v_and_b32_e32 v5, 7, v7 -; GFX8-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX8-NEXT: v_not_b32_e32 v7, v7 ; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v7, v0 @@ -1340,7 +1336,7 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX9-NEXT: v_and_b32_e32 v8, 7, v2 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v9, 1, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, v2, v9 @@ -1348,7 +1344,7 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v8 ; GFX9-NEXT: v_and_b32_e32 v8, 7, v5 -; GFX9-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX9-NEXT: v_not_b32_e32 v5, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 1, v3 @@ -1356,24 +1352,23 @@ ; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_and_b32_e32 v4, 7, v6 -; GFX9-NEXT: v_xor_b32_e32 v5, -1, v6 +; GFX9-NEXT: v_not_b32_e32 v5, v6 ; GFX9-NEXT: v_mov_b32_e32 v6, 1 -; GFX9-NEXT: v_mov_b32_e32 v9, 0xff +; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshlrev_b16_e32 v5, v5, v8 -; GFX9-NEXT: v_and_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v8, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_lshrrev_b16_e32 v4, v4, v8 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX9-NEXT: v_and_b32_e32 v5, 7, v7 -; GFX9-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX9-NEXT: v_not_b32_e32 v7, v7 ; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v7, v0 ; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 -; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_or_b32 v1, v2, s4, v1 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -1387,48 +1382,48 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v5 +; GFX10-NEXT: v_not_b32_e32 v8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX10-NEXT: v_not_b32_e32 v12, v7 ; GFX10-NEXT: v_lshlrev_b16 v3, 1, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX10-NEXT: v_and_b32_e32 v12, 7, v12 ; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 -; GFX10-NEXT: v_mov_b32_e32 v13, 0xff -; GFX10-NEXT: v_xor_b32_e32 v14, -1, v12 -; GFX10-NEXT: v_lshlrev_b16 v3, v10, v3 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v11 +; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX10-NEXT: v_not_b32_e32 v13, v10 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_lshlrev_b16 v3, v12, v3 +; GFX10-NEXT: v_not_b32_e32 v12, v11 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 ; GFX10-NEXT: v_and_b32_e32 v8, 0xff, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX10-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX10-NEXT: v_and_b32_e32 v13, 7, v13 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v13, 7, v14 -; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_e32 v12, 7, v12 +; GFX10-NEXT: v_lshlrev_b16 v5, 1, v5 +; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_lshrrev_b16 v5, v5, v7 -; GFX10-NEXT: v_lshlrev_b16 v4, v10, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v11, v1 -; GFX10-NEXT: v_lshlrev_b16 v6, v13, v6 -; GFX10-NEXT: v_lshrrev_b16 v7, v12, v9 +; GFX10-NEXT: v_lshrrev_b16 v6, v7, v6 +; GFX10-NEXT: v_lshlrev_b16 v4, v13, v4 +; GFX10-NEXT: v_lshrrev_b16 v1, v10, v1 +; GFX10-NEXT: v_lshlrev_b16 v5, v12, v5 +; GFX10-NEXT: v_lshrrev_b16 v7, v11, v9 ; GFX10-NEXT: v_lshrrev_b16 v2, v2, v8 -; GFX10-NEXT: v_or_b32_e32 v3, v3, v5 -; GFX10-NEXT: v_mov_b32_e32 v5, 8 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, 8 ; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX10-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX10-NEXT: v_or_b32_e32 v4, v5, v7 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 @@ -1447,18 +1442,18 @@ ; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v13, 24, v2 ; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-NEXT: v_xor_b32_e32 v12, -1, v7 +; GFX11-NEXT: v_not_b32_e32 v12, v7 ; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v12, 7, v12 ; GFX11-NEXT: v_lshlrev_b16 v3, 1, v3 -; GFX11-NEXT: v_xor_b32_e32 v14, -1, v11 +; GFX11-NEXT: v_not_b32_e32 v14, v11 ; GFX11-NEXT: v_lshrrev_b16 v6, v7, v6 -; GFX11-NEXT: v_xor_b32_e32 v7, -1, v13 +; GFX11-NEXT: v_not_b32_e32 v7, v13 ; GFX11-NEXT: v_lshrrev_b32_e32 v9, 24, v1 -; GFX11-NEXT: v_xor_b32_e32 v10, -1, v2 +; GFX11-NEXT: v_not_b32_e32 v10, v2 ; GFX11-NEXT: v_lshlrev_b16 v3, v12, v3 ; GFX11-NEXT: v_and_b32_e32 v11, 7, v11 ; GFX11-NEXT: v_and_b32_e32 v12, 7, v14 @@ -1858,9 +1853,9 @@ ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 ; GFX6-NEXT: s_and_b32 s8, s8, 0xff ; GFX6-NEXT: s_or_b32 s2, s11, s2 -; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX6-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX6-NEXT: s_lshr_b32 s10, s3, 8 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 ; GFX6-NEXT: s_and_b32 s3, s3, 0xff ; GFX6-NEXT: s_or_b32 s2, s2, s8 @@ -1868,8 +1863,8 @@ ; GFX6-NEXT: s_and_b32 s8, s10, 0xff ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: s_or_b32 s3, s9, s3 -; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX6-NEXT: s_and_b32 s8, 0xffff, s8 +; GFX6-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s8 ; GFX6-NEXT: s_lshr_b32 s8, s4, 16 @@ -1881,9 +1876,9 @@ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 ; GFX6-NEXT: s_or_b32 s4, s11, s4 -; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX6-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 ; GFX6-NEXT: s_or_b32 s4, s4, s8 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 @@ -1901,9 +1896,9 @@ ; GFX6-NEXT: s_and_b32 s8, s10, 0xff ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: s_or_b32 s5, s9, s5 -; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX6-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 -; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX6-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: s_or_b32 s5, s5, s8 @@ -1911,8 +1906,8 @@ ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: s_and_b32 s6, s6, 0xff -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v0 ; GFX6-NEXT: s_lshl_b32 s4, s6, 17 @@ -1929,8 +1924,8 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 24, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX6-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX6-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 23, v1 ; GFX6-NEXT: s_lshl_b32 s0, s7, 17 @@ -1964,11 +1959,10 @@ ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_lshr_b32 s9, s1, 8 -; GFX8-NEXT: s_bfe_u32 s10, 8, 0x100000 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_lshr_b32 s6, s0, 8 ; GFX8-NEXT: s_lshr_b32 s8, s0, 24 -; GFX8-NEXT: s_lshl_b32 s1, s1, s10 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: s_and_b32 s6, s6, 0xff ; GFX8-NEXT: s_or_b32 s1, s8, s1 @@ -1976,74 +1970,74 @@ ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_lshr_b32 s7, s0, 16 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s6, s6, s10 +; GFX8-NEXT: s_lshl_b32 s6, s6, 8 ; GFX8-NEXT: s_and_b32 s8, s8, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s6 ; GFX8-NEXT: s_and_b32 s6, s7, 0xff ; GFX8-NEXT: s_and_b32 s7, s9, 0xff ; GFX8-NEXT: s_lshr_b32 s9, s2, 16 -; GFX8-NEXT: s_lshr_b32 s11, s2, 24 +; GFX8-NEXT: s_lshr_b32 s10, s2, 24 ; GFX8-NEXT: s_and_b32 s2, s2, 0xff -; GFX8-NEXT: s_lshl_b32 s8, s8, s10 +; GFX8-NEXT: s_lshl_b32 s8, s8, 8 ; GFX8-NEXT: s_or_b32 s2, s2, s8 ; GFX8-NEXT: s_and_b32 s8, s9, 0xff ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 -; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX8-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1 -; GFX8-NEXT: s_lshr_b32 s12, s3, 8 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_lshr_b32 s11, s3, 8 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 ; GFX8-NEXT: s_and_b32 s3, s3, 0xff ; GFX8-NEXT: s_or_b32 s2, s2, s8 -; GFX8-NEXT: s_lshl_b32 s3, s3, s10 -; GFX8-NEXT: s_and_b32 s8, s12, 0xff -; GFX8-NEXT: s_or_b32 s3, s11, s3 -; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_lshl_b32 s3, s3, 8 +; GFX8-NEXT: s_and_b32 s8, s11, 0xff +; GFX8-NEXT: s_or_b32 s3, s10, s3 +; GFX8-NEXT: s_and_b32 s8, 0xffff, s8 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 ; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: s_or_b32 s3, s3, s8 ; GFX8-NEXT: s_lshr_b32 s8, s4, 8 ; GFX8-NEXT: s_and_b32 s8, s8, 0xff ; GFX8-NEXT: s_lshr_b32 s9, s4, 16 -; GFX8-NEXT: s_lshr_b32 s11, s4, 24 +; GFX8-NEXT: s_lshr_b32 s10, s4, 24 ; GFX8-NEXT: s_and_b32 s4, s4, 0xff -; GFX8-NEXT: s_lshl_b32 s8, s8, s10 +; GFX8-NEXT: s_lshl_b32 s8, s8, 8 ; GFX8-NEXT: s_or_b32 s4, s4, s8 ; GFX8-NEXT: s_and_b32 s8, s9, 0xff ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 -; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX8-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 ; GFX8-NEXT: s_or_b32 s4, s4, s8 ; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX8-NEXT: s_lshr_b32 s12, s5, 8 +; GFX8-NEXT: s_lshr_b32 s11, s5, 8 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX8-NEXT: s_and_b32 s5, s5, 0xff ; GFX8-NEXT: v_mul_lo_u32 v1, v2, v1 -; GFX8-NEXT: s_lshl_b32 s5, s5, s10 +; GFX8-NEXT: s_lshl_b32 s5, s5, 8 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX8-NEXT: s_and_b32 s8, s12, 0xff +; GFX8-NEXT: s_and_b32 s8, s11, 0xff ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: s_or_b32 s5, s11, s5 -; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX8-NEXT: s_or_b32 s5, s10, s5 +; GFX8-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 -; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: s_or_b32 s5, s5, s8 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX8-NEXT: s_lshl_b32 s4, s6, 17 @@ -2060,8 +2054,8 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 23, v1 ; GFX8-NEXT: s_lshl_b32 s0, s7, 17 @@ -2093,45 +2087,44 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX9-NEXT: s_lshr_b32 s11, s1, 8 -; GFX9-NEXT: s_bfe_u32 s12, 8, 0x100000 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_lshr_b32 s7, s0, 8 ; GFX9-NEXT: s_lshr_b32 s10, s0, 24 +; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: v_mul_lo_u32 v2, v0, v1 -; GFX9-NEXT: s_lshl_b32 s1, s1, s12 ; GFX9-NEXT: s_and_b32 s7, s7, 0xff ; GFX9-NEXT: s_or_b32 s1, s10, s1 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: s_lshr_b32 s10, s2, 8 +; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: s_lshr_b32 s9, s0, 16 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s7, s12 +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 ; GFX9-NEXT: s_and_b32 s10, s10, 0xff ; GFX9-NEXT: s_or_b32 s0, s0, s7 ; GFX9-NEXT: s_and_b32 s7, s9, 0xff ; GFX9-NEXT: s_and_b32 s9, s11, 0xff ; GFX9-NEXT: s_lshr_b32 s11, s2, 16 -; GFX9-NEXT: s_lshr_b32 s13, s2, 24 +; GFX9-NEXT: s_lshr_b32 s12, s2, 24 ; GFX9-NEXT: s_and_b32 s2, s2, 0xff -; GFX9-NEXT: s_lshl_b32 s10, s10, s12 +; GFX9-NEXT: s_lshl_b32 s10, s10, 8 ; GFX9-NEXT: s_or_b32 s2, s2, s10 ; GFX9-NEXT: s_and_b32 s10, s11, 0xff ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 -; GFX9-NEXT: s_bfe_u32 s10, s10, 0x100000 +; GFX9-NEXT: s_and_b32 s10, 0xffff, s10 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: s_lshr_b32 s14, s3, 8 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX9-NEXT: s_lshr_b32 s13, s3, 8 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 ; GFX9-NEXT: s_and_b32 s3, s3, 0xff ; GFX9-NEXT: s_or_b32 s2, s2, s10 -; GFX9-NEXT: s_lshl_b32 s3, s3, s12 -; GFX9-NEXT: s_and_b32 s10, s14, 0xff -; GFX9-NEXT: s_or_b32 s3, s13, s3 -; GFX9-NEXT: s_bfe_u32 s10, s10, 0x100000 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX9-NEXT: s_lshl_b32 s3, s3, 8 +; GFX9-NEXT: s_and_b32 s10, s13, 0xff +; GFX9-NEXT: s_or_b32 s3, s12, s3 +; GFX9-NEXT: s_and_b32 s10, 0xffff, s10 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: s_or_b32 s3, s3, s10 @@ -2139,25 +2132,25 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: s_and_b32 s10, s10, 0xff ; GFX9-NEXT: s_lshr_b32 s11, s4, 16 -; GFX9-NEXT: s_lshr_b32 s13, s4, 24 +; GFX9-NEXT: s_lshr_b32 s12, s4, 24 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff -; GFX9-NEXT: s_lshl_b32 s10, s10, s12 +; GFX9-NEXT: s_lshl_b32 s10, s10, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s10 ; GFX9-NEXT: s_and_b32 s10, s11, 0xff -; GFX9-NEXT: s_bfe_u32 s10, s10, 0x100000 +; GFX9-NEXT: s_and_b32 s10, 0xffff, s10 ; GFX9-NEXT: v_mul_lo_u32 v1, v2, v1 -; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 ; GFX9-NEXT: s_or_b32 s4, s4, s10 ; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX9-NEXT: s_lshr_b32 s14, s5, 8 +; GFX9-NEXT: s_lshr_b32 s13, s5, 8 ; GFX9-NEXT: s_and_b32 s5, s5, 0xff ; GFX9-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX9-NEXT: s_lshl_b32 s5, s5, s12 -; GFX9-NEXT: s_and_b32 s10, s14, 0xff -; GFX9-NEXT: s_or_b32 s5, s13, s5 -; GFX9-NEXT: s_bfe_u32 s10, s10, 0x100000 -; GFX9-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX9-NEXT: s_lshl_b32 s5, s5, 8 +; GFX9-NEXT: s_and_b32 s10, s13, 0xff +; GFX9-NEXT: s_or_b32 s5, s12, s5 +; GFX9-NEXT: s_and_b32 s10, 0xffff, s10 +; GFX9-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX9-NEXT: s_or_b32 s5, s5, s10 @@ -2170,8 +2163,8 @@ ; GFX9-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_sub_u32_e32 v3, 23, v0 ; GFX9-NEXT: s_lshl_b32 s4, s7, 17 @@ -2187,8 +2180,8 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_bfe_u32 s9, s9, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_and_b32 s9, 0xffff, s9 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_sub_u32_e32 v2, 23, v1 ; GFX9-NEXT: s_lshl_b32 s0, s9, 17 @@ -2219,105 +2212,104 @@ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 ; GFX10-NEXT: s_lshr_b32 s9, s1, 8 -; GFX10-NEXT: s_bfe_u32 s10, 8, 0x100000 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_lshr_b32 s6, s0, 8 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX10-NEXT: s_lshr_b32 s6, s0, 8 ; GFX10-NEXT: s_lshr_b32 s8, s0, 24 -; GFX10-NEXT: s_lshl_b32 s1, s1, s10 +; GFX10-NEXT: s_lshl_b32 s1, s1, 8 ; GFX10-NEXT: s_and_b32 s6, s6, 0xff ; GFX10-NEXT: s_or_b32 s1, s8, s1 ; GFX10-NEXT: s_lshr_b32 s8, s4, 8 ; GFX10-NEXT: s_lshr_b32 s7, s0, 16 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff +; GFX10-NEXT: s_lshl_b32 s6, s6, 8 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GFX10-NEXT: s_lshl_b32 s6, s6, s10 ; GFX10-NEXT: s_and_b32 s8, s8, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s6 +; GFX10-NEXT: s_and_b32 s6, s7, 0xff ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: s_and_b32 s6, s7, 0xff ; GFX10-NEXT: s_and_b32 s7, s9, 0xff ; GFX10-NEXT: s_lshr_b32 s9, s4, 16 +; GFX10-NEXT: s_lshr_b32 s10, s4, 24 ; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 -; GFX10-NEXT: s_lshr_b32 s11, s4, 24 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff -; GFX10-NEXT: s_lshl_b32 s8, s8, s10 -; GFX10-NEXT: s_lshr_b32 s12, s5, 8 +; GFX10-NEXT: s_lshl_b32 s8, s8, 8 +; GFX10-NEXT: s_lshr_b32 s11, s5, 8 ; GFX10-NEXT: s_or_b32 s4, s4, s8 ; GFX10-NEXT: s_and_b32 s8, s9, 0xff +; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX10-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX10-NEXT: s_and_b32 s5, s5, 0xff ; GFX10-NEXT: s_lshl_b32 s8, s8, 16 -; GFX10-NEXT: s_lshl_b32 s5, s5, s10 +; GFX10-NEXT: s_lshl_b32 s5, s5, 8 ; GFX10-NEXT: s_or_b32 s4, s4, s8 +; GFX10-NEXT: s_and_b32 s8, s11, 0xff ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX10-NEXT: s_and_b32 s8, s12, 0xff -; GFX10-NEXT: s_or_b32 s5, s11, s5 -; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX10-NEXT: s_or_b32 s5, s10, s5 +; GFX10-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX10-NEXT: s_lshl_b32 s8, s8, 16 ; GFX10-NEXT: s_lshr_b32 s9, s2, 8 ; GFX10-NEXT: s_or_b32 s5, s5, s8 ; GFX10-NEXT: s_lshr_b32 s8, s2, 16 ; GFX10-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX10-NEXT: s_and_b32 s9, s9, 0xff +; GFX10-NEXT: s_lshr_b32 s10, s2, 24 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX10-NEXT: s_lshr_b32 s11, s2, 24 -; GFX10-NEXT: s_lshr_b32 s12, s3, 8 +; GFX10-NEXT: s_lshr_b32 s11, s3, 8 ; GFX10-NEXT: s_and_b32 s2, s2, 0xff -; GFX10-NEXT: s_lshl_b32 s9, s9, s10 +; GFX10-NEXT: s_lshl_b32 s9, s9, 8 ; GFX10-NEXT: s_and_b32 s8, s8, 0xff ; GFX10-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX10-NEXT: s_and_b32 s3, s3, 0xff -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0 ; GFX10-NEXT: s_or_b32 s2, s2, s9 -; GFX10-NEXT: s_bfe_u32 s4, s8, 0x100000 -; GFX10-NEXT: s_lshl_b32 s3, s3, s10 -; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s8 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_lshl_b32 s4, s4, 16 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s5, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_and_b32 s5, s12, 0xff -; GFX10-NEXT: s_lshl_b32 s4, s4, 16 -; GFX10-NEXT: s_or_b32 s3, s11, s3 -; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX10-NEXT: s_and_b32 s5, s11, 0xff +; GFX10-NEXT: s_or_b32 s3, s10, s3 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX10-NEXT: s_lshl_b32 s5, s5, 16 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 ; GFX10-NEXT: s_or_b32 s2, s2, s4 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX10-NEXT: s_or_b32 s3, s3, s5 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX10-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX10-NEXT: s_lshl_b32 s4, s6, 17 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_or_b32 s0, s4, s0 ; GFX10-NEXT: s_lshl_b32 s1, s1, 1 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 23, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s2 -; GFX10-NEXT: s_or_b32 s0, s4, s0 ; GFX10-NEXT: s_lshl_b32 s2, s7, 17 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX10-NEXT: v_lshrrev_b32_e64 v1, v1, s3 @@ -2344,96 +2336,95 @@ ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 ; GFX11-NEXT: s_lshr_b32 s6, s0, 8 -; GFX11-NEXT: s_bfe_u32 s9, 8, 0x100000 +; GFX11-NEXT: s_lshr_b32 s7, s0, 16 ; GFX11-NEXT: s_and_b32 s6, s6, 0xff ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX11-NEXT: s_lshr_b32 s7, s0, 16 ; GFX11-NEXT: s_lshr_b32 s8, s0, 24 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_lshl_b32 s6, s6, s9 -; GFX11-NEXT: s_lshr_b32 s10, s1, 8 +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-NEXT: s_lshr_b32 s9, s1, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s6 ; GFX11-NEXT: s_and_b32 s6, s7, 0xff -; GFX11-NEXT: s_and_b32 s7, s10, 0xff +; GFX11-NEXT: s_and_b32 s7, s9, 0xff +; GFX11-NEXT: s_lshr_b32 s9, s4, 8 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_dual_mul_f32 v0, 0x4f7ffffe, v0 :: v_dual_mul_f32 v1, 0x4f7ffffe, v1 -; GFX11-NEXT: s_lshr_b32 s10, s4, 8 -; GFX11-NEXT: s_lshr_b32 s11, s4, 16 -; GFX11-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-NEXT: s_lshr_b32 s10, s4, 16 +; GFX11-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-NEXT: s_and_b32 s11, s4, 0xff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX11-NEXT: s_and_b32 s12, s4, 0xff -; GFX11-NEXT: s_lshl_b32 s10, s10, s9 -; GFX11-NEXT: s_and_b32 s11, s11, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-NEXT: s_or_b32 s9, s11, s9 ; GFX11-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 -; GFX11-NEXT: s_or_b32 s10, s12, s10 ; GFX11-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 -; GFX11-NEXT: s_bfe_u32 s11, s11, 0x100000 -; GFX11-NEXT: s_bfe_u32 s10, s10, 0x100000 -; GFX11-NEXT: s_lshl_b32 s11, s11, 16 -; GFX11-NEXT: s_lshr_b32 s12, s5, 8 -; GFX11-NEXT: s_or_b32 s10, s10, s11 -; GFX11-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX11-NEXT: s_and_b32 s10, 0xffff, s10 +; GFX11-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX11-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-NEXT: s_lshr_b32 s11, s5, 8 +; GFX11-NEXT: s_or_b32 s9, s9, s10 ; GFX11-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX11-NEXT: s_lshr_b32 s4, s4, 24 -; GFX11-NEXT: s_lshl_b32 s5, s5, s9 -; GFX11-NEXT: s_and_b32 s11, s12, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-NEXT: s_and_b32 s10, s11, 0xff ; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: s_bfe_u32 s5, s11, 0x100000 -; GFX11-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s10 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 ; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX11-NEXT: v_mul_hi_u32 v2, v1, v3 -; GFX11-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: v_mul_hi_u32 v0, s10, v0 -; GFX11-NEXT: s_lshl_b32 s1, s1, s9 -; GFX11-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_lshr_b32 s10, s2, 16 +; GFX11-NEXT: v_mul_hi_u32 v0, s9, v0 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_lshr_b32 s5, s2, 24 ; GFX11-NEXT: s_or_b32 s1, s8, s1 ; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX11-NEXT: s_lshr_b32 s8, s2, 8 -; GFX11-NEXT: s_lshr_b32 s5, s2, 24 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff ; GFX11-NEXT: s_and_b32 s8, s8, 0xff ; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX11-NEXT: v_mul_hi_u32 v1, s4, v1 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_lshl_b32 s8, s8, s9 -; GFX11-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX11-NEXT: s_lshl_b32 s8, s8, 8 +; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX11-NEXT: s_or_b32 s2, s2, s8 -; GFX11-NEXT: s_and_b32 s8, s11, 0xff -; GFX11-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX11-NEXT: v_sub_nc_u32_e32 v0, s10, v0 +; GFX11-NEXT: s_and_b32 s8, s10, 0xff +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_and_b32 s8, 0xffff, s8 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, s9, v0 ; GFX11-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX11-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX11-NEXT: s_lshr_b32 s10, s3, 8 +; GFX11-NEXT: s_lshr_b32 s9, s3, 8 ; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s8, 16 ; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX11-NEXT: s_lshl_b32 s8, s8, 16 -; GFX11-NEXT: s_lshl_b32 s3, s3, s9 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s8 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, s4, v1 -; GFX11-NEXT: s_and_b32 s4, s10, 0xff +; GFX11-NEXT: s_and_b32 s4, s9, 0xff ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX11-NEXT: s_or_b32 s2, s2, s8 -; GFX11-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX11-NEXT: s_or_b32 s3, s5, s3 +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX11-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX11-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX11-NEXT: s_lshl_b32 s4, s4, 16 ; GFX11-NEXT: s_lshl_b32 s5, s6, 17 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX11-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX11-NEXT: s_or_b32 s0, s5, s0 -; GFX11-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX11-NEXT: s_lshl_b32 s1, s1, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 @@ -3250,9 +3241,9 @@ ; GFX6-NEXT: s_and_b32 s3, s2, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s2 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: s_lshl_b32 s0, s0, s2 -; GFX6-NEXT: s_bfe_u32 s2, s3, 0x100000 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s3 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_lshr_b32 s1, s1, s2 ; GFX6-NEXT: s_or_b32 s0, s0, s1 @@ -3262,12 +3253,11 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s3, s2, 15 ; GFX8-NEXT: s_andn2_b32 s2, 15, s2 -; GFX8-NEXT: s_bfe_u32 s4, 1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s4 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s3, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s3 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -3276,12 +3266,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s3, s2, 15 ; GFX9-NEXT: s_andn2_b32 s2, 15, s2 -; GFX9-NEXT: s_bfe_u32 s4, 1, 0x100000 -; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_bfe_u32 s2, s3, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s3 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog @@ -3289,12 +3278,11 @@ ; GFX10-LABEL: s_fshr_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s3, s2, 15 -; GFX10-NEXT: s_bfe_u32 s4, 1, 0x100000 ; GFX10-NEXT: s_andn2_b32 s2, 15, s2 -; GFX10-NEXT: s_lshl_b32 s0, s0, s4 -; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_lshr_b32 s1, s1, s3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -3303,12 +3291,11 @@ ; GFX11-LABEL: s_fshr_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s3, s2, 15 -; GFX11-NEXT: s_bfe_u32 s4, 1, 0x100000 ; GFX11-NEXT: s_and_not1_b32 s2, 15, s2 -; GFX11-NEXT: s_lshl_b32 s0, s0, s4 -; GFX11-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX11-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX11-NEXT: s_lshl_b32 s0, s0, s2 ; GFX11-NEXT: s_lshr_b32 s1, s1, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -3328,41 +3315,33 @@ ; ; GFX8-LABEL: s_fshr_i16_4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s2, 12, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, 4, 0x100000 -; GFX8-NEXT: s_lshr_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 12 +; GFX8-NEXT: s_lshr_b32 s1, s1, 4 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_i16_4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_bfe_u32 s2, 12, 0x100000 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_bfe_u32 s2, 4, 0x100000 -; GFX9-NEXT: s_lshr_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, 12 +; GFX9-NEXT: s_lshr_b32 s1, s1, 4 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_i16_4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_u32 s2, 12, 0x100000 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s3, 4, 0x100000 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_lshr_b32 s1, s1, s3 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_lshl_b32 s0, s0, 12 +; GFX10-NEXT: s_lshr_b32 s1, s1, 4 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_i16_4: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_bfe_u32 s2, 12, 0x100000 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX11-NEXT: s_bfe_u32 s3, 4, 0x100000 -; GFX11-NEXT: s_lshl_b32 s0, s0, s2 -; GFX11-NEXT: s_lshr_b32 s1, s1, s3 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_lshl_b32 s0, s0, 12 +; GFX11-NEXT: s_lshr_b32 s1, s1, 4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -3380,41 +3359,33 @@ ; ; GFX8-LABEL: s_fshr_i16_5: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s2, 11, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, 5, 0x100000 -; GFX8-NEXT: s_lshr_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 11 +; GFX8-NEXT: s_lshr_b32 s1, s1, 5 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_i16_5: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_bfe_u32 s2, 11, 0x100000 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_bfe_u32 s2, 5, 0x100000 -; GFX9-NEXT: s_lshr_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, 11 +; GFX9-NEXT: s_lshr_b32 s1, s1, 5 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_i16_5: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_u32 s2, 11, 0x100000 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s3, 5, 0x100000 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_lshr_b32 s1, s1, s3 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_lshl_b32 s0, s0, 11 +; GFX10-NEXT: s_lshr_b32 s1, s1, 5 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_i16_5: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_bfe_u32 s2, 11, 0x100000 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX11-NEXT: s_bfe_u32 s3, 5, 0x100000 -; GFX11-NEXT: s_lshl_b32 s0, s0, s2 -; GFX11-NEXT: s_lshr_b32 s1, s1, s3 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_lshl_b32 s0, s0, 11 +; GFX11-NEXT: s_lshr_b32 s1, s1, 5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -3430,9 +3401,9 @@ ; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_bfe_u32 v2, v3, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3594,9 +3565,9 @@ ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: s_and_b32 s0, s1, 0xffff ; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3606,9 +3577,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: s_bfe_u32 s2, 1, 0x100000 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 ; GFX8-NEXT: v_lshlrev_b16_e64 v0, v0, s0 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3618,9 +3588,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX9-NEXT: s_bfe_u32 s2, 1, 0x100000 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 ; GFX9-NEXT: v_lshlrev_b16_e64 v0, v0, s0 ; GFX9-NEXT: v_lshrrev_b16_e64 v1, v1, s1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3630,8 +3599,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX10-NEXT: s_bfe_u32 s2, 1, 0x100000 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX10-NEXT: v_lshrrev_b16 v0, v0, s1 ; GFX10-NEXT: v_lshlrev_b16 v1, v1, s0 @@ -3642,14 +3610,12 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: v_xor_b32_e32 v1, -1, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX11-NEXT: s_bfe_u32 s2, 1, 0x100000 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_lshl_b32 s0, s0, s2 -; GFX11-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX11-NEXT: v_lshrrev_b16 v0, v0, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b16 v1, v1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-NEXT: ; return to shader part epilog %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 %amt) @@ -3663,9 +3629,9 @@ ; GFX6-NEXT: s_and_b32 s2, s1, 15 ; GFX6-NEXT: s_andn2_b32 s1, 15, s1 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: s_lshl_b32 s0, s0, s1 -; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s2 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s1, v0 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 @@ -3675,9 +3641,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s2, s1, 15 ; GFX8-NEXT: s_andn2_b32 s1, 15, s1 -; GFX8-NEXT: s_bfe_u32 s3, 1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s3 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, s2, v0 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 @@ -3687,9 +3652,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s2, s1, 15 ; GFX9-NEXT: s_andn2_b32 s1, 15, s1 -; GFX9-NEXT: s_bfe_u32 s3, 1, 0x100000 -; GFX9-NEXT: s_lshl_b32 s0, s0, s3 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshl_b32 s0, s0, s1 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, s2, v0 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 @@ -3698,11 +3662,10 @@ ; GFX10-LABEL: v_fshr_i16_svs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s2, s1, 15 -; GFX10-NEXT: s_bfe_u32 s3, 1, 0x100000 ; GFX10-NEXT: s_andn2_b32 s1, 15, s1 ; GFX10-NEXT: v_lshrrev_b16 v0, s2, v0 -; GFX10-NEXT: s_lshl_b32 s0, s0, s3 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshl_b32 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -3710,11 +3673,10 @@ ; GFX11-LABEL: v_fshr_i16_svs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s2, s1, 15 -; GFX11-NEXT: s_bfe_u32 s3, 1, 0x100000 ; GFX11-NEXT: s_and_not1_b32 s1, 15, s1 ; GFX11-NEXT: v_lshrrev_b16 v0, s2, v0 -; GFX11-NEXT: s_lshl_b32 s0, s0, s3 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s0, s0, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -3731,9 +3693,9 @@ ; GFX6-NEXT: s_and_b32 s2, s1, 15 ; GFX6-NEXT: s_andn2_b32 s1, 15, s1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s1, v0 -; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s2 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshr_b32 s0, s0, s1 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 @@ -3745,8 +3707,8 @@ ; GFX8-NEXT: s_andn2_b32 s1, 15, s1 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, s1, v0 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x100000 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s2 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: ; return to shader part epilog @@ -3757,8 +3719,8 @@ ; GFX9-NEXT: s_andn2_b32 s1, 15, s1 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, s1, v0 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x100000 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s2 ; GFX9-NEXT: s_lshr_b32 s0, s0, s1 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -3768,8 +3730,8 @@ ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 ; GFX10-NEXT: s_andn2_b32 s2, 15, s1 ; GFX10-NEXT: s_and_b32 s1, s1, 15 -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: v_lshlrev_b16 v0, s2, v0 ; GFX10-NEXT: s_lshr_b32 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 @@ -3780,8 +3742,8 @@ ; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 ; GFX11-NEXT: s_and_not1_b32 s2, 15, s1 ; GFX11-NEXT: s_and_b32 s1, s1, 15 -; GFX11-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: v_lshlrev_b16 v0, s2, v0 ; GFX11-NEXT: s_lshr_b32 s0, s0, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -3798,79 +3760,75 @@ ; GFX6-NEXT: s_lshl_b32 s5, s5, 16 ; GFX6-NEXT: s_and_b32 s4, s4, 0xffff ; GFX6-NEXT: s_or_b32 s4, s5, s4 -; GFX6-NEXT: s_bfe_u32 s5, 1, 0x100000 -; GFX6-NEXT: s_lshl_b32 s0, s0, s5 -; GFX6-NEXT: s_bfe_u32 s6, s2, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s7, 14, 0x100000 -; GFX6-NEXT: s_lshl_b32 s1, s1, s5 +; GFX6-NEXT: s_bfe_u32 s5, s2, 0xf0001 +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: s_lshr_b32 s5, s5, 14 +; GFX6-NEXT: s_or_b32 s0, s0, s5 ; GFX6-NEXT: s_bfe_u32 s5, s3, 0xf0001 -; GFX6-NEXT: s_lshr_b32 s6, s6, s7 -; GFX6-NEXT: s_lshr_b32 s5, s5, s7 +; GFX6-NEXT: s_lshl_b32 s1, s1, 1 +; GFX6-NEXT: s_lshr_b32 s5, s5, 14 ; GFX6-NEXT: s_xor_b32 s4, s4, -1 -; GFX6-NEXT: s_or_b32 s0, s0, s6 ; GFX6-NEXT: s_or_b32 s1, s1, s5 ; GFX6-NEXT: s_lshl_b32 s2, s2, 1 ; GFX6-NEXT: s_lshr_b32 s5, s4, 16 ; GFX6-NEXT: s_and_b32 s6, s4, 15 ; GFX6-NEXT: s_andn2_b32 s4, 15, s4 -; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_lshl_b32 s0, s0, s6 ; GFX6-NEXT: s_lshr_b32 s2, s2, s4 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s5, 15 ; GFX6-NEXT: s_lshl_b32 s3, s3, 1 ; GFX6-NEXT: s_andn2_b32 s4, 15, s5 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, s2 ; GFX6-NEXT: s_bfe_u32 s2, s3, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s3, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s3, 0xffff, s4 ; GFX6-NEXT: s_lshr_b32 s2, s2, s3 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s5, 1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s6, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s7, 15, 0x100000 +; GFX8-NEXT: s_and_b32 s5, 0xffff, s1 ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s1, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, s5 -; GFX8-NEXT: s_lshr_b32 s6, s6, s7 -; GFX8-NEXT: s_or_b32 s0, s0, s6 -; GFX8-NEXT: s_lshl_b32 s3, s3, s5 -; GFX8-NEXT: s_lshr_b32 s6, s4, s7 -; GFX8-NEXT: s_lshl_b32 s1, s1, s5 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_lshr_b32 s5, s5, 15 +; GFX8-NEXT: s_or_b32 s0, s0, s5 +; GFX8-NEXT: s_lshl_b32 s3, s3, 1 +; GFX8-NEXT: s_lshr_b32 s5, s4, 15 +; GFX8-NEXT: s_lshl_b32 s1, s1, 1 ; GFX8-NEXT: s_xor_b32 s2, s2, -1 -; GFX8-NEXT: s_or_b32 s3, s3, s6 -; GFX8-NEXT: s_lshr_b32 s6, s2, 16 -; GFX8-NEXT: s_and_b32 s7, s2, 15 +; GFX8-NEXT: s_or_b32 s3, s3, s5 +; GFX8-NEXT: s_lshr_b32 s5, s2, 16 +; GFX8-NEXT: s_and_b32 s6, s2, 15 ; GFX8-NEXT: s_andn2_b32 s2, 15, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX8-NEXT: s_lshr_b32 s1, s1, s5 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s6 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s6, 15 -; GFX8-NEXT: s_lshl_b32 s4, s4, s5 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_andn2_b32 s2, 15, s6 +; GFX8-NEXT: s_and_b32 s1, s5, 15 +; GFX8-NEXT: s_lshl_b32 s4, s4, 1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_andn2_b32 s2, 15, s5 ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_bfe_u32 s3, s4, 0x100000 -; GFX8-NEXT: s_lshr_b32 s3, s3, s5 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s4 +; GFX8-NEXT: s_lshr_b32 s3, s3, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshr_b32 s2, s3, s2 ; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -3953,15 +3911,13 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000 ; GFX6-NEXT: v_bfe_u32 v5, v2, 1, 15 -; GFX6-NEXT: s_bfe_u32 s5, 14, 0x100000 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, s5, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 14, v5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX6-NEXT: v_bfe_u32 v5, v3, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, s5, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 14, v5 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v4 @@ -3969,9 +3925,9 @@ ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 15 -; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 @@ -3979,10 +3935,10 @@ ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 15 -; GFX6-NEXT: v_bfe_u32 v3, v4, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4016,8 +3972,8 @@ ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v4, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4068,17 +4024,13 @@ ; GFX6-LABEL: v_fshr_v2i16_4_8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s4, 12, 0x100000 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 15 -; GFX6-NEXT: s_bfe_u32 s4, 3, 0x100000 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2 -; GFX6-NEXT: s_bfe_u32 s4, 8, 0x100000 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 12, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 3, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 15 -; GFX6-NEXT: s_bfe_u32 s4, 7, 0x100000 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 7, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -4093,8 +4045,8 @@ ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4136,79 +4088,75 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000 -; GFX6-NEXT: s_bfe_u32 s5, s2, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s6, 14, 0x100000 +; GFX6-NEXT: s_bfe_u32 s4, s2, 0xf0001 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX6-NEXT: s_lshl_b32 s0, s0, s4 -; GFX6-NEXT: s_lshr_b32 s5, s5, s6 +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: s_lshr_b32 s4, s4, 14 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX6-NEXT: s_or_b32 s0, s0, s5 +; GFX6-NEXT: s_or_b32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s2, s2, 1 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 ; GFX6-NEXT: s_bfe_u32 s0, s2, 0xf0001 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 -; GFX6-NEXT: s_lshl_b32 s1, s1, s4 ; GFX6-NEXT: s_bfe_u32 s4, s3, 0xf0001 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v1 ; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX6-NEXT: s_lshr_b32 s4, s4, s6 +; GFX6-NEXT: s_lshl_b32 s1, s1, 1 +; GFX6-NEXT: s_lshr_b32 s4, s4, 14 ; GFX6-NEXT: s_lshl_b32 s3, s3, 1 ; GFX6-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX6-NEXT: s_or_b32 s1, s1, s4 -; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: s_bfe_u32 s0, s3, 0xf0001 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshl_b32_e32 v2, s1, v2 ; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshr_v2i16_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s4, 1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s6, 15, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s1 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, s4 -; GFX8-NEXT: s_lshr_b32 s5, s5, s6 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_lshr_b32 s4, s4, 15 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s5 -; GFX8-NEXT: s_lshl_b32 s1, s1, s4 +; GFX8-NEXT: s_or_b32 s0, s0, s4 +; GFX8-NEXT: s_lshl_b32 s1, s1, 1 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s0 -; GFX8-NEXT: s_bfe_u32 s0, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s1 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX8-NEXT: s_lshr_b32 s0, s0, s4 -; GFX8-NEXT: s_lshr_b32 s5, s3, s6 -; GFX8-NEXT: s_lshl_b32 s3, s3, s4 +; GFX8-NEXT: s_lshr_b32 s0, s0, 1 +; GFX8-NEXT: s_lshr_b32 s4, s3, 15 +; GFX8-NEXT: s_lshl_b32 s3, s3, 1 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 -; GFX8-NEXT: s_lshl_b32 s2, s2, s4 +; GFX8-NEXT: s_lshl_b32 s2, s2, 1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v1 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX8-NEXT: s_bfe_u32 s0, s3, 0x100000 -; GFX8-NEXT: s_or_b32 s2, s2, s5 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s3 +; GFX8-NEXT: s_or_b32 s2, s2, s4 ; GFX8-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX8-NEXT: s_lshr_b32 s0, s0, s4 +; GFX8-NEXT: s_lshr_b32 s0, s0, 1 ; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s0 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog ; @@ -4262,18 +4210,16 @@ define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <2 x i16> inreg %amt) { ; GFX6-LABEL: v_fshr_v2i16_svs: ; GFX6: ; %bb.0: +; GFX6-NEXT: v_bfe_u32 v2, v0, 1, 15 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_and_b32 s2, s2, 0xffff -; GFX6-NEXT: s_or_b32 s2, s3, s2 -; GFX6-NEXT: s_bfe_u32 s3, 1, 0x100000 -; GFX6-NEXT: v_bfe_u32 v2, v0, 1, 15 -; GFX6-NEXT: s_bfe_u32 s4, 14, 0x100000 -; GFX6-NEXT: s_lshl_b32 s0, s0, s3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2 +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 14, v2 ; GFX6-NEXT: v_bfe_u32 v3, v1, 1, 15 +; GFX6-NEXT: s_or_b32 s2, s3, s2 ; GFX6-NEXT: v_or_b32_e32 v2, s0, v2 -; GFX6-NEXT: s_lshl_b32 s0, s1, s3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, s4, v3 +; GFX6-NEXT: s_lshl_b32 s0, s1, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 14, v3 ; GFX6-NEXT: v_or_b32_e32 v3, s0, v3 ; GFX6-NEXT: s_xor_b32 s0, s2, -1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -4281,35 +4227,34 @@ ; GFX6-NEXT: s_and_b32 s2, s0, 15 ; GFX6-NEXT: s_andn2_b32 s0, 15, s0 ; GFX6-NEXT: v_bfe_u32 v0, v0, 1, 15 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s1, 15 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, s2, v2 ; GFX6-NEXT: s_andn2_b32 s1, 15, s1 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, s0, v3 ; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 15 -; GFX6-NEXT: s_bfe_u32 s0, s1, 0x100000 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshr_v2i16_svs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s3, 1, 0x100000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, s3 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 15, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 15 ; GFX8-NEXT: v_or_b32_e32 v1, s0, v1 -; GFX8-NEXT: s_lshl_b32 s0, s2, s3 +; GFX8-NEXT: s_lshl_b32 s0, s2, 1 ; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v2, s0, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v0 @@ -4326,11 +4271,11 @@ ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 1, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, s0, v2 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, s1, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v1, s2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, s2, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog ; @@ -4397,77 +4342,73 @@ ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_and_b32 s2, s2, 0xffff ; GFX6-NEXT: s_or_b32 s2, s3, s2 -; GFX6-NEXT: s_bfe_u32 s3, 1, 0x100000 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s3, v0 -; GFX6-NEXT: s_bfe_u32 s4, s0, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s5, 14, 0x100000 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, s3, v1 +; GFX6-NEXT: s_bfe_u32 s3, s0, 0xf0001 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: s_lshr_b32 s3, s3, 14 +; GFX6-NEXT: v_or_b32_e32 v0, s3, v0 ; GFX6-NEXT: s_bfe_u32 s3, s1, 0xf0001 -; GFX6-NEXT: s_lshr_b32 s4, s4, s5 -; GFX6-NEXT: s_lshr_b32 s3, s3, s5 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: s_lshr_b32 s3, s3, 14 ; GFX6-NEXT: s_xor_b32 s2, s2, -1 -; GFX6-NEXT: v_or_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s3, v1 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: s_lshr_b32 s3, s2, 16 ; GFX6-NEXT: s_and_b32 s4, s2, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s2 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX6-NEXT: s_lshr_b32 s0, s0, s2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s3, 15 ; GFX6-NEXT: s_lshl_b32 s1, s1, 1 ; GFX6-NEXT: s_andn2_b32 s2, 15, s3 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1 ; GFX6-NEXT: s_bfe_u32 s0, s1, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s2 ; GFX6-NEXT: s_lshr_b32 s0, s0, s1 ; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshr_v2i16_vss: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s3, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, 15, 0x100000 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s0 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 1, v0 -; GFX8-NEXT: s_lshr_b32 s3, s3, s4 +; GFX8-NEXT: s_lshr_b32 s3, s3, 15 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-NEXT: v_or_b32_e32 v1, s3, v1 ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: s_lshr_b32 s3, s2, s4 -; GFX8-NEXT: v_or_b32_e32 v0, s3, v0 -; GFX8-NEXT: s_bfe_u32 s3, 1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s3 +; GFX8-NEXT: s_lshr_b32 s3, s2, 15 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 ; GFX8-NEXT: s_xor_b32 s1, s1, -1 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 -; GFX8-NEXT: s_and_b32 s5, s1, 15 +; GFX8-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_and_b32 s4, s1, 15 ; GFX8-NEXT: s_andn2_b32 s1, 15, s1 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshr_b32 s0, s0, s3 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: v_lshlrev_b16_e32 v1, s5, v1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshr_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, s4, v1 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 -; GFX8-NEXT: s_lshl_b32 s2, s2, s3 +; GFX8-NEXT: s_lshl_b32 s2, s2, 1 ; GFX8-NEXT: v_or_b32_e32 v1, s0, v1 -; GFX8-NEXT: s_and_b32 s0, s4, 15 -; GFX8-NEXT: s_andn2_b32 s1, 15, s4 +; GFX8-NEXT: s_and_b32 s0, s3, 15 +; GFX8-NEXT: s_andn2_b32 s1, 15, s3 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, s0, v0 -; GFX8-NEXT: s_bfe_u32 s0, s2, 0x100000 -; GFX8-NEXT: s_lshr_b32 s0, s0, s3 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s2 +; GFX8-NEXT: s_lshr_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog ; @@ -4530,113 +4471,109 @@ ; GFX6-NEXT: s_lshl_b32 s7, s7, 16 ; GFX6-NEXT: s_or_b32 s6, s6, s7 ; GFX6-NEXT: s_and_b32 s7, s8, 0xffff -; GFX6-NEXT: s_bfe_u32 s8, 1, 0x100000 -; GFX6-NEXT: s_bfe_u32 s9, s3, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s10, 14, 0x100000 -; GFX6-NEXT: s_lshl_b32 s0, s0, s8 -; GFX6-NEXT: s_lshr_b32 s9, s9, s10 -; GFX6-NEXT: s_or_b32 s0, s0, s9 -; GFX6-NEXT: s_bfe_u32 s9, s4, 0xf0001 -; GFX6-NEXT: s_lshl_b32 s1, s1, s8 -; GFX6-NEXT: s_lshr_b32 s9, s9, s10 +; GFX6-NEXT: s_bfe_u32 s8, s3, 0xf0001 +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: s_lshr_b32 s8, s8, 14 +; GFX6-NEXT: s_or_b32 s0, s0, s8 +; GFX6-NEXT: s_bfe_u32 s8, s4, 0xf0001 +; GFX6-NEXT: s_lshl_b32 s1, s1, 1 +; GFX6-NEXT: s_lshr_b32 s8, s8, 14 ; GFX6-NEXT: s_xor_b32 s6, s6, -1 -; GFX6-NEXT: s_or_b32 s1, s1, s9 +; GFX6-NEXT: s_or_b32 s1, s1, s8 ; GFX6-NEXT: s_lshl_b32 s3, s3, 1 -; GFX6-NEXT: s_lshr_b32 s9, s6, 16 -; GFX6-NEXT: s_and_b32 s11, s6, 15 +; GFX6-NEXT: s_lshr_b32 s8, s6, 16 +; GFX6-NEXT: s_and_b32 s9, s6, 15 ; GFX6-NEXT: s_andn2_b32 s6, 15, s6 -; GFX6-NEXT: s_bfe_u32 s11, s11, 0x100000 +; GFX6-NEXT: s_and_b32 s9, 0xffff, s9 ; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX6-NEXT: s_lshl_b32 s0, s0, s11 +; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX6-NEXT: s_lshl_b32 s0, s0, s9 ; GFX6-NEXT: s_lshr_b32 s3, s3, s6 ; GFX6-NEXT: s_or_b32 s0, s0, s3 -; GFX6-NEXT: s_and_b32 s3, s9, 15 +; GFX6-NEXT: s_and_b32 s3, s8, 15 ; GFX6-NEXT: s_lshl_b32 s4, s4, 1 -; GFX6-NEXT: s_andn2_b32 s6, 15, s9 -; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX6-NEXT: s_andn2_b32 s6, 15, s8 +; GFX6-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX6-NEXT: s_lshl_b32 s1, s1, s3 ; GFX6-NEXT: s_bfe_u32 s3, s4, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s4, s6, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s6 ; GFX6-NEXT: s_lshr_b32 s3, s3, s4 ; GFX6-NEXT: s_or_b32 s1, s1, s3 ; GFX6-NEXT: s_bfe_u32 s3, s5, 0xf0001 -; GFX6-NEXT: s_lshl_b32 s2, s2, s8 -; GFX6-NEXT: s_lshr_b32 s3, s3, s10 +; GFX6-NEXT: s_lshl_b32 s2, s2, 1 +; GFX6-NEXT: s_lshr_b32 s3, s3, 14 ; GFX6-NEXT: s_xor_b32 s4, s7, -1 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s5, 1 ; GFX6-NEXT: s_and_b32 s5, s4, 15 ; GFX6-NEXT: s_andn2_b32 s4, 15, s4 -; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX6-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_lshl_b32 s2, s2, s5 ; GFX6-NEXT: s_lshr_b32 s3, s3, s4 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_v3i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s8, 1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s10, 15, 0x100000 +; GFX8-NEXT: s_and_b32 s8, 0xffff, s2 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 ; GFX8-NEXT: s_lshr_b32 s7, s2, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, s8 -; GFX8-NEXT: s_lshr_b32 s9, s9, s10 -; GFX8-NEXT: s_or_b32 s0, s0, s9 -; GFX8-NEXT: s_lshl_b32 s6, s6, s8 -; GFX8-NEXT: s_lshr_b32 s9, s7, s10 -; GFX8-NEXT: s_lshl_b32 s2, s2, s8 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_lshr_b32 s8, s8, 15 +; GFX8-NEXT: s_or_b32 s0, s0, s8 +; GFX8-NEXT: s_lshl_b32 s6, s6, 1 +; GFX8-NEXT: s_lshr_b32 s8, s7, 15 +; GFX8-NEXT: s_lshl_b32 s2, s2, 1 ; GFX8-NEXT: s_xor_b32 s4, s4, -1 -; GFX8-NEXT: s_or_b32 s6, s6, s9 -; GFX8-NEXT: s_lshr_b32 s9, s4, 16 -; GFX8-NEXT: s_and_b32 s11, s4, 15 +; GFX8-NEXT: s_or_b32 s6, s6, s8 +; GFX8-NEXT: s_lshr_b32 s8, s4, 16 +; GFX8-NEXT: s_and_b32 s9, s4, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s4 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s11, s11, 0x100000 -; GFX8-NEXT: s_lshr_b32 s2, s2, s8 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s11 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX8-NEXT: s_lshr_b32 s2, s2, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_lshl_b32 s0, s0, s9 ; GFX8-NEXT: s_lshr_b32 s2, s2, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s9, 15 -; GFX8-NEXT: s_lshl_b32 s7, s7, s8 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_andn2_b32 s4, 15, s9 +; GFX8-NEXT: s_and_b32 s2, s8, 15 +; GFX8-NEXT: s_lshl_b32 s7, s7, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_andn2_b32 s4, 15, s8 ; GFX8-NEXT: s_lshl_b32 s2, s6, s2 -; GFX8-NEXT: s_bfe_u32 s6, s7, 0x100000 -; GFX8-NEXT: s_lshr_b32 s6, s6, s8 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s7 +; GFX8-NEXT: s_lshr_b32 s6, s6, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshr_b32 s4, s6, s4 ; GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: s_bfe_u32 s4, s3, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s3 ; GFX8-NEXT: s_and_b32 s5, s5, 0xffff -; GFX8-NEXT: s_lshl_b32 s1, s1, s8 -; GFX8-NEXT: s_lshr_b32 s4, s4, s10 +; GFX8-NEXT: s_lshl_b32 s1, s1, 1 +; GFX8-NEXT: s_lshr_b32 s4, s4, 15 ; GFX8-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NEXT: s_lshl_b32 s3, s3, s8 +; GFX8-NEXT: s_lshl_b32 s3, s3, 1 ; GFX8-NEXT: s_xor_b32 s4, s5, -1 ; GFX8-NEXT: s_and_b32 s5, s4, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s4 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX8-NEXT: s_lshr_b32 s3, s3, s8 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX8-NEXT: s_lshr_b32 s3, s3, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, s5 ; GFX8-NEXT: s_lshr_b32 s3, s3, s4 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_or_b32 s1, s1, s3 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_v3i16: @@ -4788,15 +4725,13 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_or_b32_e32 v6, v6, v7 ; GFX6-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000 ; GFX6-NEXT: v_bfe_u32 v8, v3, 1, 15 -; GFX6-NEXT: s_bfe_u32 s5, 14, 0x100000 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v8, s5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 14, v8 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v8 ; GFX6-NEXT: v_bfe_u32 v8, v4, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v8, s5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 14, v8 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v6 @@ -4804,9 +4739,9 @@ ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_bfe_u32 v9, v9, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX6-NEXT: v_bfe_u32 v3, v3, 1, 15 -; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v9, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 @@ -4814,24 +4749,24 @@ ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_bfe_u32 v3, v3, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v3, v1 ; GFX6-NEXT: v_bfe_u32 v3, v4, 1, 15 -; GFX6-NEXT: v_bfe_u32 v4, v6, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: v_bfe_u32 v3, v5, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, s4, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, s5, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 14, v3 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v7 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 15, v4 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_bfe_u32 v5, v5, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX6-NEXT: v_bfe_u32 v3, v3, 1, 15 -; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 @@ -4878,11 +4813,11 @@ ; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, v5, v1 ; GFX8-NEXT: v_lshrrev_b16_e32 v3, v4, v3 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_v3i16: @@ -4959,47 +4894,45 @@ ; GFX6-NEXT: s_lshl_b32 s9, s11, 16 ; GFX6-NEXT: s_and_b32 s10, s10, 0xffff ; GFX6-NEXT: s_or_b32 s9, s9, s10 -; GFX6-NEXT: s_bfe_u32 s10, 1, 0x100000 -; GFX6-NEXT: s_bfe_u32 s11, s4, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s12, 14, 0x100000 -; GFX6-NEXT: s_lshl_b32 s0, s0, s10 -; GFX6-NEXT: s_lshr_b32 s11, s11, s12 -; GFX6-NEXT: s_or_b32 s0, s0, s11 -; GFX6-NEXT: s_bfe_u32 s11, s5, 0xf0001 -; GFX6-NEXT: s_lshl_b32 s1, s1, s10 -; GFX6-NEXT: s_lshr_b32 s11, s11, s12 +; GFX6-NEXT: s_bfe_u32 s10, s4, 0xf0001 +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: s_lshr_b32 s10, s10, 14 +; GFX6-NEXT: s_or_b32 s0, s0, s10 +; GFX6-NEXT: s_bfe_u32 s10, s5, 0xf0001 +; GFX6-NEXT: s_lshl_b32 s1, s1, 1 +; GFX6-NEXT: s_lshr_b32 s10, s10, 14 ; GFX6-NEXT: s_xor_b32 s8, s8, -1 -; GFX6-NEXT: s_or_b32 s1, s1, s11 +; GFX6-NEXT: s_or_b32 s1, s1, s10 ; GFX6-NEXT: s_lshl_b32 s4, s4, 1 -; GFX6-NEXT: s_lshr_b32 s11, s8, 16 -; GFX6-NEXT: s_and_b32 s13, s8, 15 +; GFX6-NEXT: s_lshr_b32 s10, s8, 16 +; GFX6-NEXT: s_and_b32 s11, s8, 15 ; GFX6-NEXT: s_andn2_b32 s8, 15, s8 -; GFX6-NEXT: s_bfe_u32 s13, s13, 0x100000 +; GFX6-NEXT: s_and_b32 s11, 0xffff, s11 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX6-NEXT: s_lshl_b32 s0, s0, s13 +; GFX6-NEXT: s_and_b32 s8, 0xffff, s8 +; GFX6-NEXT: s_lshl_b32 s0, s0, s11 ; GFX6-NEXT: s_lshr_b32 s4, s4, s8 ; GFX6-NEXT: s_or_b32 s0, s0, s4 -; GFX6-NEXT: s_and_b32 s4, s11, 15 +; GFX6-NEXT: s_and_b32 s4, s10, 15 ; GFX6-NEXT: s_lshl_b32 s5, s5, 1 -; GFX6-NEXT: s_andn2_b32 s8, 15, s11 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_andn2_b32 s8, 15, s10 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, s4 ; GFX6-NEXT: s_bfe_u32 s4, s5, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s5, s8, 0x100000 +; GFX6-NEXT: s_and_b32 s5, 0xffff, s8 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 ; GFX6-NEXT: s_or_b32 s1, s1, s4 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s2, s10 +; GFX6-NEXT: s_lshl_b32 s1, s2, 1 ; GFX6-NEXT: s_bfe_u32 s2, s6, 0xf0001 -; GFX6-NEXT: s_lshr_b32 s2, s2, s12 +; GFX6-NEXT: s_lshr_b32 s2, s2, 14 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_lshl_b32 s2, s3, s10 +; GFX6-NEXT: s_lshl_b32 s2, s3, 1 ; GFX6-NEXT: s_bfe_u32 s3, s7, 0xf0001 -; GFX6-NEXT: s_lshr_b32 s3, s3, s12 +; GFX6-NEXT: s_lshr_b32 s3, s3, 14 ; GFX6-NEXT: s_xor_b32 s5, s9, -1 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s6, 1 @@ -5007,98 +4940,96 @@ ; GFX6-NEXT: s_lshr_b32 s6, s5, 16 ; GFX6-NEXT: s_and_b32 s7, s5, 15 ; GFX6-NEXT: s_andn2_b32 s5, 15, s5 -; GFX6-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX6-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX6-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX6-NEXT: s_lshl_b32 s1, s1, s7 ; GFX6-NEXT: s_lshr_b32 s3, s3, s5 ; GFX6-NEXT: s_or_b32 s1, s1, s3 ; GFX6-NEXT: s_and_b32 s3, s6, 15 ; GFX6-NEXT: s_andn2_b32 s5, 15, s6 -; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX6-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX6-NEXT: s_lshl_b32 s2, s2, s3 ; GFX6-NEXT: s_bfe_u32 s3, s4, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s4, s5, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s5 ; GFX6-NEXT: s_lshr_b32 s3, s3, s4 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s8, 1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s10, 15, 0x100000 +; GFX8-NEXT: s_and_b32 s8, 0xffff, s2 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 ; GFX8-NEXT: s_lshr_b32 s7, s2, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, s8 -; GFX8-NEXT: s_lshr_b32 s9, s9, s10 -; GFX8-NEXT: s_or_b32 s0, s0, s9 -; GFX8-NEXT: s_lshl_b32 s6, s6, s8 -; GFX8-NEXT: s_lshr_b32 s9, s7, s10 -; GFX8-NEXT: s_lshl_b32 s2, s2, s8 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_lshr_b32 s8, s8, 15 +; GFX8-NEXT: s_or_b32 s0, s0, s8 +; GFX8-NEXT: s_lshl_b32 s6, s6, 1 +; GFX8-NEXT: s_lshr_b32 s8, s7, 15 +; GFX8-NEXT: s_lshl_b32 s2, s2, 1 ; GFX8-NEXT: s_xor_b32 s4, s4, -1 -; GFX8-NEXT: s_or_b32 s6, s6, s9 -; GFX8-NEXT: s_lshr_b32 s9, s4, 16 -; GFX8-NEXT: s_and_b32 s11, s4, 15 +; GFX8-NEXT: s_or_b32 s6, s6, s8 +; GFX8-NEXT: s_lshr_b32 s8, s4, 16 +; GFX8-NEXT: s_and_b32 s9, s4, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s4 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s11, s11, 0x100000 -; GFX8-NEXT: s_lshr_b32 s2, s2, s8 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s11 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX8-NEXT: s_lshr_b32 s2, s2, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_lshl_b32 s0, s0, s9 ; GFX8-NEXT: s_lshr_b32 s2, s2, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s9, 15 -; GFX8-NEXT: s_lshl_b32 s7, s7, s8 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_andn2_b32 s4, 15, s9 +; GFX8-NEXT: s_and_b32 s2, s8, 15 +; GFX8-NEXT: s_lshl_b32 s7, s7, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_andn2_b32 s4, 15, s8 ; GFX8-NEXT: s_lshl_b32 s2, s6, s2 -; GFX8-NEXT: s_bfe_u32 s6, s7, 0x100000 -; GFX8-NEXT: s_lshr_b32 s6, s6, s8 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s7 +; GFX8-NEXT: s_lshr_b32 s6, s6, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshr_b32 s4, s6, s4 ; GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_bfe_u32 s6, s3, 0x100000 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s3 ; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_lshr_b32 s2, s1, 16 ; GFX8-NEXT: s_lshr_b32 s4, s3, 16 -; GFX8-NEXT: s_lshl_b32 s1, s1, s8 -; GFX8-NEXT: s_lshr_b32 s6, s6, s10 +; GFX8-NEXT: s_lshl_b32 s1, s1, 1 +; GFX8-NEXT: s_lshr_b32 s6, s6, 15 ; GFX8-NEXT: s_or_b32 s1, s1, s6 -; GFX8-NEXT: s_lshl_b32 s2, s2, s8 -; GFX8-NEXT: s_lshr_b32 s6, s4, s10 -; GFX8-NEXT: s_lshl_b32 s3, s3, s8 +; GFX8-NEXT: s_lshl_b32 s2, s2, 1 +; GFX8-NEXT: s_lshr_b32 s6, s4, 15 +; GFX8-NEXT: s_lshl_b32 s3, s3, 1 ; GFX8-NEXT: s_xor_b32 s5, s5, -1 ; GFX8-NEXT: s_or_b32 s2, s2, s6 ; GFX8-NEXT: s_lshr_b32 s6, s5, 16 ; GFX8-NEXT: s_and_b32 s7, s5, 15 ; GFX8-NEXT: s_andn2_b32 s5, 15, s5 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX8-NEXT: s_lshr_b32 s3, s3, s8 -; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX8-NEXT: s_lshr_b32 s3, s3, 1 +; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX8-NEXT: s_lshl_b32 s1, s1, s7 ; GFX8-NEXT: s_lshr_b32 s3, s3, s5 ; GFX8-NEXT: s_or_b32 s1, s1, s3 ; GFX8-NEXT: s_and_b32 s3, s6, 15 -; GFX8-NEXT: s_lshl_b32 s4, s4, s8 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_lshl_b32 s4, s4, 1 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_andn2_b32 s5, 15, s6 ; GFX8-NEXT: s_lshl_b32 s2, s2, s3 -; GFX8-NEXT: s_bfe_u32 s3, s4, 0x100000 -; GFX8-NEXT: s_lshr_b32 s3, s3, s8 -; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s4 +; GFX8-NEXT: s_lshr_b32 s3, s3, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s5 ; GFX8-NEXT: s_lshr_b32 s3, s3, s4 ; GFX8-NEXT: s_or_b32 s2, s2, s3 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog @@ -5237,15 +5168,13 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v11 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX6-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000 ; GFX6-NEXT: v_bfe_u32 v10, v4, 1, 15 -; GFX6-NEXT: s_bfe_u32 s5, 14, 0x100000 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v10, s5, v10 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 14, v10 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v10 ; GFX6-NEXT: v_bfe_u32 v10, v5, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v10, s5, v10 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 14, v10 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v10 ; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v8 @@ -5253,9 +5182,9 @@ ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 -; GFX6-NEXT: v_bfe_u32 v11, v11, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX6-NEXT: v_bfe_u32 v4, v4, 1, 15 -; GFX6-NEXT: v_bfe_u32 v8, v8, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v11, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 @@ -5263,19 +5192,19 @@ ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v10 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5 ; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 -; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 ; GFX6-NEXT: v_bfe_u32 v4, v5, 1, 15 -; GFX6-NEXT: v_bfe_u32 v5, v8, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX6-NEXT: v_bfe_u32 v4, v6, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, s4, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, s5, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 14, v4 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX6-NEXT: v_bfe_u32 v4, v7, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, s4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, s5, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 14, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v6 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v9 @@ -5284,19 +5213,19 @@ ; GFX6-NEXT: v_and_b32_e32 v8, 15, v6 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 ; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_bfe_u32 v8, v8, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX6-NEXT: v_bfe_u32 v4, v4, 1, 15 -; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v8, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v7 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v7 ; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_bfe_u32 v4, v5, 1, 15 -; GFX6-NEXT: v_bfe_u32 v5, v6, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -5330,36 +5259,37 @@ ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v6, v0 ; GFX8-NEXT: v_lshrrev_b16_e32 v2, v7, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b16_e32 v4, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 15, v3 -; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX8-NEXT: v_mov_b32_e32 v6, 1 -; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_sdwa v7, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 -; GFX8-NEXT: v_lshlrev_b16_e32 v7, 1, v3 -; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX8-NEXT: v_and_b32_e32 v8, 15, v5 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, 1, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, 15, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, 1 +; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_sdwa v6, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v3 +; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v4 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v6 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, v7, v2 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v6 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v5 ; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 ; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: v_lshrrev_b16_e32 v7, 1, v7 -; GFX8-NEXT: v_lshlrev_b16_e32 v4, v8, v4 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v7 -; GFX8-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v6 -; GFX8-NEXT: v_xor_b32_e32 v6, -1, v6 -; GFX8-NEXT: v_and_b32_e32 v6, 15, v6 ; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3 -; GFX8-NEXT: v_lshlrev_b16_e32 v1, v5, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, v6, v3 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, v4, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, v5, v3 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll @@ -31,7 +31,8 @@ ; ; GFX10GISEL-LABEL: sample_d_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10GISEL-NEXT: image_sample_d v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -58,8 +59,9 @@ ; GFX10GISEL-LABEL: sample_d_3d: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v7 +; GFX10GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v8 -; GFX10GISEL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100 +; GFX10GISEL-NEXT: v_lshl_or_b32 v6, v9, 16, v6 ; GFX10GISEL-NEXT: image_sample_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -95,7 +97,8 @@ ; ; GFX10GISEL-LABEL: sample_c_d_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10GISEL-NEXT: image_sample_c_d v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -114,7 +117,8 @@ ; ; GFX10GISEL-LABEL: sample_d_cl_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10GISEL-NEXT: image_sample_d_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -139,8 +143,9 @@ ; GFX10GISEL-LABEL: sample_d_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v5 +; GFX10GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX10GISEL-NEXT: v_mov_b32_e32 v5, v6 -; GFX10GISEL-NEXT: v_perm_b32 v4, v7, v4, 0x5040100 +; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v7, 16, v4 ; GFX10GISEL-NEXT: image_sample_d_cl v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -159,7 +164,8 @@ ; ; GFX10GISEL-LABEL: sample_c_d_cl_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10GISEL-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10GISEL-NEXT: image_sample_c_d_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -185,8 +191,9 @@ ; GFX10GISEL-LABEL: sample_c_d_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v6 +; GFX10GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX10GISEL-NEXT: v_mov_b32_e32 v6, v7 -; GFX10GISEL-NEXT: v_perm_b32 v5, v8, v5, 0x5040100 +; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v8, 16, v5 ; GFX10GISEL-NEXT: image_sample_c_d_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -222,7 +229,8 @@ ; ; GFX10GISEL-LABEL: sample_cd_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10GISEL-NEXT: image_sample_cd v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -258,7 +266,8 @@ ; ; GFX10GISEL-LABEL: sample_c_cd_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10GISEL-NEXT: image_sample_c_cd v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -277,7 +286,8 @@ ; ; GFX10GISEL-LABEL: sample_cd_cl_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10GISEL-NEXT: image_sample_cd_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -302,8 +312,9 @@ ; GFX10GISEL-LABEL: sample_cd_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v5 +; GFX10GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX10GISEL-NEXT: v_mov_b32_e32 v5, v6 -; GFX10GISEL-NEXT: v_perm_b32 v4, v7, v4, 0x5040100 +; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v7, 16, v4 ; GFX10GISEL-NEXT: image_sample_cd_cl v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -322,7 +333,8 @@ ; ; GFX10GISEL-LABEL: sample_c_cd_cl_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10GISEL-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10GISEL-NEXT: image_sample_c_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -348,8 +360,9 @@ ; GFX10GISEL-LABEL: sample_c_cd_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v6 +; GFX10GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX10GISEL-NEXT: v_mov_b32_e32 v6, v7 -; GFX10GISEL-NEXT: v_perm_b32 v5, v8, v5, 0x5040100 +; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v8, 16, v5 ; GFX10GISEL-NEXT: image_sample_c_cd_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -376,8 +389,9 @@ ; GFX10GISEL-LABEL: sample_c_d_o_2darray_V1: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v7 +; GFX10GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v8 -; GFX10GISEL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100 +; GFX10GISEL-NEXT: v_lshl_or_b32 v6, v9, 16, v6 ; GFX10GISEL-NEXT: image_sample_c_d_o v0, v[0:7], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -404,8 +418,9 @@ ; GFX10GISEL-LABEL: sample_c_d_o_2darray_V2: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v7 +; GFX10GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v8 -; GFX10GISEL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100 +; GFX10GISEL-NEXT: v_lshl_or_b32 v6, v9, 16, v6 ; GFX10GISEL-NEXT: image_sample_c_d_o v[0:1], v[0:7], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -464,8 +479,10 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_d_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX10GISEL-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -488,9 +505,11 @@ ; GFX10GISEL-LABEL: sample_g16_noa16_d_3d: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3 +; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v2 -; GFX10GISEL-NEXT: v_perm_b32 v2, v1, v0, 0x5040100 -; GFX10GISEL-NEXT: v_perm_b32 v4, v4, v9, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v1, 16, v0 +; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v4, 16, v9 ; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -527,8 +546,10 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 -; GFX10GISEL-NEXT: v_perm_b32 v2, v4, v3, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3 ; GFX10GISEL-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -565,8 +586,10 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_d_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX10GISEL-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; GFX10GISEL-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -606,10 +629,11 @@ ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v2 -; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0 -; GFX10GISEL-NEXT: v_perm_b32 v3, v8, v1, 0x5040100 -; GFX10GISEL-NEXT: v_perm_b32 v4, v4, v9, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX10GISEL-NEXT: v_lshl_or_b32 v3, v8, 16, v0 +; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v4, 16, v1 ; GFX10GISEL-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -646,8 +670,10 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_cd_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX10GISEL-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; GFX10GISEL-NEXT: image_sample_cd_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -684,8 +710,10 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 -; GFX10GISEL-NEXT: v_perm_b32 v2, v4, v3, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3 ; GFX10GISEL-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -722,8 +750,10 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_cd_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX10GISEL-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; GFX10GISEL-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -763,10 +793,11 @@ ; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v2 -; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0 -; GFX10GISEL-NEXT: v_perm_b32 v3, v8, v1, 0x5040100 -; GFX10GISEL-NEXT: v_perm_b32 v4, v4, v9, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX10GISEL-NEXT: v_lshl_or_b32 v3, v8, 16, v0 +; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v4, 16, v1 ; GFX10GISEL-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -792,11 +823,12 @@ ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v2 ; GFX10GISEL-NEXT: v_mov_b32_e32 v10, v3 -; GFX10GISEL-NEXT: v_mov_b32_e32 v11, v4 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0 ; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v1 -; GFX10GISEL-NEXT: v_perm_b32 v4, v10, v9, 0x5040100 -; GFX10GISEL-NEXT: v_perm_b32 v5, v5, v11, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v5, 16, v1 +; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v10, 16, v0 ; GFX10GISEL-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -822,11 +854,12 @@ ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v2 ; GFX10GISEL-NEXT: v_mov_b32_e32 v10, v3 -; GFX10GISEL-NEXT: v_mov_b32_e32 v11, v4 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0 ; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v1 -; GFX10GISEL-NEXT: v_perm_b32 v4, v10, v9, 0x5040100 -; GFX10GISEL-NEXT: v_perm_b32 v5, v5, v11, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v5, 16, v1 +; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v10, 16, v0 ; GFX10GISEL-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -3634,7 +3634,7 @@ const TreePatternNode *Src, const TreePatternNode *Dst); Expected createAndImportSubInstructionRenderer( action_iterator InsertPt, RuleMatcher &M, const TreePatternNode *Dst, - unsigned TempReg); + const TreePatternNode *Src, unsigned TempReg); Expected createInstructionRenderer(action_iterator InsertPt, RuleMatcher &M, const TreePatternNode *Dst); @@ -3644,14 +3644,12 @@ BuildMIAction &DstMIBuilder, const TreePatternNode *Dst); - Expected - importExplicitUseRenderers(action_iterator InsertPt, RuleMatcher &M, - BuildMIAction &DstMIBuilder, - const llvm::TreePatternNode *Dst); - Expected - importExplicitUseRenderer(action_iterator InsertPt, RuleMatcher &Rule, - BuildMIAction &DstMIBuilder, - TreePatternNode *DstChild); + Expected importExplicitUseRenderers( + action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, + const llvm::TreePatternNode *Dst, const TreePatternNode *Src); + Expected importExplicitUseRenderer( + action_iterator InsertPt, RuleMatcher &Rule, BuildMIAction &DstMIBuilder, + TreePatternNode *DstChild, const TreePatternNode *Src); Error importDefaultOperandRenderers(action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, DagInit *DefaultOps) const; @@ -4450,7 +4448,7 @@ Expected GlobalISelEmitter::importExplicitUseRenderer( action_iterator InsertPt, RuleMatcher &Rule, BuildMIAction &DstMIBuilder, - TreePatternNode *DstChild) { + TreePatternNode *DstChild, const TreePatternNode *Src) { const auto &SubOperand = Rule.getComplexSubOperand(DstChild->getName()); if (SubOperand) { @@ -4520,7 +4518,7 @@ DstMIBuilder.addRenderer(TempRegID); auto InsertPtOrError = createAndImportSubInstructionRenderer( - ++InsertPt, Rule, DstChild, TempRegID); + ++InsertPt, Rule, DstChild, Src, TempRegID); if (auto Error = InsertPtOrError.takeError()) return std::move(Error); return InsertPtOrError.get(); @@ -4592,6 +4590,16 @@ return failedImport( "Dst pattern child def is an unsupported tablegen class"); } + + // Handle the case where the MVT/register class is omitted in the dest pattern + // but MVT exists in the source pattern. + if (isa(DstChild->getLeafValue())) { + for (unsigned NumOp = 0; NumOp < Src->getNumChildren(); NumOp++) + if (Src->getChild(NumOp)->getName() == DstChild->getName()) { + DstMIBuilder.addRenderer(Src->getChild(NumOp)->getName()); + return InsertPt; + } + } return failedImport("Dst pattern child is an unsupported kind"); } @@ -4621,8 +4629,9 @@ .takeError()) return std::move(Error); - if (auto Error = importExplicitUseRenderers(InsertPt, M, DstMIBuilder, Dst) - .takeError()) + if (auto Error = + importExplicitUseRenderers(InsertPt, M, DstMIBuilder, Dst, Src) + .takeError()) return std::move(Error); return DstMIBuilder; @@ -4631,7 +4640,7 @@ Expected GlobalISelEmitter::createAndImportSubInstructionRenderer( const action_iterator InsertPt, RuleMatcher &M, const TreePatternNode *Dst, - unsigned TempRegID) { + const TreePatternNode *Src, unsigned TempRegID) { auto InsertPtOrError = createInstructionRenderer(InsertPt, M, Dst); // TODO: Assert there's exactly one result. @@ -4645,8 +4654,8 @@ // Assign the result to TempReg. DstMIBuilder.addRenderer(TempRegID, true); - InsertPtOrError = - importExplicitUseRenderers(InsertPtOrError.get(), M, DstMIBuilder, Dst); + InsertPtOrError = importExplicitUseRenderers(InsertPtOrError.get(), M, + DstMIBuilder, Dst, Src); if (auto Error = InsertPtOrError.takeError()) return std::move(Error); @@ -4808,7 +4817,7 @@ Expected GlobalISelEmitter::importExplicitUseRenderers( action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, - const llvm::TreePatternNode *Dst) { + const llvm::TreePatternNode *Dst, const llvm::TreePatternNode *Src) { const CodeGenInstruction *DstI = DstMIBuilder.getCGI(); CodeGenInstruction *OrigDstI = &Target.getInstruction(Dst->getOperator()); @@ -4837,7 +4846,7 @@ InsertPt, *ExtractSrcTy, TempRegID); auto InsertPtOrError = createAndImportSubInstructionRenderer( - ++InsertPt, M, ValChild, TempRegID); + ++InsertPt, M, ValChild, Src, TempRegID); if (auto Error = InsertPtOrError.takeError()) return std::move(Error); @@ -4887,7 +4896,7 @@ CodeGenSubRegIndex *SubIdx = CGRegs.getSubRegIdx(SubRegInit->getDef()); auto InsertPtOrError = - importExplicitUseRenderer(InsertPt, M, DstMIBuilder, ValChild); + importExplicitUseRenderer(InsertPt, M, DstMIBuilder, ValChild, Src); if (auto Error = InsertPtOrError.takeError()) return std::move(Error); InsertPt = InsertPtOrError.get(); @@ -4956,7 +4965,7 @@ } auto InsertPtOrError = importExplicitUseRenderer(InsertPt, M, DstMIBuilder, - Dst->getChild(Child)); + Dst->getChild(Child), Src); if (auto Error = InsertPtOrError.takeError()) return std::move(Error); InsertPt = InsertPtOrError.get();