diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -3348,9 +3348,9 @@ // FIXME: We should choose either a zext or a sext based on other constants // already around. def : Pat<(i32 (anyext i1:$in)), - (SELECT_I4 crbitrc:$in, (LI 1), (LI 0))>; + (SELECT_I4 $in, (LI 1), (LI 0))>; def : Pat<(i64 (anyext i1:$in)), - (SELECT_I8 crbitrc:$in, (LI8 1), (LI8 0))>; + (SELECT_I8 $in, (LI8 1), (LI8 0))>; // match setcc on i1 variables. // CRANDC is: @@ -3756,34 +3756,34 @@ multiclass FSetCCPat { defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETUGE)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_lt)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>; defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETGE)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_lt)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>; defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETULE)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_gt)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>; defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETLE)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_gt)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>; defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETUNE)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_eq)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>; defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETNE)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_eq)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>; defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETO)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_un)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_un)>; def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETOLT)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_lt)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>; def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETLT)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_lt)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>; def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETOGT)), - 
(EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_gt)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>; def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETGT)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_gt)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>; def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETOEQ)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_eq)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>; def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETEQ)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_eq)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>; def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETUO)), - (EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_un)>; + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_un)>; } let Predicates = [HasFPU] in { diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt.ll --- a/llvm/test/CodeGen/AArch64/arm64-vcvt.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vcvt.ll @@ -402,8 +402,8 @@ define <1 x i64> @fcvtzs_1d_intrinsic(<1 x double> %A) nounwind { ;CHECK-LABEL: fcvtzs_1d_intrinsic: ;CHECK-NOT: ld1 -;CHECK: fcvtzs d0, d0 -;CHECK-NEXT: ret +;CHECK: fcvtzs{{.*}}, d0 +;CHECK: ret %tmp3 = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> %A) ret <1 x i64> %tmp3 } @@ -481,8 +481,8 @@ define <1 x i64> @fcvtzu_1d_intrinsic(<1 x double> %A) nounwind { ;CHECK-LABEL: fcvtzu_1d_intrinsic: ;CHECK-NOT: ld1 -;CHECK: fcvtzu d0, d0 -;CHECK-NEXT: ret +;CHECK: fcvtzu{{.*}}, d0 +;CHECK: ret %tmp3 = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> %A) ret <1 x i64> %tmp3 } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll @@ -40,19 +40,22 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v6, v[0:1], off -; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:2 -; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:4 -; 
GFX9-NEXT: global_load_ushort v9, v[2:3], off -; GFX9-NEXT: global_load_ushort v10, v[2:3], off offset:2 -; GFX9-NEXT: global_load_ushort v11, v[2:3], off offset:4 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4 +; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:4 +; GFX9-NEXT: global_load_ushort v8, v[2:3], off +; GFX9-NEXT: global_load_ushort v9, v[2:3], off offset:4 +; GFX9-NEXT: global_load_ushort v10, v[0:1], off offset:2 +; GFX9-NEXT: global_load_ushort v11, v[2:3], off offset:2 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_pk_add_u16 v2, v7, v9 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_perm_b32 v1, v10, v9, s4 -; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v10, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_add_u16 v2, v8, v11 +; GFX9-NEXT: v_lshl_or_b32 v1, v11, 16, v1 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX9-NEXT: global_store_short v[4:5], v0, off ; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:2 ; GFX9-NEXT: global_store_short v[4:5], v2, off offset:4 @@ -202,27 +205,34 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v6, v[0:1], off -; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:2 -; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:4 -; GFX9-NEXT: global_load_ushort v9, v[0:1], off offset:6 -; GFX9-NEXT: global_load_ushort v10, v[0:1], off offset:8 -; GFX9-NEXT: global_load_ushort v11, v[2:3], off -; GFX9-NEXT: global_load_ushort v12, v[2:3], off offset:2 -; GFX9-NEXT: global_load_ushort v13, v[2:3], off offset:4 -; GFX9-NEXT: global_load_ushort v14, v[2:3], off offset:6 -; GFX9-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: global_load_ushort v7, v[0:1], 
off offset:4 +; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:8 +; GFX9-NEXT: global_load_ushort v9, v[2:3], off +; GFX9-NEXT: global_load_ushort v10, v[2:3], off offset:4 +; GFX9-NEXT: global_load_ushort v11, v[2:3], off offset:8 +; GFX9-NEXT: global_load_ushort v12, v[0:1], off offset:2 +; GFX9-NEXT: global_load_ushort v13, v[0:1], off offset:6 +; GFX9-NEXT: global_load_ushort v14, v[2:3], off offset:2 +; GFX9-NEXT: global_load_ushort v15, v[2:3], off offset:6 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 ; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v7 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_perm_b32 v1, v9, v8, s4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_pk_add_u16 v6, v8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_perm_b32 v2, v12, v11, s4 -; GFX9-NEXT: v_pk_add_u16 v0, v0, v2 +; GFX9-NEXT: v_lshl_or_b32 v0, v12, 16, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshl_or_b32 v1, v13, 16, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_perm_b32 v3, v14, v13, s4 +; GFX9-NEXT: v_lshl_or_b32 v2, v14, 16, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_add_u16 v6, v10, v15 +; GFX9-NEXT: v_lshl_or_b32 v3, v15, 16, v3 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v2 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v3 ; GFX9-NEXT: global_store_short v[4:5], v0, off ; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:2 @@ -409,36 +419,47 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v6, v[0:1], off -; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:2 -; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:4 -; GFX9-NEXT: global_load_ushort v9, v[0:1], off offset:6 -; GFX9-NEXT: global_load_ushort v10, v[0:1], off offset:8 -; GFX9-NEXT: global_load_ushort v11, 
v[0:1], off offset:10 -; GFX9-NEXT: global_load_ushort v12, v[0:1], off offset:12 -; GFX9-NEXT: global_load_ushort v13, v[2:3], off -; GFX9-NEXT: global_load_ushort v14, v[2:3], off offset:2 -; GFX9-NEXT: global_load_ushort v15, v[2:3], off offset:4 -; GFX9-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; GFX9-NEXT: global_load_ushort v17, v[2:3], off offset:8 -; GFX9-NEXT: global_load_ushort v18, v[2:3], off offset:10 -; GFX9-NEXT: global_load_ushort v19, v[2:3], off offset:12 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:4 +; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:8 +; GFX9-NEXT: global_load_ushort v9, v[0:1], off offset:12 +; GFX9-NEXT: global_load_ushort v10, v[2:3], off +; GFX9-NEXT: global_load_ushort v11, v[2:3], off offset:4 +; GFX9-NEXT: global_load_ushort v12, v[2:3], off offset:8 +; GFX9-NEXT: global_load_ushort v13, v[2:3], off offset:12 +; GFX9-NEXT: global_load_ushort v14, v[0:1], off offset:2 +; GFX9-NEXT: global_load_ushort v15, v[0:1], off offset:6 +; GFX9-NEXT: global_load_ushort v16, v[0:1], off offset:10 +; GFX9-NEXT: global_load_ushort v17, v[2:3], off offset:2 +; GFX9-NEXT: global_load_ushort v18, v[2:3], off offset:6 +; GFX9-NEXT: global_load_ushort v19, v[2:3], off offset:10 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 ; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_perm_b32 v1, v9, v8, s4 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v8 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v10 ; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_perm_b32 v2, v11, v10, s4 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v11 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v12 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_pk_add_u16 v8, v9, v13 ; GFX9-NEXT: s_waitcnt 
vmcnt(5) -; GFX9-NEXT: v_perm_b32 v3, v14, v13, s4 -; GFX9-NEXT: v_pk_add_u16 v0, v0, v3 +; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_perm_b32 v6, v16, v15, s4 -; GFX9-NEXT: v_pk_add_u16 v1, v1, v6 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_perm_b32 v7, v18, v17, s4 +; GFX9-NEXT: v_lshl_or_b32 v6, v18, 16, v6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_add_u16 v8, v12, v19 +; GFX9-NEXT: v_lshl_or_b32 v7, v19, 16, v7 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v3 +; GFX9-NEXT: v_pk_add_u16 v1, v1, v6 ; GFX9-NEXT: v_pk_add_u16 v2, v2, v7 ; GFX9-NEXT: global_store_short v[4:5], v0, off ; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:2 @@ -697,28 +718,30 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off ; GFX9-NEXT: global_load_ushort v14, v[0:1], off offset:16 -; GFX9-NEXT: global_load_ushort v15, v[0:1], off offset:18 -; GFX9-NEXT: global_load_ushort v16, v[2:3], off offset:16 -; GFX9-NEXT: global_load_ushort v17, v[2:3], off offset:18 +; GFX9-NEXT: global_load_ushort v15, v[2:3], off offset:16 ; GFX9-NEXT: global_load_dwordx4 v[10:13], v[2:3], off -; GFX9-NEXT: global_load_ushort v18, v[0:1], off offset:20 -; GFX9-NEXT: global_load_ushort v19, v[2:3], off offset:20 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: global_load_ushort v16, v[0:1], off offset:20 +; GFX9-NEXT: global_load_ushort v17, v[2:3], off offset:20 +; GFX9-NEXT: global_load_ushort v18, v[0:1], off offset:18 +; GFX9-NEXT: global_load_ushort v19, v[2:3], off offset:18 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_perm_b32 v14, v15, v14, s4 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_perm_b32 v15, 
v17, v16, s4 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_pk_add_u16 v0, v6, v10 ; GFX9-NEXT: v_pk_add_u16 v1, v7, v11 ; GFX9-NEXT: v_pk_add_u16 v2, v8, v12 ; GFX9-NEXT: v_pk_add_u16 v3, v9, v13 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshl_or_b32 v7, v18, 16, v14 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_add_u16 v6, v18, v19 -; GFX9-NEXT: v_pk_add_u16 v7, v14, v15 +; GFX9-NEXT: v_lshl_or_b32 v8, v19, 16, v15 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off -; GFX9-NEXT: global_store_short v[4:5], v7, off offset:16 -; GFX9-NEXT: global_store_short_d16_hi v[4:5], v7, off offset:18 +; GFX9-NEXT: v_pk_add_u16 v6, v16, v17 +; GFX9-NEXT: v_pk_add_u16 v0, v7, v8 +; GFX9-NEXT: global_store_short v[4:5], v0, off offset:16 +; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:18 ; GFX9-NEXT: global_store_short v[4:5], v6, off offset:20 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll @@ -99,7 +99,7 @@ ; GCN-LABEL: v_andn2_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_xor_b32_e32 v1, -1, v1 +; GCN-NEXT: v_not_b32_e32 v1, v1 ; GCN-NEXT: v_and_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -107,7 +107,7 @@ ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10PLUS-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10PLUS-NEXT: v_not_b32_e32 v1, v1 ; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor i32 %src1, -1 @@ -118,13 +118,13 @@ define amdgpu_ps float @v_andn2_i32_sv(i32 inreg %src0, i32 %src1) { ; GCN-LABEL: v_andn2_i32_sv: ; GCN: ; %bb.0: -; GCN-NEXT: v_xor_b32_e32 v0, -1, v0 +; 
GCN-NEXT: v_not_b32_e32 v0, v0 ; GCN-NEXT: v_and_b32_e32 v0, s2, v0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: v_andn2_i32_sv: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX10PLUS-NEXT: v_not_b32_e32 v0, v0 ; GFX10PLUS-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor i32 %src1, -1 @@ -248,8 +248,8 @@ ; GCN-LABEL: v_andn2_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_xor_b32_e32 v2, -1, v2 -; GCN-NEXT: v_xor_b32_e32 v3, -1, v3 +; GCN-NEXT: v_not_b32_e32 v2, v2 +; GCN-NEXT: v_not_b32_e32 v3, v3 ; GCN-NEXT: v_and_b32_e32 v0, v0, v2 ; GCN-NEXT: v_and_b32_e32 v1, v1, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -258,8 +258,8 @@ ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10PLUS-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX10PLUS-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX10PLUS-NEXT: v_not_b32_e32 v2, v2 +; GFX10PLUS-NEXT: v_not_b32_e32 v3, v3 ; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX10PLUS-NEXT: v_and_b32_e32 v1, v1, v3 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] @@ -271,16 +271,16 @@ define amdgpu_ps <2 x float> @v_andn2_i64_sv(i64 inreg %src0, i64 %src1) { ; GCN-LABEL: v_andn2_i64_sv: ; GCN: ; %bb.0: -; GCN-NEXT: v_xor_b32_e32 v0, -1, v0 -; GCN-NEXT: v_xor_b32_e32 v1, -1, v1 +; GCN-NEXT: v_not_b32_e32 v0, v0 +; GCN-NEXT: v_not_b32_e32 v1, v1 ; GCN-NEXT: v_and_b32_e32 v0, s2, v0 ; GCN-NEXT: v_and_b32_e32 v1, s3, v1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: v_andn2_i64_sv: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX10PLUS-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10PLUS-NEXT: v_not_b32_e32 v0, v0 +; GFX10PLUS-NEXT: v_not_b32_e32 v1, v1 ; GFX10PLUS-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX10PLUS-NEXT: v_and_b32_e32 v1, s3, v1 ; GFX10PLUS-NEXT: ; return to shader part epilog @@ -466,14 +466,14 @@ ; GCN: ; %bb.0: ; GCN-NEXT: 
v_xor_b32_e32 v0, -1, v0 ; GCN-NEXT: v_and_b32_e32 v0, s2, v0 -; GCN-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: v_andn2_i16_sv: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX10PLUS-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX10PLUS-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %and = and i16 %src0, %not.src1 @@ -487,14 +487,14 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_xor_b32 s0, s2, -1 ; GCN-NEXT: v_and_b32_e32 v0, s0, v0 -; GCN-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: v_andn2_i16_vs: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_xor_b32 s0, s2, -1 ; GFX10PLUS-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX10PLUS-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %and = and i16 %src0, %not.src1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll @@ -458,8 +458,8 @@ ; GFX7-NEXT: s_lshl_b32 s2, s1, 8 ; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80008 ; GFX7-NEXT: s_or_b32 s1, s1, s2 -; GFX7-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX7-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX7-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX7-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: ; return to shader part epilog @@ -497,7 +497,7 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_bswap_i16_zext_to_i32: 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -1530,15 +1530,16 @@ ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_bfe_u32 v0, v0, 0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_test_uitofp_i16_byte_to_f32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; VI-NEXT: v_mov_b32_e32 v1, 0xffff +; VI-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: s_setpc_b64 s[30:31] %masked = and i16 %arg0, 255 %itofp = uitofp i16 %masked to float diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -669,10 +669,9 @@ ; GFX8-NEXT: v_mul_f32_e32 v3, v7, v5 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 ; GFX8-NEXT: v_div_fixup_f16 v1, v3, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fdiv_v2f16: @@ -762,10 +761,8 @@ ; GFX8-NEXT: v_rcp_f16_e32 v2, v1 ; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: 
v_mul_f16_e32 v2, v0, v2 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fdiv_v2f16_afn: @@ -894,10 +891,9 @@ ; GFX8-NEXT: v_mul_f32_e32 v3, v7, v5 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 ; GFX8-NEXT: v_div_fixup_f16 v1, v3, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fdiv_v2f16_ulp25: @@ -1051,9 +1047,8 @@ ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 ; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_rcp_v2f16: @@ -1202,9 +1197,8 @@ ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 ; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_rcp_v2f16_arcp: @@ -1285,11 +1279,9 @@ ; GFX8-LABEL: v_rcp_v2f16_arcp_afn: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_rcp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_rcp_f16_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_rcp_f16_e32 v1, v0 +; GFX8-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_rcp_v2f16_arcp_afn: @@ -1397,11 +1389,9 @@ ; GFX8-LABEL: v_rcp_v2f16_ulp25: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_rcp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_rcp_f16_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_rcp_f16_e32 v1, v0 +; GFX8-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_rcp_v2f16_ulp25: @@ -1457,10 +1447,8 @@ ; GFX8-NEXT: v_rcp_f16_e32 v2, v1 ; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25: @@ -1589,10 +1577,9 @@ ; GFX8-NEXT: v_mul_f32_e32 v3, v7, v5 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 ; GFX8-NEXT: v_div_fixup_f16 v1, v3, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fdiv_v2f16_arcp_ulp25: @@ -1682,10 +1669,8 @@ ; GFX8-NEXT: v_rcp_f16_e32 v2, v1 ; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll @@ -269,9 +269,8 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX8-NEXT: v_fma_f16 v1, v3, v4, v5 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fma_v2f16: @@ -327,9 +326,8 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX8-NEXT: v_fma_f16 v1, v3, v4, v5 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fma_v2f16_fneg_lhs: @@ -386,9 +384,8 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX8-NEXT: v_fma_f16 v1, v3, v4, v5 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fma_v2f16_fneg_rhs: @@ -439,9 +436,8 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX8-NEXT: v_fma_f16 v1, v3, v4, v5 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fma_v2f16_fneg_lhs_rhs: @@ -498,11 +494,9 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; GFX8-NEXT: v_fma_f16 v0, v0, v2, v4 ; GFX8-NEXT: v_fma_f16 v2, v6, v7, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_fma_f16 v1, v1, v3, v5 -; GFX8-NEXT: v_mov_b32_e32 v3, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fma_v3f16: @@ -568,13 +562,12 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v5 ; GFX8-NEXT: v_fma_f16 v0, v0, v2, v4 ; GFX8-NEXT: v_fma_f16 v2, v6, v8, v10 -; GFX8-NEXT: v_mov_b32_e32 v4, 16 ; GFX8-NEXT: v_fma_f16 v1, v1, v3, v5 ; GFX8-NEXT: v_fma_f16 v3, v7, v9, v11 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fma_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll @@ -297,11 +297,10 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, 0x4000 ; GFX8-NEXT: v_max_f16_e32 v1, 2.0, v0 ; GFX8-NEXT: v_max_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v0, 4.0, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x4400 ; GFX8-NEXT: v_min_f16_e32 v1, 4.0, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] %maxnum = call nnan <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> ) %fmed = call nnan <2 x half> @llvm.minnum.v2f16(<2 x half> %maxnum, <2 x half> ) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll @@ -15,10 +15,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v2, v0, v1 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v2f16: @@ -43,10 +41,8 @@ 
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; GFX8-NEXT: v_mul_f16_e32 v2, v0, v1 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v2f16_fneg_lhs: @@ -72,10 +68,8 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX8-NEXT: v_mul_f16_e32 v2, v0, v1 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v2f16_fneg_rhs: @@ -100,10 +94,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v2, v0, v1 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v2f16_fneg_lhs_fneg_rhs: @@ -130,12 +122,9 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v1, v1, v3 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v3f16: @@ -164,12 +153,9 @@ ; GFX8-NEXT: v_mov_b32_e32 v4, 0x80008000 ; GFX8-NEXT: v_xor_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v1, v1, v3 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v3f16_fneg_lhs: @@ -199,12 +185,9 @@ ; GFX8-NEXT: v_mov_b32_e32 v4, 0x80008000 ; GFX8-NEXT: v_xor_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: 
v_mul_f16_e32 v4, v0, v2 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v1, v1, v3 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v3f16_fneg_rhs: @@ -231,12 +214,9 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v1, v1, v3 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v3f16_fneg_lhs_fneg_rhs: @@ -264,14 +244,11 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v2, v1, v3 -; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 
v3, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v4f16: @@ -299,14 +276,11 @@ ; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v2, v1, v3 -; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v3, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v4f16_fneg_lhs: @@ -335,14 +309,11 @@ ; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, 
v2 ; GFX8-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 ; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v2, v1, v3 -; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v3, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v4f16_fneg_rhs: @@ -369,14 +340,11 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v2, v1, v3 -; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v3, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v4f16_fneg_lhs_fneg_rhs: @@ -405,19 +373,14 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v6, v0, v3 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v3, v1, v4 -; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v4, v2, v5 -; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v5, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v3, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v6f16: @@ -448,19 +411,14 @@ ; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 ; GFX8-NEXT: v_mul_f16_e32 v6, v0, v3 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v3, v1, v4 -; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v4, v2, v5 -; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v5, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v3, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v6f16_fneg_lhs: @@ -492,19 
+450,14 @@ ; GFX8-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 ; GFX8-NEXT: v_xor_b32_e32 v5, 0x80008000, v5 ; GFX8-NEXT: v_mul_f16_e32 v6, v0, v3 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v3, v1, v4 -; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v4, v2, v5 -; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v5, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v3, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v6f16_fneg_rhs: @@ -533,19 +486,14 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v6, v0, v3 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v3, v1, v4 -; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v4, v2, v5 -; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v5, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v3, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v6f16_fneg_lhs_fneg_rhs: @@ -576,23 +524,17 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v8, v0, v4 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v4, v1, v5 -; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v5, v2, v6 -; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v6, v3, v7 -; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v7, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_mov_b32_e32 v7, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v8f16: @@ -626,23 +568,17 @@ ; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 ; GFX8-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 ; GFX8-NEXT: v_mul_f16_e32 v8, v0, v4 -; GFX8-NEXT: 
v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v4, v1, v5 -; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v5, v2, v6 -; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v6, v3, v7 -; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v7, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_mov_b32_e32 v7, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: 
v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v8f16_fneg_lhs: @@ -677,23 +613,17 @@ ; GFX8-NEXT: v_xor_b32_e32 v6, 0x80008000, v6 ; GFX8-NEXT: v_xor_b32_e32 v7, 0x80008000, v7 ; GFX8-NEXT: v_mul_f16_e32 v8, v0, v4 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v4, v1, v5 -; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v5, v2, v6 -; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v6, v3, v7 -; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v7, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_mov_b32_e32 v7, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; 
GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v8f16_fneg_rhs: @@ -724,23 +654,17 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v8, v0, v4 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v4, v1, v5 -; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v5, v2, v6 -; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v6, v3, v7 -; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v7, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_mov_b32_e32 v7, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v8f16_fneg_lhs_fneg_rhs: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll @@ -208,13 +208,11 @@ ; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_exp_f16_e32 v0, v0 +; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX8-NEXT: v_exp_f16_e32 v1, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_v2f16: @@ -226,14 +224,13 @@ ; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX9-NEXT: 
v_mul_legacy_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: v_exp_f16_e32 v1, v1 ; GFX9-NEXT: v_exp_f16_e32 v0, v0 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_v2f16: @@ -248,37 +245,39 @@ ; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX10-NEXT: v_exp_f16_e32 v1, v1 +; GFX10-NEXT: v_exp_f16_e32 v1, v2 ; GFX10-NEXT: v_exp_f16_e32 v0, v0 -; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: v_log_f16_e32 v0, v0 +; GFX11-NEXT: v_log_f16_e32 v2, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_log_f16_e32 v2, v2 +; GFX11-NEXT: v_log_f16_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v0, v0, v1 :: v_dual_mul_dx9_zero_f32 v1, v2, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3 ; 
GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: v_exp_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: v_exp_f16_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_exp_f16_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y) ret <2 x half> %pow @@ -319,13 +318,11 @@ ; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_exp_f16_e32 v0, v0 +; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX8-NEXT: v_exp_f16_e32 v1, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_v2f16_fneg_lhs: @@ -338,14 +335,13 @@ ; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: v_exp_f16_e32 v1, v1 ; GFX9-NEXT: v_exp_f16_e32 v0, v0 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 +; 
GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_v2f16_fneg_lhs: @@ -361,11 +357,12 @@ ; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX10-NEXT: v_exp_f16_e32 v1, v1 +; GFX10-NEXT: v_exp_f16_e32 v1, v2 ; GFX10-NEXT: v_exp_f16_e32 v0, v0 -; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_v2f16_fneg_lhs: @@ -376,23 +373,25 @@ ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: v_log_f16_e32 v0, v0 +; GFX11-NEXT: v_log_f16_e32 v2, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_log_f16_e32 v2, v2 +; GFX11-NEXT: v_log_f16_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v0, v0, v1 :: v_dual_mul_dx9_zero_f32 v1, v2, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_exp_f16_e32 v0, v0 ; GFX11-NEXT: v_exp_f16_e32 v1, v1 +; GFX11-NEXT: v_exp_f16_e32 v0, v0 ; GFX11-NEXT: 
s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y) @@ -429,18 +428,16 @@ ; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_exp_f16_e32 v0, v0 -; GFX8-NEXT: v_exp_f16_e32 v1, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_exp_f16_e32 v1, v2 +; GFX8-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_v2f16_fneg_rhs: @@ -453,14 +450,13 @@ ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: v_exp_f16_e32 v1, v2 ; GFX9-NEXT: 
v_exp_f16_e32 v0, v0 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_v2f16_fneg_rhs: @@ -468,46 +464,48 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_log_f16_e32 v2, v0 -; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX10-NEXT: v_exp_f16_e32 v1, v1 +; GFX10-NEXT: v_exp_f16_e32 v1, v2 ; GFX10-NEXT: v_exp_f16_e32 v0, v0 -; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_v2f16_fneg_rhs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: v_log_f16_e32 v0, v0 +; GFX11-NEXT: v_log_f16_e32 v2, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_log_f16_e32 v2, v2 +; GFX11-NEXT: v_log_f16_e32 v0, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX11-NEXT: v_cvt_f32_f16_e32 
v2, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v0, v0, v1 :: v_dual_mul_dx9_zero_f32 v1, v2, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3 ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: v_exp_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: v_exp_f16_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_exp_f16_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %y.fneg = fneg <2 x half> %y %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y.fneg) @@ -550,18 +548,16 @@ ; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_exp_f16_e32 v0, v0 -; GFX8-NEXT: v_exp_f16_e32 v1, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_exp_f16_e32 v1, v2 +; GFX8-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_v2f16_fneg_lhs_rhs: @@ -575,14 +571,13 @@ ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: v_exp_f16_e32 v1, v2 ; GFX9-NEXT: v_exp_f16_e32 v0, v0 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_v2f16_fneg_lhs_rhs: @@ -599,11 +594,12 @@ ; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX10-NEXT: v_exp_f16_e32 v1, v1 +; GFX10-NEXT: v_exp_f16_e32 v1, v2 ; GFX10-NEXT: v_exp_f16_e32 v0, v0 -; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_v2f16_fneg_lhs_rhs: @@ -613,26 +609,27 @@ ; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: v_log_f16_e32 v0, v0 +; GFX11-NEXT: v_log_f16_e32 v2, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; 
GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_log_f16_e32 v2, v2 +; GFX11-NEXT: v_log_f16_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v0, v0, v1 :: v_dual_mul_dx9_zero_f32 v1, v2, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3 ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: v_exp_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: v_exp_f16_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_exp_f16_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %y.fneg = fneg <2 x half> %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll @@ -532,12 +532,10 @@ ; CI-NEXT: v_fma_f32 v3, -v3, v6, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 -; CI-NEXT: v_bfe_u32 v0, v0, 0, 16 ; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v1 ; CI-NEXT: v_trunc_f32_e32 v3, v3 ; CI-NEXT: v_fma_f32 v1, -v3, v2, 
v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_bfe_u32 v1, v1, 0, 16 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v2, v0, v1 ; CI-NEXT: v_mov_b32_e32 v0, s4 @@ -573,9 +571,8 @@ ; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s1 ; VI-NEXT: v_trunc_f16_e32 v1, v1 ; VI-NEXT: v_fma_f16 v1, -v1, v2, s1 -; VI-NEXT: v_mov_b32_e32 v2, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_or_b32_e32 v2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -670,18 +667,14 @@ ; CI-NEXT: v_fma_f32 v5, -v5, v8, v6 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 -; CI-NEXT: v_bfe_u32 v1, v1, 0, 16 -; CI-NEXT: v_bfe_u32 v0, v0, 0, 16 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 -; CI-NEXT: v_bfe_u32 v1, v2, 0, 16 ; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v3 ; CI-NEXT: v_trunc_f32_e32 v5, v5 ; CI-NEXT: v_fma_f32 v3, -v5, v4, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_bfe_u32 v2, v3, 0, 16 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_or_b32_e32 v1, v1, v2 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; CI-NEXT: v_or_b32_e32 v1, v2, v1 ; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -723,6 +716,8 @@ ; VI-NEXT: v_trunc_f16_e32 v1, v1 ; VI-NEXT: v_fma_f16 v1, -v1, v2, s6 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: v_mul_f32_e32 v2, v2, v4 ; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; VI-NEXT: v_mov_b32_e32 v4, s9 @@ -735,11 +730,8 @@ ; VI-NEXT: v_div_fixup_f16 v3, v3, v4, s7 ; VI-NEXT: v_trunc_f16_e32 v3, v3 ; VI-NEXT: v_fma_f16 v3, -v3, v4, s7 -; VI-NEXT: v_mov_b32_e32 
v4, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -41,7 +41,7 @@ ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_and_b32 s2, s2, 0x7f ; GFX8-NEXT: s_and_b32 s1, s1, 0x7f -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_lshr_b32 s1, s1, 1 @@ -72,7 +72,7 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7f ; GFX9-NEXT: s_and_b32 s1, s1, 0x7f -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_lshr_b32 s1, s1, 1 @@ -102,7 +102,7 @@ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX10-NEXT: s_and_b32 s2, s2, 0x7f ; GFX10-NEXT: s_and_b32 s1, s1, 0x7f -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -134,7 +134,7 @@ ; GFX11-NEXT: s_and_b32 s2, s2, 0x7f ; GFX11-NEXT: s_and_b32 s1, s1, 0x7f ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
-; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_lshr_b32 s1, s1, 1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -352,7 +352,7 @@ ; GFX8-LABEL: s_fshl_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_and_b32 s3, s2, 7 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 ; GFX8-NEXT: s_lshr_b32 s1, s1, 1 @@ -364,7 +364,7 @@ ; GFX9-LABEL: s_fshl_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_and_b32 s3, s2, 7 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 ; GFX9-NEXT: s_lshr_b32 s1, s1, 1 @@ -377,7 +377,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_and_b32 s3, s2, 7 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 ; GFX10-NEXT: s_lshl_b32 s0, s0, s3 @@ -389,7 +389,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_and_b32 s3, s2, 7 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 ; GFX11-NEXT: s_lshr_b32 s1, s1, 1 ; GFX11-NEXT: s_lshl_b32 s0, s0, s3 @@ -406,7 +406,7 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0 @@ -418,7 +418,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, 1 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 @@ -431,7 +431,7 
@@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 1 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 @@ -444,7 +444,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX10-NEXT: v_not_b32_e32 v3, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 @@ -458,7 +458,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX11-NEXT: v_not_b32_e32 v3, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -485,7 +485,7 @@ ; GFX8-LABEL: s_fshl_i8_4: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, 4 ; GFX8-NEXT: s_lshr_b32 s1, s1, 4 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -494,7 +494,7 @@ ; GFX9-LABEL: s_fshl_i8_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshl_b32 s0, s0, 4 ; GFX9-NEXT: s_lshr_b32 s1, s1, 4 ; GFX9-NEXT: s_or_b32 s0, s0, s1 @@ -504,7 +504,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshr_b32 s1, s1, 4 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog @@ -513,7 +513,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_lshl_b32 s0, s0, 4 -; GFX11-NEXT: s_bfe_u32 s1, 
s1, 0x100000 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b32 s1, s1, 4 ; GFX11-NEXT: s_or_b32 s0, s0, s1 @@ -584,7 +584,7 @@ ; GFX8-LABEL: s_fshl_i8_5: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, 5 ; GFX8-NEXT: s_lshr_b32 s1, s1, 3 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -593,7 +593,7 @@ ; GFX9-LABEL: s_fshl_i8_5: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshl_b32 s0, s0, 5 ; GFX9-NEXT: s_lshr_b32 s1, s1, 3 ; GFX9-NEXT: s_or_b32 s0, s0, s1 @@ -603,7 +603,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, 5 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshr_b32 s1, s1, 3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog @@ -612,7 +612,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b32 s1, s1, 3 ; GFX11-NEXT: s_or_b32 s0, s0, s1 @@ -700,7 +700,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s4, s1, 8 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshr_b32 s5, s2, 8 ; GFX8-NEXT: s_and_b32 s6, s2, 7 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 @@ -712,15 +712,14 @@ ; GFX8-NEXT: s_and_b32 s1, s5, 7 ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 ; GFX8-NEXT: s_and_b32 s3, s4, 0xff -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_andn2_b32 s2, 7, s5 ; GFX8-NEXT: s_lshr_b32 s3, s3, 1 ; 
GFX8-NEXT: s_lshr_b32 s2, s3, s2 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; @@ -728,7 +727,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s4, s1, 8 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshr_b32 s5, s2, 8 ; GFX9-NEXT: s_and_b32 s6, s2, 7 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 @@ -740,15 +739,14 @@ ; GFX9-NEXT: s_and_b32 s1, s5, 7 ; GFX9-NEXT: s_lshl_b32 s1, s3, s1 ; GFX9-NEXT: s_and_b32 s3, s4, 0xff -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_andn2_b32 s2, 7, s5 ; GFX9-NEXT: s_lshr_b32 s3, s3, 1 ; GFX9-NEXT: s_lshr_b32 s2, s3, s2 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff -; GFX9-NEXT: s_lshl_b32 s1, s1, s2 +; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; @@ -759,9 +757,9 @@ ; GFX10-NEXT: s_and_b32 s4, s4, 0xff ; GFX10-NEXT: s_and_b32 s6, s2, 7 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshl_b32 s0, s0, s6 ; GFX10-NEXT: s_and_b32 s6, s5, 7 ; GFX10-NEXT: s_andn2_b32 s5, 7, s5 @@ -774,9 +772,8 @@ ; GFX10-NEXT: s_or_b32 s2, s3, s4 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_and_b32 s1, s2, 0xff -; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 8 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader 
part epilog ; @@ -787,9 +784,9 @@ ; GFX11-NEXT: s_and_b32 s4, s4, 0xff ; GFX11-NEXT: s_and_b32 s6, s2, 7 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_lshl_b32 s0, s0, s6 ; GFX11-NEXT: s_and_b32 s6, s5, 7 ; GFX11-NEXT: s_and_not1_b32 s5, 7, s5 @@ -802,9 +799,8 @@ ; GFX11-NEXT: s_or_b32 s2, s3, s4 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_and_b32 s1, s2, 0xff -; GFX11-NEXT: s_bfe_u32 s2, 8, 0x100000 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s1, s2 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -822,7 +818,7 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 8, v2 ; GFX6-NEXT: v_and_b32_e32 v5, 7, v2 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v5, v0 @@ -830,7 +826,7 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v4 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX6-NEXT: v_not_b32_e32 v4, v4 ; GFX6-NEXT: v_bfe_u32 v1, v1, 8, 8 ; GFX6-NEXT: v_and_b32_e32 v4, 7, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 @@ -849,7 +845,7 @@ ; GFX8-NEXT: v_and_b32_e32 v6, 7, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v6, v0 ; GFX8-NEXT: v_mov_b32_e32 v6, 1 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 @@ -858,7 +854,7 @@ ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 7, v5 -; 
GFX8-NEXT: v_xor_b32_e32 v2, -1, v5 +; GFX8-NEXT: v_not_b32_e32 v2, v5 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, v1, v3 ; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -874,7 +870,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 @@ -884,7 +880,7 @@ ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 7, v5 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v5 +; GFX9-NEXT: v_not_b32_e32 v2, v5 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, v1, v3 ; GFX9-NEXT: v_lshrrev_b16_sdwa v3, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -902,9 +898,9 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX10-NEXT: v_not_b32_e32 v7, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 @@ -930,9 +926,9 @@ ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX11-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX11-NEXT: v_not_b32_e32 v7, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_xor_b32_e32 v6, -1, v3 +; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 @@ -1016,7 +1012,7 @@ ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_lshr_b32 s8, s1, 24 ; 
GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshr_b32 s9, s2, 8 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 ; GFX8-NEXT: s_lshr_b32 s11, s2, 24 @@ -1032,7 +1028,7 @@ ; GFX8-NEXT: s_and_b32 s1, s9, 7 ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 ; GFX8-NEXT: s_and_b32 s3, s6, 0xff -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_andn2_b32 s2, 7, s9 ; GFX8-NEXT: s_lshr_b32 s3, s3, 1 ; GFX8-NEXT: s_lshr_b32 s2, s3, s2 @@ -1040,7 +1036,7 @@ ; GFX8-NEXT: s_and_b32 s2, s10, 7 ; GFX8-NEXT: s_lshl_b32 s2, s4, s2 ; GFX8-NEXT: s_and_b32 s4, s7, 0xff -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_andn2_b32 s3, 7, s10 ; GFX8-NEXT: s_lshr_b32 s4, s4, 1 ; GFX8-NEXT: s_lshr_b32 s3, s4, s3 @@ -1069,7 +1065,7 @@ ; GFX9-NEXT: s_lshr_b32 s7, s1, 16 ; GFX9-NEXT: s_lshr_b32 s8, s1, 24 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshr_b32 s9, s2, 8 ; GFX9-NEXT: s_lshr_b32 s10, s2, 16 ; GFX9-NEXT: s_lshr_b32 s11, s2, 24 @@ -1085,7 +1081,7 @@ ; GFX9-NEXT: s_and_b32 s1, s9, 7 ; GFX9-NEXT: s_lshl_b32 s1, s3, s1 ; GFX9-NEXT: s_and_b32 s3, s6, 0xff -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_andn2_b32 s2, 7, s9 ; GFX9-NEXT: s_lshr_b32 s3, s3, 1 ; GFX9-NEXT: s_lshr_b32 s2, s3, s2 @@ -1093,7 +1089,7 @@ ; GFX9-NEXT: s_and_b32 s2, s10, 7 ; GFX9-NEXT: s_lshl_b32 s2, s4, s2 ; GFX9-NEXT: s_and_b32 s4, s7, 0xff -; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX9-NEXT: s_andn2_b32 s3, 7, s10 ; GFX9-NEXT: s_lshr_b32 s4, s4, 1 ; GFX9-NEXT: s_lshr_b32 s3, s4, s3 @@ -1123,7 +1119,7 @@ ; GFX10-NEXT: s_lshr_b32 s8, s1, 24 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_lshr_b32 s9, s2, 8 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; 
GFX10-NEXT: s_lshr_b32 s10, s2, 16 ; GFX10-NEXT: s_lshr_b32 s11, s2, 24 ; GFX10-NEXT: s_and_b32 s12, s2, 7 @@ -1133,7 +1129,7 @@ ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_and_b32 s2, s6, 0xff ; GFX10-NEXT: s_and_b32 s6, s9, 7 -; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX10-NEXT: s_andn2_b32 s9, 7, s9 ; GFX10-NEXT: s_lshr_b32 s2, s2, 1 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 @@ -1145,7 +1141,7 @@ ; GFX10-NEXT: s_or_b32 s1, s3, s2 ; GFX10-NEXT: s_and_b32 s2, s7, 0xff ; GFX10-NEXT: s_and_b32 s3, s10, 7 -; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX10-NEXT: s_andn2_b32 s6, 7, s10 ; GFX10-NEXT: s_lshr_b32 s2, s2, 1 ; GFX10-NEXT: s_lshl_b32 s3, s4, s3 @@ -1176,7 +1172,7 @@ ; GFX11-NEXT: s_lshr_b32 s8, s1, 24 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_lshr_b32 s9, s2, 8 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_lshr_b32 s10, s2, 16 ; GFX11-NEXT: s_lshr_b32 s11, s2, 24 ; GFX11-NEXT: s_and_b32 s12, s2, 7 @@ -1186,7 +1182,7 @@ ; GFX11-NEXT: s_lshr_b32 s1, s1, s2 ; GFX11-NEXT: s_and_b32 s2, s6, 0xff ; GFX11-NEXT: s_and_b32 s6, s9, 7 -; GFX11-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_and_not1_b32 s9, 7, s9 ; GFX11-NEXT: s_lshr_b32 s2, s2, 1 ; GFX11-NEXT: s_lshr_b32 s4, s0, 16 @@ -1198,7 +1194,7 @@ ; GFX11-NEXT: s_or_b32 s1, s3, s2 ; GFX11-NEXT: s_and_b32 s2, s7, 0xff ; GFX11-NEXT: s_and_b32 s3, s10, 7 -; GFX11-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_and_not1_b32 s6, 7, s10 ; GFX11-NEXT: s_lshr_b32 s2, s2, 1 ; GFX11-NEXT: s_lshl_b32 s3, s4, s3 @@ -1238,7 +1234,7 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v8, 24, v2 ; GFX6-NEXT: v_and_b32_e32 v9, 7, v2 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 
16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 24, v0 @@ -1248,7 +1244,7 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v9 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v6 -; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX6-NEXT: v_not_b32_e32 v6, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_bfe_u32 v3, v1, 8, 8 ; GFX6-NEXT: v_and_b32_e32 v6, 7, v6 @@ -1256,13 +1252,13 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 7, v7 -; GFX6-NEXT: v_xor_b32_e32 v6, -1, v7 +; GFX6-NEXT: v_not_b32_e32 v6, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_bfe_u32 v4, v1, 16, 8 ; GFX6-NEXT: v_and_b32_e32 v6, 7, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4 -; GFX6-NEXT: v_xor_b32_e32 v6, -1, v8 +; GFX6-NEXT: v_not_b32_e32 v6, v8 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 7, v8 ; GFX6-NEXT: v_and_b32_e32 v6, 7, v6 @@ -1289,7 +1285,7 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX8-NEXT: v_and_b32_e32 v8, 7, v2 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_mov_b32_e32 v10, 1 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshrrev_b16_sdwa v11, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1298,7 +1294,7 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v8, v2 ; GFX8-NEXT: v_and_b32_e32 v8, 7, v5 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1307,7 +1303,7 @@ ; GFX8-NEXT: v_lshrrev_b16_e32 v4, v5, v4 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 7, v6 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v6 +; GFX8-NEXT: v_not_b32_e32 v5, v6 ; GFX8-NEXT: 
v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v6 @@ -1315,7 +1311,7 @@ ; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v6 ; GFX8-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX8-NEXT: v_and_b32_e32 v5, 7, v7 -; GFX8-NEXT: v_xor_b32_e32 v6, -1, v7 +; GFX8-NEXT: v_not_b32_e32 v6, v7 ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_mov_b32_e32 v5, 1 ; GFX8-NEXT: v_and_b32_e32 v6, 7, v6 @@ -1340,7 +1336,7 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX9-NEXT: v_and_b32_e32 v8, 7, v2 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: s_mov_b32 s5, 1 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshrrev_b16_sdwa v10, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1349,7 +1345,7 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX9-NEXT: v_or_b32_e32 v2, v8, v2 ; GFX9-NEXT: v_and_b32_e32 v8, 7, v5 -; GFX9-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX9-NEXT: v_not_b32_e32 v5, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX9-NEXT: v_lshrrev_b16_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1358,7 +1354,7 @@ ; GFX9-NEXT: v_lshrrev_b16_e32 v4, v5, v4 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_and_b32_e32 v4, 7, v6 -; GFX9-NEXT: v_xor_b32_e32 v5, -1, v6 +; GFX9-NEXT: v_not_b32_e32 v5, v6 ; GFX9-NEXT: v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX9-NEXT: v_lshrrev_b16_e32 v6, 1, v6 @@ -1366,7 +1362,7 @@ ; GFX9-NEXT: v_lshrrev_b16_e32 v5, v5, v6 ; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX9-NEXT: v_and_b32_e32 v5, 7, v7 -; GFX9-NEXT: v_xor_b32_e32 v6, -1, v7 +; GFX9-NEXT: v_not_b32_e32 v6, v7 ; GFX9-NEXT: v_lshlrev_b16_sdwa v0, 
v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_mov_b32_e32 v5, 1 ; GFX9-NEXT: v_and_b32_e32 v6, 7, v6 @@ -1394,20 +1390,20 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v2 +; GFX10-NEXT: v_not_b32_e32 v9, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_lshlrev_b16 v0, v10, v0 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v8 +; GFX10-NEXT: v_not_b32_e32 v10, v8 ; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_mov_b32_e32 v13, 0xff ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX10-NEXT: v_and_b32_e32 v12, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX10-NEXT: v_lshlrev_b16 v3, v8, v3 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v11 +; GFX10-NEXT: v_not_b32_e32 v8, v11 ; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_xor_b32_e32 v13, -1, v2 +; GFX10-NEXT: v_not_b32_e32 v13, v2 ; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX10-NEXT: v_lshrrev_b16 v6, 1, v6 ; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 @@ -1448,7 +1444,7 @@ ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-NEXT: v_xor_b32_e32 v13, -1, v9 +; GFX11-NEXT: v_not_b32_e32 v13, v9 ; GFX11-NEXT: v_lshrrev_b32_e32 v11, 24, v2 ; GFX11-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 24, v1 @@ -1456,13 +1452,13 @@ ; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 ; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX11-NEXT: v_lshlrev_b16 v3, v9, v3 -; GFX11-NEXT: v_xor_b32_e32 v9, -1, v10 +; GFX11-NEXT: v_not_b32_e32 v9, v10 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: v_lshrrev_b16 v6, v13, v6 -; GFX11-NEXT: v_xor_b32_e32 v13, -1, v11 +; GFX11-NEXT: v_not_b32_e32 v13, v11 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX11-NEXT: 
v_and_b32_e32 v12, 7, v2 -; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX11-NEXT: v_not_b32_e32 v2, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX11-NEXT: v_and_b32_e32 v9, 7, v9 @@ -1834,10 +1830,10 @@ ; GFX6-NEXT: s_lshl_b32 s0, s0, 8 ; GFX6-NEXT: s_and_b32 s6, s6, 0xff ; GFX6-NEXT: s_or_b32 s0, s9, s0 -; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: s_lshr_b32 s8, s1, 8 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -1845,8 +1841,8 @@ ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_and_b32 s6, s8, 0xff ; GFX6-NEXT: s_or_b32 s1, s7, s1 -; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX6-NEXT: s_or_b32 s1, s1, s6 @@ -1858,9 +1854,9 @@ ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 ; GFX6-NEXT: s_and_b32 s6, s6, 0xff ; GFX6-NEXT: s_or_b32 s2, s9, s2 -; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX6-NEXT: s_lshr_b32 s8, s3, 8 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: s_and_b32 s3, s3, 0xff ; GFX6-NEXT: s_or_b32 s2, s2, s6 @@ -1868,8 +1864,8 @@ ; GFX6-NEXT: s_and_b32 s6, s8, 0xff ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: s_or_b32 s3, s7, s3 -; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX6-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s6 ; GFX6-NEXT: s_lshr_b32 s6, s4, 16 @@ -1881,9 +1877,9 @@ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: 
v_cvt_f32_ubyte0_e32 v2, 24 ; GFX6-NEXT: s_or_b32 s4, s9, s4 -; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: s_or_b32 s4, s4, s6 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 @@ -1901,9 +1897,9 @@ ; GFX6-NEXT: s_and_b32 s6, s8, 0xff ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: s_or_b32 s5, s7, s5 -; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 -; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX6-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: s_or_b32 s5, s5, s6 @@ -1954,26 +1950,25 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s6, s0, 8 ; GFX8-NEXT: s_and_b32 s6, s6, 0xff -; GFX8-NEXT: s_bfe_u32 s10, 8, 0x100000 ; GFX8-NEXT: s_lshr_b32 s7, s0, 16 ; GFX8-NEXT: s_lshr_b32 s8, s0, 24 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s6, s6, s10 +; GFX8-NEXT: s_lshl_b32 s6, s6, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s6 ; GFX8-NEXT: s_and_b32 s6, s7, 0xff ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_lshr_b32 s9, s1, 8 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s6 -; GFX8-NEXT: s_lshl_b32 s1, s1, s10 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_and_b32 s6, s9, 0xff ; GFX8-NEXT: s_or_b32 s1, s8, s1 -; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: 
s_or_b32 s1, s1, s6 @@ -1983,22 +1978,22 @@ ; GFX8-NEXT: s_lshr_b32 s7, s2, 16 ; GFX8-NEXT: s_lshr_b32 s8, s2, 24 ; GFX8-NEXT: s_and_b32 s2, s2, 0xff -; GFX8-NEXT: s_lshl_b32 s6, s6, s10 +; GFX8-NEXT: s_lshl_b32 s6, s6, 8 ; GFX8-NEXT: s_or_b32 s2, s2, s6 ; GFX8-NEXT: s_and_b32 s6, s7, 0xff ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 -; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1 ; GFX8-NEXT: s_lshr_b32 s9, s3, 8 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: s_and_b32 s3, s3, 0xff ; GFX8-NEXT: s_or_b32 s2, s2, s6 -; GFX8-NEXT: s_lshl_b32 s3, s3, s10 +; GFX8-NEXT: s_lshl_b32 s3, s3, 8 ; GFX8-NEXT: s_and_b32 s6, s9, 0xff ; GFX8-NEXT: s_or_b32 s3, s8, s3 -; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: s_or_b32 s3, s3, s6 @@ -2007,14 +2002,14 @@ ; GFX8-NEXT: s_lshr_b32 s7, s4, 16 ; GFX8-NEXT: s_lshr_b32 s8, s4, 24 ; GFX8-NEXT: s_and_b32 s4, s4, 0xff -; GFX8-NEXT: s_lshl_b32 s6, s6, s10 +; GFX8-NEXT: s_lshl_b32 s6, s6, 8 ; GFX8-NEXT: s_or_b32 s4, s4, s6 ; GFX8-NEXT: s_and_b32 s6, s7, 0xff ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 -; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: s_or_b32 s4, s4, s6 ; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 @@ -2024,7 +2019,7 @@ ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX8-NEXT: s_and_b32 s5, s5, 0xff ; GFX8-NEXT: v_mul_lo_u32 v1, v2, v1 -; GFX8-NEXT: s_lshl_b32 s5, s5, s10 +; GFX8-NEXT: s_lshl_b32 s5, s5, 8 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: 
v_subrev_u32_e32 v3, vcc, 24, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 @@ -2032,9 +2027,9 @@ ; GFX8-NEXT: s_and_b32 s6, s9, 0xff ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: s_or_b32 s5, s8, s5 -; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 -; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: s_or_b32 s5, s5, s6 @@ -2084,28 +2079,27 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_lshr_b32 s7, s0, 8 ; GFX9-NEXT: s_and_b32 s7, s7, 0xff -; GFX9-NEXT: s_bfe_u32 s12, 8, 0x100000 +; GFX9-NEXT: s_lshr_b32 s9, s0, 16 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_lshr_b32 s9, s0, 16 ; GFX9-NEXT: s_lshr_b32 s10, s0, 24 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s7, s12 +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s7 ; GFX9-NEXT: s_and_b32 s7, s9, 0xff -; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX9-NEXT: s_lshr_b32 s11, s1, 8 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: v_mul_lo_u32 v2, v0, v1 ; GFX9-NEXT: s_or_b32 s0, s0, s7 -; GFX9-NEXT: s_lshl_b32 s1, s1, s12 +; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_and_b32 s7, s11, 0xff ; GFX9-NEXT: s_or_b32 s1, s10, s1 -; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s1, s1, s7 ; GFX9-NEXT: s_lshr_b32 s7, s2, 8 @@ -2114,23 +2108,23 @@ ; GFX9-NEXT: s_lshr_b32 s9, s2, 16 ; GFX9-NEXT: s_lshr_b32 s10, s2, 24 ; GFX9-NEXT: s_and_b32 s2, s2, 0xff -; 
GFX9-NEXT: s_lshl_b32 s7, s7, s12 +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 ; GFX9-NEXT: s_or_b32 s2, s2, s7 ; GFX9-NEXT: s_and_b32 s7, s9, 0xff -; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 ; GFX9-NEXT: s_lshr_b32 s11, s3, 8 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_and_b32 s3, s3, 0xff ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX9-NEXT: s_or_b32 s2, s2, s7 -; GFX9-NEXT: s_lshl_b32 s3, s3, s12 +; GFX9-NEXT: s_lshl_b32 s3, s3, 8 ; GFX9-NEXT: s_and_b32 s7, s11, 0xff ; GFX9-NEXT: s_or_b32 s3, s10, s3 -; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s3, s3, s7 ; GFX9-NEXT: s_lshr_b32 s7, s4, 8 @@ -2140,11 +2134,11 @@ ; GFX9-NEXT: s_lshr_b32 s9, s4, 16 ; GFX9-NEXT: s_lshr_b32 s10, s4, 24 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s7, s12 +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s7 ; GFX9-NEXT: s_and_b32 s7, s9, 0xff -; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX9-NEXT: s_or_b32 s4, s4, s7 @@ -2152,12 +2146,12 @@ ; GFX9-NEXT: s_lshr_b32 s11, s5, 8 ; GFX9-NEXT: s_and_b32 s5, s5, 0xff ; GFX9-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX9-NEXT: s_lshl_b32 s5, s5, s12 +; GFX9-NEXT: s_lshl_b32 s5, s5, 8 ; GFX9-NEXT: s_and_b32 s7, s11, 0xff ; GFX9-NEXT: s_or_b32 s5, s10, s5 -; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX9-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX9-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX9-NEXT: s_lshl_b32 s7, 
s7, 16 ; GFX9-NEXT: s_or_b32 s5, s5, s7 ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 @@ -2214,61 +2208,61 @@ ; GFX10-NEXT: s_and_b32 s6, s6, 0xff ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX10-NEXT: s_bfe_u32 s10, 8, 0x100000 ; GFX10-NEXT: s_lshr_b32 s8, s0, 24 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: s_lshl_b32 s6, s6, s10 +; GFX10-NEXT: s_lshl_b32 s6, s6, 8 ; GFX10-NEXT: s_and_b32 s7, s7, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s6 -; GFX10-NEXT: s_bfe_u32 s6, s7, 0x100000 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s7 ; GFX10-NEXT: s_lshr_b32 s7, s4, 8 +; GFX10-NEXT: s_lshr_b32 s10, s4, 16 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: s_and_b32 s7, s7, 0xff -; GFX10-NEXT: s_lshr_b32 s11, s4, 16 -; GFX10-NEXT: s_lshr_b32 s12, s4, 24 +; GFX10-NEXT: s_lshr_b32 s11, s4, 24 +; GFX10-NEXT: s_and_b32 s4, s4, 0xff ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: s_and_b32 s4, s4, 0xff -; GFX10-NEXT: s_lshl_b32 s7, s7, s10 -; GFX10-NEXT: s_lshr_b32 s13, s5, 8 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 +; GFX10-NEXT: s_lshr_b32 s12, s5, 8 +; GFX10-NEXT: s_or_b32 s4, s4, s7 ; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 -; GFX10-NEXT: s_or_b32 s4, s4, s7 -; GFX10-NEXT: s_and_b32 s7, s11, 0xff -; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX10-NEXT: s_and_b32 s7, s10, 0xff +; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX10-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX10-NEXT: s_and_b32 s5, s5, 0xff ; GFX10-NEXT: s_lshl_b32 s7, s7, 16 +; GFX10-NEXT: s_lshl_b32 s5, s5, 8 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_lshl_b32 s5, s5, s10 ; GFX10-NEXT: s_or_b32 s4, s4, s7 -; GFX10-NEXT: s_and_b32 s7, s13, 0xff -; GFX10-NEXT: s_or_b32 s5, s12, s5 -; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX10-NEXT: s_bfe_u32 s5, s5, 
0x100000 +; GFX10-NEXT: s_and_b32 s7, s12, 0xff +; GFX10-NEXT: s_or_b32 s5, s11, s5 +; GFX10-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX10-NEXT: s_lshl_b32 s7, s7, 16 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: s_lshl_b32 s7, s7, 16 -; GFX10-NEXT: s_lshr_b32 s9, s1, 8 ; GFX10-NEXT: s_or_b32 s5, s5, s7 +; GFX10-NEXT: s_lshr_b32 s9, s1, 8 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_lshl_b32 s1, s1, 8 ; GFX10-NEXT: s_and_b32 s7, s9, 0xff -; GFX10-NEXT: s_lshl_b32 s1, s1, s10 -; GFX10-NEXT: s_lshr_b32 s9, s2, 16 ; GFX10-NEXT: s_or_b32 s1, s8, s1 ; GFX10-NEXT: s_lshr_b32 s8, s2, 8 +; GFX10-NEXT: s_lshr_b32 s9, s2, 16 +; GFX10-NEXT: s_and_b32 s8, s8, 0xff ; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX10-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX10-NEXT: s_and_b32 s8, s8, 0xff -; GFX10-NEXT: s_lshr_b32 s11, s2, 24 +; GFX10-NEXT: s_lshr_b32 s10, s2, 24 ; GFX10-NEXT: s_and_b32 s2, s2, 0xff -; GFX10-NEXT: s_lshl_b32 s8, s8, s10 -; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX10-NEXT: s_lshl_b32 s8, s8, 8 +; GFX10-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX10-NEXT: s_or_b32 s2, s2, s8 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s5, v1 ; GFX10-NEXT: s_lshr_b32 s4, s3, 8 @@ -2277,20 +2271,20 @@ ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 -; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX10-NEXT: s_lshl_b32 s3, s3, s10 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX10-NEXT: 
s_lshl_b32 s5, s5, 16 -; GFX10-NEXT: s_or_b32 s3, s11, s3 +; GFX10-NEXT: s_or_b32 s3, s10, s3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX10-NEXT: s_or_b32 s2, s2, s5 -; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 @@ -2300,17 +2294,16 @@ ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 23, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX10-NEXT: s_lshl_b32 s6, s6, 16 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_lshl_b32 s7, s7, 16 +; GFX10-NEXT: s_or_b32 s0, s0, s6 ; GFX10-NEXT: v_lshrrev_b32_e64 v2, v2, s2 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX10-NEXT: s_lshr_b32 s2, s3, 1 -; GFX10-NEXT: s_or_b32 s0, s0, s6 ; GFX10-NEXT: s_or_b32 s1, s1, s7 ; GFX10-NEXT: v_lshl_or_b32 v0, s0, v0, v2 ; GFX10-NEXT: v_lshrrev_b32_e64 v3, v3, s2 @@ -2335,95 +2328,94 @@ ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 ; GFX11-NEXT: s_lshr_b32 s6, s0, 8 -; GFX11-NEXT: s_bfe_u32 s9, 8, 0x100000 +; GFX11-NEXT: s_lshr_b32 s7, s0, 16 ; GFX11-NEXT: s_and_b32 s6, s6, 0xff ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX11-NEXT: s_lshr_b32 s7, s0, 16 ; GFX11-NEXT: s_lshr_b32 s8, s0, 24 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_lshl_b32 s6, s6, s9 -; GFX11-NEXT: s_lshr_b32 s11, s4, 24 +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-NEXT: s_lshr_b32 s10, s4, 24 ; GFX11-NEXT: 
s_or_b32 s0, s0, s6 ; GFX11-NEXT: s_and_b32 s6, s7, 0xff -; GFX11-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_dual_mul_f32 v0, 0x4f7ffffe, v0 :: v_dual_mul_f32 v1, 0x4f7ffffe, v1 -; GFX11-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX11-NEXT: s_lshr_b32 s7, s4, 16 ; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_lshr_b32 s7, s4, 16 +; GFX11-NEXT: s_or_b32 s0, s0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX11-NEXT: s_or_b32 s0, s0, s6 ; GFX11-NEXT: s_lshr_b32 s6, s4, 8 ; GFX11-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-NEXT: s_and_b32 s6, s6, 0xff ; GFX11-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 ; GFX11-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 -; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 ; GFX11-NEXT: s_and_b32 s7, s7, 0xff -; GFX11-NEXT: s_lshl_b32 s6, s6, s9 -; GFX11-NEXT: s_lshr_b32 s12, s5, 8 ; GFX11-NEXT: s_or_b32 s4, s4, s6 -; GFX11-NEXT: s_bfe_u32 s6, s7, 0x100000 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s7 +; GFX11-NEXT: s_lshr_b32 s11, s5, 8 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX11-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX11-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX11-NEXT: s_lshl_b32 s6, s6, 16 ; GFX11-NEXT: s_and_b32 s5, s5, 0xff ; GFX11-NEXT: s_or_b32 s4, s4, s6 -; GFX11-NEXT: s_lshl_b32 s5, s5, s9 -; GFX11-NEXT: s_and_b32 s6, s12, 0xff -; GFX11-NEXT: s_or_b32 s5, s11, s5 +; GFX11-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-NEXT: s_and_b32 s6, s11, 0xff +; GFX11-NEXT: s_or_b32 s5, s10, s5 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX11-NEXT: v_mul_hi_u32 v2, v1, v3 -; GFX11-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX11-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_lshr_b32 s9, s1, 8 ; GFX11-NEXT: v_mul_hi_u32 v0, s4, v0 ; 
GFX11-NEXT: s_or_b32 s5, s5, s6 -; GFX11-NEXT: s_lshr_b32 s10, s1, 8 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_and_b32 s7, s9, 0xff ; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX11-NEXT: s_and_b32 s7, s10, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s1, s9 -; GFX11-NEXT: s_bfe_u32 s6, s7, 0x100000 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s7 +; GFX11-NEXT: s_lshr_b32 s7, s2, 8 ; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX11-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX11-NEXT: s_lshr_b32 s7, s2, 8 ; GFX11-NEXT: s_or_b32 s1, s8, s1 ; GFX11-NEXT: s_lshr_b32 s8, s2, 16 ; GFX11-NEXT: s_and_b32 s7, s7, 0xff -; GFX11-NEXT: s_lshr_b32 s10, s3, 8 -; GFX11-NEXT: s_lshl_b32 s7, s7, s9 +; GFX11-NEXT: s_lshr_b32 s9, s3, 8 +; GFX11-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff ; GFX11-NEXT: v_sub_nc_u32_e32 v0, s4, v0 ; GFX11-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX11-NEXT: s_lshr_b32 s4, s2, 24 ; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 ; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX11-NEXT: s_or_b32 s2, s2, s7 -; GFX11-NEXT: s_lshl_b32 s3, s3, s9 +; GFX11-NEXT: s_or_b32 s3, s4, s3 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, s5, v1 ; GFX11-NEXT: s_and_b32 s5, s8, 0xff ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX11-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX11-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 ; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 ; GFX11-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-NEXT: s_or_b32 s3, s4, s3 +; GFX11-NEXT: s_and_b32 s4, s9, 0xff ; GFX11-NEXT: s_or_b32 s2, s2, s5 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX11-NEXT: s_and_b32 s4, s10, 0xff ; GFX11-NEXT: s_lshr_b32 s2, s2, 1 -; 
GFX11-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX11-NEXT: s_lshl_b32 s6, s6, 16 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v0 @@ -2433,7 +2425,7 @@ ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v1 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX11-NEXT: v_lshrrev_b32_e64 v2, v2, s2 -; GFX11-NEXT: s_bfe_u32 s2, s3, 0x100000 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s3 ; GFX11-NEXT: s_lshl_b32 s3, s4, 16 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX11-NEXT: s_or_b32 s2, s2, s3 @@ -2892,7 +2884,7 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_alignbit_b32 v1, v0, v1, 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -2901,7 +2893,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_alignbit_b32 v1, v0, v1, 1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2910,7 +2902,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_alignbit_b32 v1, v0, v1, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: v_alignbit_b32 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2920,7 +2912,7 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_alignbit_b32 v1, v0, v1, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: 
v_alignbit_b32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2930,7 +2922,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_alignbit_b32 v1, v0, v1, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX11-NEXT: v_not_b32_e32 v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_alignbit_b32 v0, v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3016,7 +3008,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_alignbit_b32 v1, s0, v1, 1 ; GFX6-NEXT: s_lshr_b32 s0, s0, 1 -; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX6-NEXT: v_not_b32_e32 v0, v0 ; GFX6-NEXT: v_alignbit_b32 v0, s0, v1, v0 ; GFX6-NEXT: ; return to shader part epilog ; @@ -3025,7 +3017,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_alignbit_b32 v1, s0, v1, 1 ; GFX8-NEXT: s_lshr_b32 s0, s0, 1 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_alignbit_b32 v0, s0, v1, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -3034,14 +3026,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, 1 ; GFX9-NEXT: s_lshr_b32 s0, s0, 1 -; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: v_alignbit_b32 v0, s0, v1, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshl_i32_ssv: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_alignbit_b32 v1, s0, s1, 1 -; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX10-NEXT: v_not_b32_e32 v0, v0 ; GFX10-NEXT: s_lshr_b32 s0, s0, 1 ; GFX10-NEXT: v_alignbit_b32 v0, s0, v1, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -3049,7 +3041,7 @@ ; GFX11-LABEL: v_fshl_i32_ssv: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_alignbit_b32 v1, s0, s1, 1 -; GFX11-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX11-NEXT: v_not_b32_e32 v0, v0 ; GFX11-NEXT: s_lshr_b32 s0, s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_alignbit_b32 v0, s0, v1, v0 @@ -3166,11 +3158,11 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX6-NEXT: v_alignbit_b32 v2, v0, v2, 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX6-NEXT: v_not_b32_e32 v4, v4 ; GFX6-NEXT: v_alignbit_b32 v0, v0, v2, v4 ; GFX6-NEXT: v_alignbit_b32 v2, v1, v3, 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX6-NEXT: v_not_b32_e32 v3, v5 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v2, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -3179,11 +3171,11 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_alignbit_b32 v2, v0, v2, 1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, v4 ; GFX8-NEXT: v_alignbit_b32 v2, v1, v3, 1 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX8-NEXT: v_not_b32_e32 v3, v5 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -3192,11 +3184,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_alignbit_b32 v2, v0, v2, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v4 ; GFX9-NEXT: v_alignbit_b32 v0, v0, v2, v4 ; GFX9-NEXT: v_alignbit_b32 v2, v1, v3, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX9-NEXT: v_not_b32_e32 v3, v5 ; GFX9-NEXT: v_alignbit_b32 v1, v1, v2, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3206,10 +3198,10 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_alignbit_b32 v2, v0, v2, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: v_alignbit_b32 v3, v1, v3, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -3220,10 
+3212,10 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_alignbit_b32 v2, v0, v2, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: v_alignbit_b32 v3, v1, v3, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX11-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_alignbit_b32 v0, v0, v2, v4 ; GFX11-NEXT: v_alignbit_b32 v1, v1, v3, v5 @@ -3238,15 +3230,15 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_alignbit_b32 v3, v0, v3, 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX6-NEXT: v_not_b32_e32 v6, v6 ; GFX6-NEXT: v_alignbit_b32 v0, v0, v3, v6 ; GFX6-NEXT: v_alignbit_b32 v3, v1, v4, 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v7 +; GFX6-NEXT: v_not_b32_e32 v4, v7 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v3, v4 ; GFX6-NEXT: v_alignbit_b32 v3, v2, v5, 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v8 +; GFX6-NEXT: v_not_b32_e32 v4, v8 ; GFX6-NEXT: v_alignbit_b32 v2, v2, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -3255,15 +3247,15 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_alignbit_b32 v3, v0, v3, 1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX8-NEXT: v_not_b32_e32 v6, v6 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, v6 ; GFX8-NEXT: v_alignbit_b32 v3, v1, v4, 1 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v7 +; GFX8-NEXT: v_not_b32_e32 v4, v7 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v3, v4 ; GFX8-NEXT: v_alignbit_b32 v3, v2, v5, 1 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v8 +; GFX8-NEXT: v_not_b32_e32 v4, v8 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v3, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -3272,15 +3264,15 @@ ; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_alignbit_b32 v3, v0, v3, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX9-NEXT: v_not_b32_e32 v6, v6 ; GFX9-NEXT: v_alignbit_b32 v0, v0, v3, v6 ; GFX9-NEXT: v_alignbit_b32 v3, v1, v4, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v7 +; GFX9-NEXT: v_not_b32_e32 v4, v7 ; GFX9-NEXT: v_alignbit_b32 v1, v1, v3, v4 ; GFX9-NEXT: v_alignbit_b32 v3, v2, v5, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v8 +; GFX9-NEXT: v_not_b32_e32 v4, v8 ; GFX9-NEXT: v_alignbit_b32 v2, v2, v3, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3290,13 +3282,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_alignbit_b32 v3, v0, v3, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX10-NEXT: v_not_b32_e32 v6, v6 ; GFX10-NEXT: v_alignbit_b32 v4, v1, v4, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX10-NEXT: v_not_b32_e32 v7, v7 ; GFX10-NEXT: v_alignbit_b32 v5, v2, v5, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX10-NEXT: v_not_b32_e32 v8, v8 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v3, v6 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v4, v7 ; GFX10-NEXT: v_alignbit_b32 v2, v2, v5, v8 @@ -3308,13 +3300,13 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_alignbit_b32 v3, v0, v3, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX11-NEXT: v_not_b32_e32 v6, v6 ; GFX11-NEXT: v_alignbit_b32 v4, v1, v4, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX11-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX11-NEXT: v_not_b32_e32 v7, v7 ; GFX11-NEXT: v_alignbit_b32 v5, v2, v5, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX11-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX11-NEXT: v_not_b32_e32 v8, v8 ; GFX11-NEXT: v_alignbit_b32 v0, v0, v3, v6 ; GFX11-NEXT: v_alignbit_b32 v1, v1, v4, 
v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) @@ -3330,19 +3322,19 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_alignbit_b32 v4, v0, v4, 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX6-NEXT: v_not_b32_e32 v8, v8 ; GFX6-NEXT: v_alignbit_b32 v0, v0, v4, v8 ; GFX6-NEXT: v_alignbit_b32 v4, v1, v5, 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_xor_b32_e32 v5, -1, v9 +; GFX6-NEXT: v_not_b32_e32 v5, v9 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v4, v5 ; GFX6-NEXT: v_alignbit_b32 v4, v2, v6, 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX6-NEXT: v_xor_b32_e32 v5, -1, v10 +; GFX6-NEXT: v_not_b32_e32 v5, v10 ; GFX6-NEXT: v_alignbit_b32 v2, v2, v4, v5 ; GFX6-NEXT: v_alignbit_b32 v4, v3, v7, 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX6-NEXT: v_xor_b32_e32 v5, -1, v11 +; GFX6-NEXT: v_not_b32_e32 v5, v11 ; GFX6-NEXT: v_alignbit_b32 v3, v3, v4, v5 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -3351,19 +3343,19 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_alignbit_b32 v4, v0, v4, 1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX8-NEXT: v_not_b32_e32 v8, v8 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v4, v8 ; GFX8-NEXT: v_alignbit_b32 v4, v1, v5, 1 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v9 +; GFX8-NEXT: v_not_b32_e32 v5, v9 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, v5 ; GFX8-NEXT: v_alignbit_b32 v4, v2, v6, 1 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v10 +; GFX8-NEXT: v_not_b32_e32 v5, v10 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v4, v5 ; GFX8-NEXT: v_alignbit_b32 v4, v3, v7, 1 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v11 +; GFX8-NEXT: v_not_b32_e32 v5, v11 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v4, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -3372,19 +3364,19 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_alignbit_b32 v4, v0, v4, 
1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX9-NEXT: v_not_b32_e32 v8, v8 ; GFX9-NEXT: v_alignbit_b32 v0, v0, v4, v8 ; GFX9-NEXT: v_alignbit_b32 v4, v1, v5, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_xor_b32_e32 v5, -1, v9 +; GFX9-NEXT: v_not_b32_e32 v5, v9 ; GFX9-NEXT: v_alignbit_b32 v1, v1, v4, v5 ; GFX9-NEXT: v_alignbit_b32 v4, v2, v6, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_xor_b32_e32 v5, -1, v10 +; GFX9-NEXT: v_not_b32_e32 v5, v10 ; GFX9-NEXT: v_alignbit_b32 v2, v2, v4, v5 ; GFX9-NEXT: v_alignbit_b32 v4, v3, v7, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_xor_b32_e32 v5, -1, v11 +; GFX9-NEXT: v_not_b32_e32 v5, v11 ; GFX9-NEXT: v_alignbit_b32 v3, v3, v4, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3394,16 +3386,16 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_alignbit_b32 v4, v0, v4, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX10-NEXT: v_not_b32_e32 v8, v8 ; GFX10-NEXT: v_alignbit_b32 v5, v1, v5, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v9 +; GFX10-NEXT: v_not_b32_e32 v9, v9 ; GFX10-NEXT: v_alignbit_b32 v6, v2, v6, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v10 +; GFX10-NEXT: v_not_b32_e32 v10, v10 ; GFX10-NEXT: v_alignbit_b32 v7, v3, v7, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v11 +; GFX10-NEXT: v_not_b32_e32 v11, v11 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v4, v8 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v5, v9 ; GFX10-NEXT: v_alignbit_b32 v2, v2, v6, v10 @@ -3416,16 +3408,16 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_alignbit_b32 v4, v0, v4, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX11-NEXT: v_not_b32_e32 v8, v8 ; GFX11-NEXT: v_alignbit_b32 v5, v1, v5, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX11-NEXT: v_xor_b32_e32 v9, -1, v9 
+; GFX11-NEXT: v_not_b32_e32 v9, v9 ; GFX11-NEXT: v_alignbit_b32 v6, v2, v6, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX11-NEXT: v_xor_b32_e32 v10, -1, v10 +; GFX11-NEXT: v_not_b32_e32 v10, v10 ; GFX11-NEXT: v_alignbit_b32 v7, v3, v7, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX11-NEXT: v_xor_b32_e32 v11, -1, v11 +; GFX11-NEXT: v_not_b32_e32 v11, v11 ; GFX11-NEXT: v_alignbit_b32 v0, v0, v4, v8 ; GFX11-NEXT: v_alignbit_b32 v1, v1, v5, v9 ; GFX11-NEXT: v_alignbit_b32 v2, v2, v6, v10 @@ -3441,9 +3433,9 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s3, s2, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s2 -; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX6-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX6-NEXT: s_bfe_u32 s1, s1, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: s_lshl_b32 s0, s0, s3 ; GFX6-NEXT: s_lshr_b32 s1, s1, s2 ; GFX6-NEXT: s_or_b32 s0, s0, s1 @@ -3452,13 +3444,12 @@ ; GFX8-LABEL: s_fshl_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s3, s2, 15 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_andn2_b32 s2, 15, s2 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, s3 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, 1, 0x100000 -; GFX8-NEXT: s_lshr_b32 s1, s1, s3 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -3466,13 +3457,12 @@ ; GFX9-LABEL: s_fshl_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s3, s2, 15 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX9-NEXT: s_andn2_b32 s2, 15, s2 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, s3 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_bfe_u32 s3, 1, 0x100000 -; 
GFX9-NEXT: s_lshr_b32 s1, s1, s3 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog @@ -3481,11 +3471,10 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s3, s2, 15 ; GFX10-NEXT: s_andn2_b32 s2, 15, s2 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s4, 1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX10-NEXT: s_lshr_b32 s1, s1, s4 -; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX10-NEXT: s_lshl_b32 s0, s0, s3 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -3495,11 +3484,10 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s3, s2, 15 ; GFX11-NEXT: s_and_not1_b32 s2, 15, s2 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX11-NEXT: s_bfe_u32 s4, 1, 0x100000 -; GFX11-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX11-NEXT: s_lshr_b32 s1, s1, s4 -; GFX11-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_lshl_b32 s0, s0, s3 ; GFX11-NEXT: s_lshr_b32 s1, s1, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -3519,41 +3507,33 @@ ; ; GFX8-LABEL: s_fshl_i16_4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s2, 4, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, 12, 0x100000 -; GFX8-NEXT: s_lshr_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-NEXT: s_lshr_b32 s1, s1, 12 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_i16_4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_bfe_u32 s2, 4, 0x100000 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: 
s_bfe_u32 s2, 12, 0x100000 -; GFX9-NEXT: s_lshr_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, 4 +; GFX9-NEXT: s_lshr_b32 s1, s1, 12 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshl_i16_4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_u32 s2, 4, 0x100000 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s3, 12, 0x100000 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_lshr_b32 s1, s1, s3 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_lshl_b32 s0, s0, 4 +; GFX10-NEXT: s_lshr_b32 s1, s1, 12 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i16_4: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_bfe_u32 s2, 4, 0x100000 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX11-NEXT: s_bfe_u32 s3, 12, 0x100000 -; GFX11-NEXT: s_lshl_b32 s0, s0, s2 -; GFX11-NEXT: s_lshr_b32 s1, s1, s3 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_lshl_b32 s0, s0, 4 +; GFX11-NEXT: s_lshr_b32 s1, s1, 12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -3571,41 +3551,33 @@ ; ; GFX8-LABEL: s_fshl_i16_5: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s2, 5, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, 11, 0x100000 -; GFX8-NEXT: s_lshr_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 5 +; GFX8-NEXT: s_lshr_b32 s1, s1, 11 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_i16_5: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_bfe_u32 s2, 5, 0x100000 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_bfe_u32 s2, 11, 0x100000 -; GFX9-NEXT: s_lshr_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, 5 +; GFX9-NEXT: s_lshr_b32 s1, s1, 11 ; GFX9-NEXT: 
s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshl_i16_5: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_u32 s2, 5, 0x100000 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s3, 11, 0x100000 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_lshr_b32 s1, s1, s3 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_lshl_b32 s0, s0, 5 +; GFX10-NEXT: s_lshr_b32 s1, s1, 11 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i16_5: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_bfe_u32 s2, 5, 0x100000 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX11-NEXT: s_bfe_u32 s3, 11, 0x100000 -; GFX11-NEXT: s_lshl_b32 s0, s0, s2 -; GFX11-NEXT: s_lshr_b32 s1, s1, s3 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-NEXT: s_lshr_b32 s1, s1, 11 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -3620,9 +3592,9 @@ ; GFX6-NEXT: v_and_b32_e32 v3, 15, v2 ; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX6-NEXT: v_bfe_u32 v3, v3, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 15 -; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3783,10 +3755,10 @@ ; GFX6-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshl_b32_e32 v1, s0, v1 ; GFX6-NEXT: s_bfe_u32 s0, s1, 0xf0001 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: ; return to shader part epilog @@ -3796,10 +3768,9 @@ ; GFX8-NEXT: v_and_b32_e32 v1, 
15, v0 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: v_lshlrev_b16_e64 v1, v1, s0 -; GFX8-NEXT: s_bfe_u32 s0, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, 1, 0x100000 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s1 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX8-NEXT: s_lshr_b32 s0, s0, s1 +; GFX8-NEXT: s_lshr_b32 s0, s0, 1 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: ; return to shader part epilog @@ -3809,10 +3780,9 @@ ; GFX9-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX9-NEXT: v_lshlrev_b16_e64 v1, v1, s0 -; GFX9-NEXT: s_bfe_u32 s0, s1, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, 1, 0x100000 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s1 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-NEXT: s_lshr_b32 s0, s0, s1 +; GFX9-NEXT: s_lshr_b32 s0, s0, 1 ; GFX9-NEXT: v_lshrrev_b16_e64 v0, v0, s0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -3821,9 +3791,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s2, 1, 0x100000 -; GFX10-NEXT: s_lshr_b32 s1, s1, s2 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_lshr_b32 s1, s1, 1 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v0, s0 ; GFX10-NEXT: v_lshrrev_b16 v1, v1, s1 @@ -3834,11 +3803,11 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: v_xor_b32_e32 v1, -1, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX11-NEXT: s_bfe_u32 s2, 1, 0x100000 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_lshr_b32 s1, s1, s2 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_lshr_b32 s1, s1, 1 ; GFX11-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: 
v_lshlrev_b16 v0, v0, s0 ; GFX11-NEXT: v_lshrrev_b16 v1, v1, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3854,9 +3823,9 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s2, s1, 15 ; GFX6-NEXT: s_andn2_b32 s1, 15, s1 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: v_bfe_u32 v0, v0, 1, 15 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: s_lshl_b32 s0, s0, s2 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s1, v0 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 @@ -3866,7 +3835,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s2, s1, 15 ; GFX8-NEXT: s_andn2_b32 s1, 15, s1 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 1, v0 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, s1, v0 @@ -3877,7 +3846,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s2, s1, 15 ; GFX9-NEXT: s_andn2_b32 s1, 15, s1 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 1, v0 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, s1, v0 @@ -3889,7 +3858,7 @@ ; GFX10-NEXT: v_lshrrev_b16 v0, 1, v0 ; GFX10-NEXT: s_andn2_b32 s2, 15, s1 ; GFX10-NEXT: s_and_b32 s1, s1, 15 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: v_lshrrev_b16 v0, s2, v0 ; GFX10-NEXT: s_lshl_b32 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 @@ -3901,7 +3870,7 @@ ; GFX11-NEXT: s_and_not1_b32 s2, 15, s1 ; GFX11-NEXT: s_and_b32 s1, s1, 15 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: v_lshrrev_b16 v0, s2, v0 ; GFX11-NEXT: s_lshl_b32 s0, s0, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -3917,9 +3886,9 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s2, s1, 15 ; GFX6-NEXT: s_andn2_b32 s1, 15, s1 -; 
GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s2, v0 ; GFX6-NEXT: s_lshr_b32 s0, s0, s1 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 @@ -3929,11 +3898,10 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s2, s1, 15 ; GFX8-NEXT: s_andn2_b32 s1, 15, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshr_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, s2, v0 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, 1, 0x100000 -; GFX8-NEXT: s_lshr_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: ; return to shader part epilog @@ -3942,11 +3910,10 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s2, s1, 15 ; GFX9-NEXT: s_andn2_b32 s1, 15, s1 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX9-NEXT: s_lshr_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, s2, v0 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: s_bfe_u32 s2, 1, 0x100000 -; GFX9-NEXT: s_lshr_b32 s0, s0, s2 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX9-NEXT: s_lshr_b32 s0, s0, s1 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -3955,11 +3922,10 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s2, s1, 15 ; GFX10-NEXT: s_andn2_b32 s1, 15, s1 -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX10-NEXT: s_bfe_u32 s3, 1, 0x100000 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10-NEXT: v_lshlrev_b16 v0, s2, v0 -; GFX10-NEXT: s_lshr_b32 s0, s0, s3 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_lshr_b32 s0, s0, 1 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshr_b32 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -3968,11 +3934,10 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: 
s_and_b32 s2, s1, 15 ; GFX11-NEXT: s_and_not1_b32 s1, 15, s1 -; GFX11-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX11-NEXT: s_bfe_u32 s3, 1, 0x100000 +; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX11-NEXT: v_lshlrev_b16 v0, s2, v0 -; GFX11-NEXT: s_lshr_b32 s0, s0, s3 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_lshr_b32 s0, s0, 1 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b32 s0, s0, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -3988,51 +3953,50 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s6, s4, 15 ; GFX6-NEXT: s_andn2_b32 s4, 15, s4 -; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_lshl_b32 s0, s0, s6 ; GFX6-NEXT: s_lshr_b32 s2, s2, s4 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s5, 15 ; GFX6-NEXT: s_andn2_b32 s4, 15, s5 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, s2 ; GFX6-NEXT: s_bfe_u32 s2, s3, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s3, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s3, 0xffff, s4 ; GFX6-NEXT: s_lshr_b32 s2, s2, s3 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshl_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s6, s2, 15 -; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_lshr_b32 s5, s2, 16 +; GFX8-NEXT: s_and_b32 s6, s2, 15 ; GFX8-NEXT: s_andn2_b32 s2, 15, s2 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX8-NEXT: s_lshr_b32 s1, s1, 
1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s6 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s6, 1, 0x100000 -; GFX8-NEXT: s_lshr_b32 s1, s1, s6 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s5, 15 ; GFX8-NEXT: s_andn2_b32 s2, 15, s5 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_lshr_b32 s3, s4, s6 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_lshr_b32 s3, s4, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshr_b32 s2, s3, s2 ; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -4118,19 +4082,19 @@ ; GFX6-NEXT: v_and_b32_e32 v6, 15, v4 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 15 -; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v5 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 15 -; GFX6-NEXT: v_bfe_u32 v3, v4, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4154,8 +4118,8 @@ ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, 
v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v3, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4206,17 +4170,13 @@ ; GFX6-LABEL: v_fshl_v2i16_4_8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s4, 4, 0x100000 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 15 -; GFX6-NEXT: s_bfe_u32 s4, 11, 0x100000 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2 -; GFX6-NEXT: s_bfe_u32 s4, 8, 0x100000 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 11, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 15 -; GFX6-NEXT: s_bfe_u32 s4, 7, 0x100000 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 7, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -4231,8 +4191,8 @@ ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4274,23 +4234,23 @@ ; GFX6-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX6-NEXT: 
v_xor_b32_e32 v0, -1, v0 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 ; GFX6-NEXT: s_bfe_u32 s0, s2, 0xf0001 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v1 ; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: s_bfe_u32 s0, s3, 0xf0001 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshl_b32_e32 v2, s1, v2 ; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -4299,25 +4259,24 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s0 -; GFX8-NEXT: s_bfe_u32 s0, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, 1, 0x100000 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s1 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX8-NEXT: s_lshr_b32 s0, s0, s1 +; GFX8-NEXT: s_lshr_b32 s0, s0, 1 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v1 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX8-NEXT: s_lshr_b32 s0, s3, s1 +; GFX8-NEXT: s_lshr_b32 s0, s3, 1 ; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s0 ; GFX8-NEXT: 
v_or_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog ; @@ -4377,22 +4336,22 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s4, s2, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s2 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: v_bfe_u32 v0, v0, 1, 15 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: s_lshl_b32 s0, s0, s4 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s3, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s3 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: s_lshl_b32 s0, s1, s0 ; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 15 -; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s2 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -4402,7 +4361,7 @@ ; GFX8-NEXT: s_and_b32 s4, s1, 15 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_andn2_b32 s1, 15, s1 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 @@ -4411,13 +4370,13 @@ ; GFX8-NEXT: s_and_b32 s0, s3, 15 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-NEXT: s_andn2_b32 s1, 15, s3 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: 
s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: s_lshl_b32 s0, s2, s0 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, s1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog ; @@ -4474,49 +4433,48 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s4, s2, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s2 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX6-NEXT: s_lshr_b32 s0, s0, s2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s3, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s3 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1 ; GFX6-NEXT: s_bfe_u32 s0, s1, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s2 ; GFX6-NEXT: s_lshr_b32 s0, s0, s1 ; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshl_v2i16_vss: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s4, s1, 15 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_and_b32 s4, s1, 15 ; GFX8-NEXT: s_andn2_b32 s1, 15, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; 
GFX8-NEXT: s_lshr_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, s4, v0 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, 1, 0x100000 -; GFX8-NEXT: s_lshr_b32 s0, s0, s4 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 ; GFX8-NEXT: v_or_b32_e32 v1, s0, v1 ; GFX8-NEXT: s_and_b32 s0, s3, 15 ; GFX8-NEXT: s_andn2_b32 s1, 15, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: s_lshr_b32 s0, s2, s4 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_lshr_b32 s0, s2, 1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog ; @@ -4588,72 +4546,71 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s9, s6, 15 ; GFX6-NEXT: s_andn2_b32 s6, 15, s6 -; GFX6-NEXT: s_bfe_u32 s9, s9, 0x100000 +; GFX6-NEXT: s_and_b32 s9, 0xffff, s9 ; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX6-NEXT: s_lshl_b32 s0, s0, s9 ; GFX6-NEXT: s_lshr_b32 s3, s3, s6 ; GFX6-NEXT: s_or_b32 s0, s0, s3 ; GFX6-NEXT: s_and_b32 s3, s7, 15 ; GFX6-NEXT: s_andn2_b32 s6, 15, s7 -; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX6-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX6-NEXT: s_lshl_b32 s1, s1, s3 ; GFX6-NEXT: s_bfe_u32 s3, s4, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s4, s6, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s6 ; GFX6-NEXT: s_lshr_b32 s3, s3, s4 ; GFX6-NEXT: s_or_b32 s1, s1, s3 ; GFX6-NEXT: s_and_b32 s3, s8, 15 ; GFX6-NEXT: 
s_andn2_b32 s4, 15, s8 -; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX6-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX6-NEXT: s_lshl_b32 s2, s2, s3 ; GFX6-NEXT: s_bfe_u32 s3, s5, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_lshr_b32 s3, s3, s4 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshl_v3i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s9, s4, 15 -; GFX8-NEXT: s_bfe_u32 s9, s9, 0x100000 -; GFX8-NEXT: s_lshr_b32 s6, s0, 16 ; GFX8-NEXT: s_lshr_b32 s7, s2, 16 ; GFX8-NEXT: s_lshr_b32 s8, s4, 16 +; GFX8-NEXT: s_and_b32 s9, s4, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s4 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX8-NEXT: s_lshr_b32 s2, s2, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_lshr_b32 s6, s0, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s9 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s9, 1, 0x100000 -; GFX8-NEXT: s_lshr_b32 s2, s2, s9 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX8-NEXT: s_lshr_b32 s2, s2, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s2, s8, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s8 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s2, s6, s2 -; GFX8-NEXT: s_lshr_b32 s6, s7, s9 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_lshr_b32 s6, s7, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshr_b32 s4, s6, s4 ; GFX8-NEXT: s_or_b32 s2, s2, s4 ; GFX8-NEXT: s_and_b32 s4, s5, 15 ; GFX8-NEXT: s_andn2_b32 s5, 15, s5 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: 
s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 -; GFX8-NEXT: s_lshr_b32 s3, s3, s9 -; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000 +; GFX8-NEXT: s_lshr_b32 s3, s3, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s5 ; GFX8-NEXT: s_lshr_b32 s3, s3, s4 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_or_b32 s1, s1, s3 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_v3i16: @@ -4808,28 +4765,28 @@ ; GFX6-NEXT: v_and_b32_e32 v9, 15, v6 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 ; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_bfe_u32 v9, v9, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX6-NEXT: v_bfe_u32 v3, v3, 1, 15 -; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v9, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 15, v7 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v7 ; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_bfe_u32 v3, v3, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v3, v1 ; GFX6-NEXT: v_bfe_u32 v3, v4, 1, 15 -; GFX6-NEXT: v_bfe_u32 v4, v6, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 15, v8 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v8 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_bfe_u32 v3, v3, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_bfe_u32 v3, v5, 1, 15 -; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3 ; 
GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4859,11 +4816,11 @@ ; GFX8-NEXT: v_lshlrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v3 ; GFX8-NEXT: v_lshrrev_b16_e32 v2, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshl_v3i16: @@ -4936,95 +4893,94 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s12, s8, 15 ; GFX6-NEXT: s_andn2_b32 s8, 15, s8 -; GFX6-NEXT: s_bfe_u32 s12, s12, 0x100000 +; GFX6-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX6-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX6-NEXT: s_lshl_b32 s0, s0, s12 ; GFX6-NEXT: s_lshr_b32 s4, s4, s8 ; GFX6-NEXT: s_or_b32 s0, s0, s4 ; GFX6-NEXT: s_and_b32 s4, s9, 15 ; GFX6-NEXT: s_andn2_b32 s8, 15, s9 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, s4 ; GFX6-NEXT: s_bfe_u32 s4, s5, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s5, s8, 0x100000 +; GFX6-NEXT: s_and_b32 s5, 0xffff, s8 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 ; GFX6-NEXT: s_or_b32 s1, s1, s4 ; GFX6-NEXT: s_and_b32 s4, s10, 15 ; GFX6-NEXT: s_andn2_b32 s5, 15, s10 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_lshl_b32 s2, s2, s4 ; GFX6-NEXT: s_bfe_u32 s4, s6, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX6-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 ; GFX6-NEXT: s_or_b32 s2, s2, s4 ; GFX6-NEXT: s_and_b32 s4, s11, 15 ; GFX6-NEXT: s_andn2_b32 s5, 15, s11 -; GFX6-NEXT: s_bfe_u32 
s4, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_lshl_b32 s3, s3, s4 ; GFX6-NEXT: s_bfe_u32 s4, s7, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX6-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: s_or_b32 s3, s3, s4 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 -; GFX6-NEXT: s_bfe_u32 s2, s3, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s2 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s3 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshl_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s12, s4, 15 -; GFX8-NEXT: s_bfe_u32 s12, s12, 0x100000 -; GFX8-NEXT: s_lshr_b32 s6, s0, 16 ; GFX8-NEXT: s_lshr_b32 s8, s2, 16 ; GFX8-NEXT: s_lshr_b32 s10, s4, 16 +; GFX8-NEXT: s_and_b32 s12, s4, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s4 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s12, 0xffff, s12 +; GFX8-NEXT: s_lshr_b32 s2, s2, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_lshr_b32 s6, s0, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s12 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s12, 1, 0x100000 -; GFX8-NEXT: s_lshr_b32 s2, s2, s12 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX8-NEXT: s_lshr_b32 s2, s2, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s2, s10, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s10 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s2, s6, s2 -; GFX8-NEXT: s_lshr_b32 s6, s8, s12 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_lshr_b32 s6, s8, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshr_b32 s4, s6, s4 ; GFX8-NEXT: s_or_b32 s2, s2, s4 ; GFX8-NEXT: s_and_b32 s4, s5, 15 ; GFX8-NEXT: 
s_lshr_b32 s9, s3, 16 ; GFX8-NEXT: s_lshr_b32 s11, s5, 16 ; GFX8-NEXT: s_andn2_b32 s5, 15, s5 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 -; GFX8-NEXT: s_lshr_b32 s3, s3, s12 -; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000 +; GFX8-NEXT: s_lshr_b32 s3, s3, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s5 ; GFX8-NEXT: s_lshr_b32 s3, s3, s4 ; GFX8-NEXT: s_or_b32 s1, s1, s3 ; GFX8-NEXT: s_and_b32 s3, s11, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s11 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_lshr_b32 s5, s9, s12 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_lshr_b32 s5, s9, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshl_b32 s3, s7, s3 ; GFX8-NEXT: s_lshr_b32 s4, s5, s4 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_or_b32 s3, s3, s4 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s2, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog @@ -5166,37 +5122,37 @@ ; GFX6-NEXT: v_and_b32_e32 v12, 15, v8 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 -; GFX6-NEXT: v_bfe_u32 v12, v12, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX6-NEXT: v_bfe_u32 v4, v4, 1, 15 -; GFX6-NEXT: v_bfe_u32 v8, v8, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v12, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v9 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v9 ; 
GFX6-NEXT: v_and_b32_e32 v8, 15, v8 -; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 ; GFX6-NEXT: v_bfe_u32 v4, v5, 1, 15 -; GFX6-NEXT: v_bfe_u32 v5, v8, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v10 ; GFX6-NEXT: v_xor_b32_e32 v5, -1, v10 ; GFX6-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_bfe_u32 v4, v6, 1, 15 -; GFX6-NEXT: v_bfe_u32 v5, v5, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v11 ; GFX6-NEXT: v_xor_b32_e32 v5, -1, v11 ; GFX6-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_bfe_u32 v4, v7, 1, 15 -; GFX6-NEXT: v_bfe_u32 v5, v5, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -5236,9 +5192,10 @@ ; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b16_e32 v3, v6, v3 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -5403,7 +5360,7 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX6-NEXT: v_not_b32_e32 v4, v4 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 ; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v5 @@ -5416,7 +5373,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] ; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] @@ -5429,7 +5386,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v4 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] ; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] @@ -5442,7 +5399,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v5, -1, v4 +; GFX10-NEXT: v_not_b32_e32 v5, v4 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] ; GFX10-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX10-NEXT: v_and_b32_e32 v5, 63, v5 @@ -5456,7 +5413,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_xor_b32_e32 v5, -1, v4 +; GFX11-NEXT: v_not_b32_e32 v5, v4 ; GFX11-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] ; GFX11-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -5612,7 +5569,7 @@ ; GFX6-LABEL: v_fshl_i64_ssv: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_and_b32_e32 v1, 63, v0 -; GFX6-NEXT: v_xor_b32_e32 v0, 
-1, v0 +; GFX6-NEXT: v_not_b32_e32 v0, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 63, v0 ; GFX6-NEXT: v_lshl_b64 v[0:1], s[0:1], v1 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 @@ -5624,7 +5581,7 @@ ; GFX8-LABEL: v_fshl_i64_ssv: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_and_b32_e32 v1, 63, v0 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_and_b32_e32 v2, 63, v0 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 @@ -5636,7 +5593,7 @@ ; GFX9-LABEL: v_fshl_i64_ssv: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_and_b32_e32 v1, 63, v0 -; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: v_and_b32_e32 v2, 63, v0 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 @@ -5647,7 +5604,7 @@ ; ; GFX10-LABEL: v_fshl_i64_ssv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX10-NEXT: v_not_b32_e32 v1, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 63, v0 ; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 ; GFX10-NEXT: v_and_b32_e32 v2, 63, v1 @@ -5659,7 +5616,7 @@ ; ; GFX11-LABEL: v_fshl_i64_ssv: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX11-NEXT: v_not_b32_e32 v1, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 63, v0 ; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -5890,12 +5847,12 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX6-NEXT: v_not_b32_e32 v8, v8 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], 1 ; GFX6-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v9 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v8 -; GFX6-NEXT: v_xor_b32_e32 v8, -1, v10 +; GFX6-NEXT: v_not_b32_e32 v8, v10 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], 1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 63, v10 @@ -5911,12 +5868,12 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX8-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX8-NEXT: v_not_b32_e32 v8, v8 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX8-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] -; GFX8-NEXT: v_xor_b32_e32 v8, -1, v10 +; GFX8-NEXT: v_not_b32_e32 v8, v10 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 63, v10 @@ -5932,12 +5889,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX9-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX9-NEXT: v_not_b32_e32 v8, v8 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] -; GFX9-NEXT: v_xor_b32_e32 v8, -1, v10 +; GFX9-NEXT: v_not_b32_e32 v8, v10 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] ; GFX9-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX9-NEXT: v_and_b32_e32 v4, 63, v10 @@ -5953,8 +5910,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v8 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v10 +; GFX10-NEXT: v_not_b32_e32 v9, v8 +; GFX10-NEXT: v_not_b32_e32 v11, v10 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX10-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] ; GFX10-NEXT: v_and_b32_e32 v8, 63, v8 @@ -5975,8 +5932,8 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_xor_b32_e32 v9, -1, v8 -; GFX11-NEXT: v_xor_b32_e32 v11, -1, v10 +; GFX11-NEXT: v_not_b32_e32 v9, v8 +; GFX11-NEXT: v_not_b32_e32 v11, v10 ; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX11-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] ; GFX11-NEXT: v_and_b32_e32 v8, 63, v8 @@ -6245,7 +6202,7 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX6-NEXT: v_not_b32_e32 v8, v8 ; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v8 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v14 ; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v14 @@ -6293,7 +6250,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX8-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX8-NEXT: v_not_b32_e32 v8, v8 ; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v8 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v14 ; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v14 @@ -6341,7 +6298,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX9-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX9-NEXT: v_not_b32_e32 v8, v8 ; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v8 ; GFX9-NEXT: v_sub_u32_e32 v8, 64, v14 ; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v14 @@ -6389,7 +6346,7 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_and_b32_e32 v18, 0x7f, v8 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX10-NEXT: v_not_b32_e32 v8, v8 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX10-NEXT: v_lshrrev_b64 v[12:13], 1, v[6:7] ; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v18 @@ -6438,7 +6395,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX11-NEXT: v_and_b32_e32 v18, 0x7f, v8 -; GFX11-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX11-NEXT: v_not_b32_e32 v8, v8 ; GFX11-NEXT: v_lshrrev_b64 v[12:13], 1, v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v18 @@ -6491,7 +6448,7 @@ ; GFX6-LABEL: v_fshl_i128_ssv: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX6-NEXT: v_not_b32_e32 v0, v0 ; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v6 ; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v0 @@ 
-6543,7 +6500,7 @@ ; GFX8-LABEL: v_fshl_i128_ssv: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v0 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v6 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] @@ -6595,7 +6552,7 @@ ; GFX9-LABEL: v_fshl_i128_ssv: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v0 ; GFX9-NEXT: v_sub_u32_e32 v0, 64, v6 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] @@ -6647,7 +6604,7 @@ ; GFX10-LABEL: v_fshl_i128_ssv: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v0 -; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX10-NEXT: v_not_b32_e32 v0, v0 ; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 ; GFX10-NEXT: s_lshl_b32 s9, s6, 31 @@ -6695,7 +6652,7 @@ ; GFX11-LABEL: v_fshl_i128_ssv: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_and_b32_e32 v12, 0x7f, v0 -; GFX11-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX11-NEXT: v_not_b32_e32 v0, v0 ; GFX11-NEXT: s_mov_b32 s8, 0 ; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 ; GFX11-NEXT: s_lshl_b32 s9, s6, 31 @@ -7887,7 +7844,7 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX6-NEXT: v_xor_b32_e32 v16, -1, v16 +; GFX6-NEXT: v_not_b32_e32 v16, v16 ; GFX6-NEXT: v_and_b32_e32 v24, 0x7f, v16 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v23 ; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v23 @@ -7925,7 +7882,7 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX6-NEXT: v_or_b32_e32 v0, v18, v0 ; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX6-NEXT: v_xor_b32_e32 v8, -1, v20 +; GFX6-NEXT: v_not_b32_e32 v8, v20 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc ; GFX6-NEXT: v_or_b32_e32 v1, v19, v1 @@ -7978,7 +7935,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v16 +; GFX8-NEXT: v_not_b32_e32 v16, v16 ; GFX8-NEXT: v_and_b32_e32 v24, 0x7f, v16 ; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v23 ; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v23 @@ -8016,7 +7973,7 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX8-NEXT: v_or_b32_e32 v0, v18, v0 ; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX8-NEXT: v_xor_b32_e32 v8, -1, v20 +; GFX8-NEXT: v_not_b32_e32 v8, v20 ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc ; GFX8-NEXT: v_or_b32_e32 v1, v19, v1 @@ -8069,7 +8026,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v16 +; GFX9-NEXT: v_not_b32_e32 v16, v16 ; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16 ; GFX9-NEXT: v_sub_u32_e32 v16, 64, v23 ; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v23 @@ -8106,7 +8063,7 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX9-NEXT: v_or_b32_e32 v0, v18, v0 ; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX9-NEXT: v_xor_b32_e32 v8, -1, v20 +; GFX9-NEXT: v_not_b32_e32 v8, v20 ; GFX9-NEXT: v_or_b32_e32 v1, v19, v1 ; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX9-NEXT: v_sub_u32_e32 v8, 64, v18 @@ -8159,7 +8116,7 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_and_b32_e32 v27, 0x7f, v16 -; GFX10-NEXT: v_xor_b32_e32 v16, -1, v16 +; GFX10-NEXT: v_not_b32_e32 v16, v16 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] ; GFX10-NEXT: v_sub_nc_u32_e32 v17, 64, v27 ; GFX10-NEXT: v_and_b32_e32 v28, 0x7f, v16 @@ -8198,7 +8155,7 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, v0, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, v1, s4 ; GFX10-NEXT: v_or_b32_e32 v0, v21, v3 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v20 +; GFX10-NEXT: v_not_b32_e32 v3, v20 ; GFX10-NEXT: v_or_b32_e32 v1, v22, v8 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 
1, v[12:13] ; GFX10-NEXT: v_sub_nc_u32_e32 v11, 64, v24 @@ -8249,7 +8206,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_and_b32_e32 v27, 0x7f, v16 -; GFX11-NEXT: v_xor_b32_e32 v16, -1, v16 +; GFX11-NEXT: v_not_b32_e32 v16, v16 ; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1] @@ -8292,7 +8249,7 @@ ; GFX11-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v25, 0, v1, s0 ; GFX11-NEXT: v_or_b32_e32 v0, v21, v3 -; GFX11-NEXT: v_xor_b32_e32 v3, -1, v20 +; GFX11-NEXT: v_not_b32_e32 v3, v20 ; GFX11-NEXT: v_or_b32_e32 v1, v22, v8 ; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] ; GFX11-NEXT: v_or_b32_e32 v2, v2, v10 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -353,7 +353,7 @@ ; GFX8-NEXT: s_and_b32 s3, s2, 7 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_lshr_b32 s1, s1, s3 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -365,7 +365,7 @@ ; GFX9-NEXT: s_and_b32 s3, s2, 7 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_lshr_b32 s1, s1, s3 ; GFX9-NEXT: s_or_b32 s0, s0, s1 @@ -377,7 +377,7 @@ ; GFX10-NEXT: s_and_b32 s3, s2, 7 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_lshr_b32 s1, s1, s3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -389,7 +389,7 @@ ; GFX11-NEXT: 
s_and_b32 s3, s2, 7 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 ; GFX11-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_lshl_b32 s0, s0, s2 ; GFX11-NEXT: s_lshr_b32 s1, s1, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -404,7 +404,7 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 @@ -417,7 +417,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 @@ -429,7 +429,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0 @@ -441,7 +441,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX10-NEXT: v_not_b32_e32 v3, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 @@ -455,7 +455,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX11-NEXT: v_not_b32_e32 v3, v2 ; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 @@ -481,7 +481,7 @@ ; GFX8-LABEL: s_fshr_i8_4: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s1, 
s1, 0xff -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, 4 ; GFX8-NEXT: s_lshr_b32 s1, s1, 4 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -490,7 +490,7 @@ ; GFX9-LABEL: s_fshr_i8_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshl_b32 s0, s0, 4 ; GFX9-NEXT: s_lshr_b32 s1, s1, 4 ; GFX9-NEXT: s_or_b32 s0, s0, s1 @@ -500,7 +500,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshr_b32 s1, s1, 4 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog @@ -509,7 +509,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_lshl_b32 s0, s0, 4 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b32 s1, s1, 4 ; GFX11-NEXT: s_or_b32 s0, s0, s1 @@ -580,7 +580,7 @@ ; GFX8-LABEL: s_fshr_i8_5: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, 3 ; GFX8-NEXT: s_lshr_b32 s1, s1, 5 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -589,7 +589,7 @@ ; GFX9-LABEL: s_fshr_i8_5: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshl_b32 s0, s0, 3 ; GFX9-NEXT: s_lshr_b32 s1, s1, 5 ; GFX9-NEXT: s_or_b32 s0, s0, s1 @@ -599,7 +599,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, 3 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshr_b32 s1, s1, 5 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog @@ 
-608,7 +608,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_lshl_b32 s0, s0, 3 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b32 s1, s1, 5 ; GFX11-NEXT: s_or_b32 s0, s0, s1 @@ -703,7 +703,7 @@ ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_andn2_b32 s2, 7, s5 ; GFX8-NEXT: s_lshl_b32 s3, s3, 1 ; GFX8-NEXT: s_lshr_b32 s1, s1, s6 @@ -711,13 +711,12 @@ ; GFX8-NEXT: s_and_b32 s3, s4, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s5, 7 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshr_b32 s1, s3, s1 ; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; @@ -731,7 +730,7 @@ ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_andn2_b32 s2, 7, s5 ; GFX9-NEXT: s_lshl_b32 s3, s3, 1 ; GFX9-NEXT: s_lshr_b32 s1, s1, s6 @@ -739,13 +738,12 @@ ; GFX9-NEXT: s_and_b32 s3, s4, 0xff ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s5, 7 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_lshr_b32 s1, s3, s1 ; GFX9-NEXT: s_or_b32 s1, s2, s1 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff -; GFX9-NEXT: s_lshl_b32 s1, s1, s2 +; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: 
; return to shader part epilog ; @@ -763,17 +761,16 @@ ; GFX10-NEXT: s_and_b32 s2, s5, 7 ; GFX10-NEXT: s_andn2_b32 s5, 7, s5 ; GFX10-NEXT: s_lshl_b32 s3, s3, 1 -; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshl_b32 s3, s3, s5 ; GFX10-NEXT: s_lshr_b32 s2, s4, s2 ; GFX10-NEXT: s_lshr_b32 s1, s1, s6 ; GFX10-NEXT: s_or_b32 s2, s3, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_and_b32 s1, s2, 0xff -; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 8 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; @@ -791,17 +788,16 @@ ; GFX11-NEXT: s_and_b32 s2, s5, 7 ; GFX11-NEXT: s_and_not1_b32 s5, 7, s5 ; GFX11-NEXT: s_lshl_b32 s3, s3, 1 -; GFX11-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_lshl_b32 s3, s3, s5 ; GFX11-NEXT: s_lshr_b32 s2, s4, s2 ; GFX11-NEXT: s_lshr_b32 s1, s1, s6 ; GFX11-NEXT: s_or_b32 s2, s3, s2 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_and_b32 s1, s2, 0xff -; GFX11-NEXT: s_bfe_u32 s2, 8, 0x100000 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s1, s2 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -819,7 +815,7 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 8, v2 ; GFX6-NEXT: v_and_b32_e32 v5, 7, v2 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -828,7 +824,7 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; 
GFX6-NEXT: v_and_b32_e32 v2, 7, v4 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX6-NEXT: v_not_b32_e32 v4, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_bfe_u32 v1, v1, 8, 8 @@ -846,14 +842,14 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX8-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v5 +; GFX8-NEXT: v_not_b32_e32 v2, v5 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 7, v5 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 @@ -871,14 +867,14 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0 ; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v5 +; GFX9-NEXT: v_not_b32_e32 v2, v5 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 7, v5 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 @@ -899,8 +895,8 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX10-NEXT: v_and_b32_e32 v7, 7, v2 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 +; GFX10-NEXT: v_not_b32_e32 v2, v2 +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 7, 
v3 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 ; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v5 @@ -927,8 +923,8 @@ ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX11-NEXT: v_and_b32_e32 v7, 7, v2 -; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX11-NEXT: v_xor_b32_e32 v6, -1, v3 +; GFX11-NEXT: v_not_b32_e32 v2, v2 +; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX11-NEXT: v_lshlrev_b16 v4, 1, v4 ; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 @@ -1025,7 +1021,7 @@ ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_andn2_b32 s2, 7, s9 ; GFX8-NEXT: s_lshl_b32 s3, s3, 1 ; GFX8-NEXT: s_lshr_b32 s1, s1, s12 @@ -1033,7 +1029,7 @@ ; GFX8-NEXT: s_and_b32 s3, s6, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s9, 7 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshr_b32 s1, s3, s1 ; GFX8-NEXT: s_andn2_b32 s3, 7, s10 ; GFX8-NEXT: s_lshl_b32 s4, s4, 1 @@ -1041,7 +1037,7 @@ ; GFX8-NEXT: s_and_b32 s4, s7, 0xff ; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: s_and_b32 s2, s10, 7 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshr_b32 s2, s4, s2 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_or_b32 s2, s3, s2 @@ -1078,7 +1074,7 @@ ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_andn2_b32 s2, 7, s9 ; GFX9-NEXT: s_lshl_b32 s3, s3, 1 ; GFX9-NEXT: s_lshr_b32 s1, s1, s12 @@ -1086,7 +1082,7 @@ ; GFX9-NEXT: s_and_b32 s3, s6, 0xff ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s9, 7 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_lshr_b32 s1, s3, s1 ; GFX9-NEXT: s_andn2_b32 
s3, 7, s10 ; GFX9-NEXT: s_lshl_b32 s4, s4, 1 @@ -1094,7 +1090,7 @@ ; GFX9-NEXT: s_and_b32 s4, s7, 0xff ; GFX9-NEXT: s_or_b32 s1, s2, s1 ; GFX9-NEXT: s_and_b32 s2, s10, 7 -; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX9-NEXT: s_lshr_b32 s2, s4, s2 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_or_b32 s2, s3, s2 @@ -1131,12 +1127,12 @@ ; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 ; GFX10-NEXT: s_and_b32 s6, s6, 0xff -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_and_b32 s2, s9, 7 ; GFX10-NEXT: s_andn2_b32 s9, 7, s9 ; GFX10-NEXT: s_lshl_b32 s3, s3, 1 -; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX10-NEXT: s_lshr_b32 s1, s1, s12 ; GFX10-NEXT: s_lshl_b32 s3, s3, s9 ; GFX10-NEXT: s_lshr_b32 s2, s6, s2 @@ -1146,7 +1142,7 @@ ; GFX10-NEXT: s_and_b32 s2, s10, 7 ; GFX10-NEXT: s_andn2_b32 s3, 7, s10 ; GFX10-NEXT: s_lshl_b32 s4, s4, 1 -; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX10-NEXT: s_lshl_b32 s3, s4, s3 ; GFX10-NEXT: s_lshr_b32 s2, s6, s2 ; GFX10-NEXT: s_andn2_b32 s4, 7, s11 @@ -1184,12 +1180,12 @@ ; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-NEXT: s_and_b32 s6, s6, 0xff -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_lshl_b32 s0, s0, s2 ; GFX11-NEXT: s_and_b32 s2, s9, 7 ; GFX11-NEXT: s_and_not1_b32 s9, 7, s9 ; GFX11-NEXT: s_lshl_b32 s3, s3, 1 -; GFX11-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX11-NEXT: s_lshr_b32 s1, s1, s12 ; GFX11-NEXT: s_lshl_b32 s3, s3, s9 ; GFX11-NEXT: s_lshr_b32 s2, s6, s2 @@ -1199,7 +1195,7 @@ ; GFX11-NEXT: s_and_b32 s2, s10, 7 ; GFX11-NEXT: s_and_not1_b32 s3, 7, s10 ; GFX11-NEXT: s_lshl_b32 s4, s4, 1 -; GFX11-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX11-NEXT: 
s_lshl_b32 s3, s4, s3 ; GFX11-NEXT: s_lshr_b32 s2, s6, s2 ; GFX11-NEXT: s_and_not1_b32 s4, 7, s11 @@ -1237,7 +1233,7 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX6-NEXT: v_and_b32_e32 v10, 7, v2 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 24, v0 @@ -1248,13 +1244,13 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v10, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v7 -; GFX6-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX6-NEXT: v_not_b32_e32 v7, v7 ; GFX6-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_bfe_u32 v7, v1, 8, 8 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v7 -; GFX6-NEXT: v_xor_b32_e32 v7, -1, v8 +; GFX6-NEXT: v_not_b32_e32 v7, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 24, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 7, v8 @@ -1264,7 +1260,7 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v3, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v9 +; GFX6-NEXT: v_not_b32_e32 v4, v9 ; GFX6-NEXT: v_and_b32_e32 v3, 7, v9 ; GFX6-NEXT: v_and_b32_e32 v4, 7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5 @@ -1290,7 +1286,7 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX8-NEXT: v_and_b32_e32 v8, 7, v2 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v9, 1, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v9 @@ -1298,7 +1294,7 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v8 ; GFX8-NEXT: v_and_b32_e32 v8, 7, v5 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX8-NEXT: v_and_b32_e32 v5, 7, 
v5 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v3 @@ -1306,17 +1302,17 @@ ; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 7, v6 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v6 +; GFX8-NEXT: v_not_b32_e32 v5, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, 1 -; GFX8-NEXT: v_mov_b32_e32 v9, 0xff ; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX8-NEXT: v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshlrev_b16_e32 v5, v5, v8 -; GFX8-NEXT: v_and_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v8, 0xff +; GFX8-NEXT: v_and_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v8 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX8-NEXT: v_and_b32_e32 v5, 7, v7 -; GFX8-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX8-NEXT: v_not_b32_e32 v7, v7 ; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v7, v0 @@ -1340,7 +1336,7 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX9-NEXT: v_and_b32_e32 v8, 7, v2 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v9, 1, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, v2, v9 @@ -1348,7 +1344,7 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v8 ; GFX9-NEXT: v_and_b32_e32 v8, 7, v5 -; GFX9-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX9-NEXT: v_not_b32_e32 v5, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 1, v3 @@ -1356,24 +1352,23 @@ ; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_and_b32_e32 v4, 7, v6 -; GFX9-NEXT: v_xor_b32_e32 v5, -1, v6 +; GFX9-NEXT: v_not_b32_e32 v5, v6 ; GFX9-NEXT: v_mov_b32_e32 v6, 1 -; GFX9-NEXT: v_mov_b32_e32 v9, 0xff +; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshlrev_b16_e32 v5, v5, v8 -; GFX9-NEXT: v_and_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v8, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_lshrrev_b16_e32 v4, v4, v8 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX9-NEXT: v_and_b32_e32 v5, 7, v7 -; GFX9-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX9-NEXT: v_not_b32_e32 v7, v7 ; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v7, v0 ; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 -; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_or_b32 v1, v2, s4, v1 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -1387,48 +1382,48 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v5 +; GFX10-NEXT: v_not_b32_e32 v8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX10-NEXT: 
v_lshrrev_b32_e32 v11, 24, v2 +; GFX10-NEXT: v_not_b32_e32 v12, v7 ; GFX10-NEXT: v_lshlrev_b16 v3, 1, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX10-NEXT: v_and_b32_e32 v12, 7, v12 ; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 -; GFX10-NEXT: v_mov_b32_e32 v13, 0xff -; GFX10-NEXT: v_xor_b32_e32 v14, -1, v12 -; GFX10-NEXT: v_lshlrev_b16 v3, v10, v3 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v11 +; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX10-NEXT: v_not_b32_e32 v13, v10 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_lshlrev_b16 v3, v12, v3 +; GFX10-NEXT: v_not_b32_e32 v12, v11 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 ; GFX10-NEXT: v_and_b32_e32 v8, 0xff, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX10-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX10-NEXT: v_and_b32_e32 v13, 7, v13 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v13, 7, v14 -; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_e32 v12, 7, v12 +; GFX10-NEXT: v_lshlrev_b16 v5, 1, v5 +; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_lshrrev_b16 v5, v5, v7 -; GFX10-NEXT: v_lshlrev_b16 v4, v10, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v11, v1 -; GFX10-NEXT: v_lshlrev_b16 v6, v13, v6 -; GFX10-NEXT: v_lshrrev_b16 v7, v12, v9 +; GFX10-NEXT: v_lshrrev_b16 v6, v7, v6 +; GFX10-NEXT: 
v_lshlrev_b16 v4, v13, v4 +; GFX10-NEXT: v_lshrrev_b16 v1, v10, v1 +; GFX10-NEXT: v_lshlrev_b16 v5, v12, v5 +; GFX10-NEXT: v_lshrrev_b16 v7, v11, v9 ; GFX10-NEXT: v_lshrrev_b16 v2, v2, v8 -; GFX10-NEXT: v_or_b32_e32 v3, v3, v5 -; GFX10-NEXT: v_mov_b32_e32 v5, 8 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, 8 ; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX10-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX10-NEXT: v_or_b32_e32 v4, v5, v7 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 @@ -1447,18 +1442,18 @@ ; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v13, 24, v2 ; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-NEXT: v_xor_b32_e32 v12, -1, v7 +; GFX11-NEXT: v_not_b32_e32 v12, v7 ; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v12, 7, v12 ; GFX11-NEXT: v_lshlrev_b16 v3, 1, v3 -; GFX11-NEXT: v_xor_b32_e32 v14, -1, v11 +; GFX11-NEXT: v_not_b32_e32 v14, v11 ; GFX11-NEXT: v_lshrrev_b16 v6, v7, v6 -; GFX11-NEXT: v_xor_b32_e32 v7, -1, v13 +; GFX11-NEXT: v_not_b32_e32 v7, v13 ; GFX11-NEXT: v_lshrrev_b32_e32 v9, 24, v1 -; GFX11-NEXT: v_xor_b32_e32 v10, -1, v2 +; GFX11-NEXT: v_not_b32_e32 v10, v2 ; GFX11-NEXT: v_lshlrev_b16 v3, v12, v3 ; GFX11-NEXT: v_and_b32_e32 v11, 7, v11 ; GFX11-NEXT: v_and_b32_e32 v12, 7, v14 @@ -1858,9 +1853,9 @@ ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 ; GFX6-NEXT: s_and_b32 s8, s8, 0xff ; GFX6-NEXT: s_or_b32 s2, s11, s2 -; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX6-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX6-NEXT: s_lshr_b32 s10, s3, 8 -; 
GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 ; GFX6-NEXT: s_and_b32 s3, s3, 0xff ; GFX6-NEXT: s_or_b32 s2, s2, s8 @@ -1868,8 +1863,8 @@ ; GFX6-NEXT: s_and_b32 s8, s10, 0xff ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: s_or_b32 s3, s9, s3 -; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX6-NEXT: s_and_b32 s8, 0xffff, s8 +; GFX6-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s8 ; GFX6-NEXT: s_lshr_b32 s8, s4, 16 @@ -1881,9 +1876,9 @@ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 ; GFX6-NEXT: s_or_b32 s4, s11, s4 -; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX6-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 ; GFX6-NEXT: s_or_b32 s4, s4, s8 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 @@ -1901,9 +1896,9 @@ ; GFX6-NEXT: s_and_b32 s8, s10, 0xff ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: s_or_b32 s5, s9, s5 -; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX6-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 -; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX6-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: s_or_b32 s5, s5, s8 @@ -1911,8 +1906,8 @@ ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: s_and_b32 s6, s6, 0xff -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v0 ; GFX6-NEXT: s_lshl_b32 s4, s6, 17 @@ -1929,8 +1924,8 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 24, 
v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX6-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX6-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 23, v1 ; GFX6-NEXT: s_lshl_b32 s0, s7, 17 @@ -1964,11 +1959,10 @@ ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_lshr_b32 s9, s1, 8 -; GFX8-NEXT: s_bfe_u32 s10, 8, 0x100000 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_lshr_b32 s6, s0, 8 ; GFX8-NEXT: s_lshr_b32 s8, s0, 24 -; GFX8-NEXT: s_lshl_b32 s1, s1, s10 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: s_and_b32 s6, s6, 0xff ; GFX8-NEXT: s_or_b32 s1, s8, s1 @@ -1976,74 +1970,74 @@ ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_lshr_b32 s7, s0, 16 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s6, s6, s10 +; GFX8-NEXT: s_lshl_b32 s6, s6, 8 ; GFX8-NEXT: s_and_b32 s8, s8, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s6 ; GFX8-NEXT: s_and_b32 s6, s7, 0xff ; GFX8-NEXT: s_and_b32 s7, s9, 0xff ; GFX8-NEXT: s_lshr_b32 s9, s2, 16 -; GFX8-NEXT: s_lshr_b32 s11, s2, 24 +; GFX8-NEXT: s_lshr_b32 s10, s2, 24 ; GFX8-NEXT: s_and_b32 s2, s2, 0xff -; GFX8-NEXT: s_lshl_b32 s8, s8, s10 +; GFX8-NEXT: s_lshl_b32 s8, s8, 8 ; GFX8-NEXT: s_or_b32 s2, s2, s8 ; GFX8-NEXT: s_and_b32 s8, s9, 0xff ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 -; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX8-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1 -; GFX8-NEXT: s_lshr_b32 s12, s3, 8 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_lshr_b32 s11, s3, 8 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 ; GFX8-NEXT: s_and_b32 s3, s3, 0xff ; GFX8-NEXT: s_or_b32 s2, s2, s8 -; GFX8-NEXT: s_lshl_b32 s3, s3, s10 -; GFX8-NEXT: s_and_b32 s8, s12, 0xff -; GFX8-NEXT: s_or_b32 s3, s11, s3 -; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000 
-; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_lshl_b32 s3, s3, 8 +; GFX8-NEXT: s_and_b32 s8, s11, 0xff +; GFX8-NEXT: s_or_b32 s3, s10, s3 +; GFX8-NEXT: s_and_b32 s8, 0xffff, s8 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 ; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: s_or_b32 s3, s3, s8 ; GFX8-NEXT: s_lshr_b32 s8, s4, 8 ; GFX8-NEXT: s_and_b32 s8, s8, 0xff ; GFX8-NEXT: s_lshr_b32 s9, s4, 16 -; GFX8-NEXT: s_lshr_b32 s11, s4, 24 +; GFX8-NEXT: s_lshr_b32 s10, s4, 24 ; GFX8-NEXT: s_and_b32 s4, s4, 0xff -; GFX8-NEXT: s_lshl_b32 s8, s8, s10 +; GFX8-NEXT: s_lshl_b32 s8, s8, 8 ; GFX8-NEXT: s_or_b32 s4, s4, s8 ; GFX8-NEXT: s_and_b32 s8, s9, 0xff ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 -; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX8-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 ; GFX8-NEXT: s_or_b32 s4, s4, s8 ; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX8-NEXT: s_lshr_b32 s12, s5, 8 +; GFX8-NEXT: s_lshr_b32 s11, s5, 8 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX8-NEXT: s_and_b32 s5, s5, 0xff ; GFX8-NEXT: v_mul_lo_u32 v1, v2, v1 -; GFX8-NEXT: s_lshl_b32 s5, s5, s10 +; GFX8-NEXT: s_lshl_b32 s5, s5, 8 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX8-NEXT: s_and_b32 s8, s12, 0xff +; GFX8-NEXT: s_and_b32 s8, s11, 0xff ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: s_or_b32 s5, s11, s5 -; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX8-NEXT: s_or_b32 s5, s10, s5 +; GFX8-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 -; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX8-NEXT: 
s_lshl_b32 s8, s8, 16 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: s_or_b32 s5, s5, s8 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX8-NEXT: s_lshl_b32 s4, s6, 17 @@ -2060,8 +2054,8 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 23, v1 ; GFX8-NEXT: s_lshl_b32 s0, s7, 17 @@ -2093,45 +2087,44 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX9-NEXT: s_lshr_b32 s11, s1, 8 -; GFX9-NEXT: s_bfe_u32 s12, 8, 0x100000 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_lshr_b32 s7, s0, 8 ; GFX9-NEXT: s_lshr_b32 s10, s0, 24 +; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: v_mul_lo_u32 v2, v0, v1 -; GFX9-NEXT: s_lshl_b32 s1, s1, s12 ; GFX9-NEXT: s_and_b32 s7, s7, 0xff ; GFX9-NEXT: s_or_b32 s1, s10, s1 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: s_lshr_b32 s10, s2, 8 +; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: s_lshr_b32 s9, s0, 16 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s7, s12 +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 ; GFX9-NEXT: s_and_b32 s10, s10, 0xff ; GFX9-NEXT: s_or_b32 s0, s0, s7 ; GFX9-NEXT: s_and_b32 s7, s9, 0xff ; GFX9-NEXT: s_and_b32 s9, s11, 0xff ; GFX9-NEXT: s_lshr_b32 s11, s2, 16 -; GFX9-NEXT: s_lshr_b32 s13, s2, 
24 +; GFX9-NEXT: s_lshr_b32 s12, s2, 24 ; GFX9-NEXT: s_and_b32 s2, s2, 0xff -; GFX9-NEXT: s_lshl_b32 s10, s10, s12 +; GFX9-NEXT: s_lshl_b32 s10, s10, 8 ; GFX9-NEXT: s_or_b32 s2, s2, s10 ; GFX9-NEXT: s_and_b32 s10, s11, 0xff ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 -; GFX9-NEXT: s_bfe_u32 s10, s10, 0x100000 +; GFX9-NEXT: s_and_b32 s10, 0xffff, s10 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: s_lshr_b32 s14, s3, 8 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX9-NEXT: s_lshr_b32 s13, s3, 8 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 ; GFX9-NEXT: s_and_b32 s3, s3, 0xff ; GFX9-NEXT: s_or_b32 s2, s2, s10 -; GFX9-NEXT: s_lshl_b32 s3, s3, s12 -; GFX9-NEXT: s_and_b32 s10, s14, 0xff -; GFX9-NEXT: s_or_b32 s3, s13, s3 -; GFX9-NEXT: s_bfe_u32 s10, s10, 0x100000 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX9-NEXT: s_lshl_b32 s3, s3, 8 +; GFX9-NEXT: s_and_b32 s10, s13, 0xff +; GFX9-NEXT: s_or_b32 s3, s12, s3 +; GFX9-NEXT: s_and_b32 s10, 0xffff, s10 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: s_or_b32 s3, s3, s10 @@ -2139,25 +2132,25 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: s_and_b32 s10, s10, 0xff ; GFX9-NEXT: s_lshr_b32 s11, s4, 16 -; GFX9-NEXT: s_lshr_b32 s13, s4, 24 +; GFX9-NEXT: s_lshr_b32 s12, s4, 24 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff -; GFX9-NEXT: s_lshl_b32 s10, s10, s12 +; GFX9-NEXT: s_lshl_b32 s10, s10, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s10 ; GFX9-NEXT: s_and_b32 s10, s11, 0xff -; GFX9-NEXT: s_bfe_u32 s10, s10, 0x100000 +; GFX9-NEXT: s_and_b32 s10, 0xffff, s10 ; GFX9-NEXT: v_mul_lo_u32 v1, v2, v1 -; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 ; GFX9-NEXT: s_or_b32 s4, s4, s10 ; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX9-NEXT: s_lshr_b32 s14, s5, 8 +; GFX9-NEXT: s_lshr_b32 s13, s5, 8 ; GFX9-NEXT: s_and_b32 s5, s5, 0xff ; 
GFX9-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX9-NEXT: s_lshl_b32 s5, s5, s12 -; GFX9-NEXT: s_and_b32 s10, s14, 0xff -; GFX9-NEXT: s_or_b32 s5, s13, s5 -; GFX9-NEXT: s_bfe_u32 s10, s10, 0x100000 -; GFX9-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX9-NEXT: s_lshl_b32 s5, s5, 8 +; GFX9-NEXT: s_and_b32 s10, s13, 0xff +; GFX9-NEXT: s_or_b32 s5, s12, s5 +; GFX9-NEXT: s_and_b32 s10, 0xffff, s10 +; GFX9-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX9-NEXT: s_or_b32 s5, s5, s10 @@ -2170,8 +2163,8 @@ ; GFX9-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_sub_u32_e32 v3, 23, v0 ; GFX9-NEXT: s_lshl_b32 s4, s7, 17 @@ -2187,8 +2180,8 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_bfe_u32 s9, s9, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_and_b32 s9, 0xffff, s9 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_sub_u32_e32 v2, 23, v1 ; GFX9-NEXT: s_lshl_b32 s0, s9, 17 @@ -2219,105 +2212,104 @@ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 ; GFX10-NEXT: s_lshr_b32 s9, s1, 8 -; GFX10-NEXT: s_bfe_u32 s10, 8, 0x100000 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_lshr_b32 s6, s0, 8 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX10-NEXT: s_lshr_b32 s6, s0, 8 ; GFX10-NEXT: s_lshr_b32 s8, s0, 24 -; GFX10-NEXT: s_lshl_b32 s1, s1, s10 +; GFX10-NEXT: s_lshl_b32 s1, s1, 8 ; GFX10-NEXT: s_and_b32 s6, s6, 0xff ; GFX10-NEXT: s_or_b32 s1, s8, s1 ; GFX10-NEXT: s_lshr_b32 s8, s4, 8 ; GFX10-NEXT: s_lshr_b32 
s7, s0, 16 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff +; GFX10-NEXT: s_lshl_b32 s6, s6, 8 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GFX10-NEXT: s_lshl_b32 s6, s6, s10 ; GFX10-NEXT: s_and_b32 s8, s8, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s6 +; GFX10-NEXT: s_and_b32 s6, s7, 0xff ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: s_and_b32 s6, s7, 0xff ; GFX10-NEXT: s_and_b32 s7, s9, 0xff ; GFX10-NEXT: s_lshr_b32 s9, s4, 16 +; GFX10-NEXT: s_lshr_b32 s10, s4, 24 ; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 -; GFX10-NEXT: s_lshr_b32 s11, s4, 24 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff -; GFX10-NEXT: s_lshl_b32 s8, s8, s10 -; GFX10-NEXT: s_lshr_b32 s12, s5, 8 +; GFX10-NEXT: s_lshl_b32 s8, s8, 8 +; GFX10-NEXT: s_lshr_b32 s11, s5, 8 ; GFX10-NEXT: s_or_b32 s4, s4, s8 ; GFX10-NEXT: s_and_b32 s8, s9, 0xff +; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX10-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX10-NEXT: s_and_b32 s5, s5, 0xff ; GFX10-NEXT: s_lshl_b32 s8, s8, 16 -; GFX10-NEXT: s_lshl_b32 s5, s5, s10 +; GFX10-NEXT: s_lshl_b32 s5, s5, 8 ; GFX10-NEXT: s_or_b32 s4, s4, s8 +; GFX10-NEXT: s_and_b32 s8, s11, 0xff ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX10-NEXT: s_and_b32 s8, s12, 0xff -; GFX10-NEXT: s_or_b32 s5, s11, s5 -; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX10-NEXT: s_or_b32 s5, s10, s5 +; GFX10-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX10-NEXT: s_lshl_b32 s8, s8, 16 ; GFX10-NEXT: s_lshr_b32 s9, s2, 8 ; GFX10-NEXT: s_or_b32 s5, s5, s8 ; GFX10-NEXT: s_lshr_b32 s8, s2, 16 ; GFX10-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX10-NEXT: s_and_b32 
s9, s9, 0xff +; GFX10-NEXT: s_lshr_b32 s10, s2, 24 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX10-NEXT: s_lshr_b32 s11, s2, 24 -; GFX10-NEXT: s_lshr_b32 s12, s3, 8 +; GFX10-NEXT: s_lshr_b32 s11, s3, 8 ; GFX10-NEXT: s_and_b32 s2, s2, 0xff -; GFX10-NEXT: s_lshl_b32 s9, s9, s10 +; GFX10-NEXT: s_lshl_b32 s9, s9, 8 ; GFX10-NEXT: s_and_b32 s8, s8, 0xff ; GFX10-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX10-NEXT: s_and_b32 s3, s3, 0xff -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0 ; GFX10-NEXT: s_or_b32 s2, s2, s9 -; GFX10-NEXT: s_bfe_u32 s4, s8, 0x100000 -; GFX10-NEXT: s_lshl_b32 s3, s3, s10 -; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s8 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_lshl_b32 s4, s4, 16 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s5, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_and_b32 s5, s12, 0xff -; GFX10-NEXT: s_lshl_b32 s4, s4, 16 -; GFX10-NEXT: s_or_b32 s3, s11, s3 -; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX10-NEXT: s_and_b32 s5, s11, 0xff +; GFX10-NEXT: s_or_b32 s3, s10, s3 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX10-NEXT: s_lshl_b32 s5, s5, 16 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 ; GFX10-NEXT: s_or_b32 s2, s2, s4 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX10-NEXT: s_or_b32 s3, s3, s5 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 ; 
GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX10-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX10-NEXT: s_lshl_b32 s4, s6, 17 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_or_b32 s0, s4, s0 ; GFX10-NEXT: s_lshl_b32 s1, s1, 1 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 23, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s2 -; GFX10-NEXT: s_or_b32 s0, s4, s0 ; GFX10-NEXT: s_lshl_b32 s2, s7, 17 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX10-NEXT: v_lshrrev_b32_e64 v1, v1, s3 @@ -2344,96 +2336,95 @@ ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 ; GFX11-NEXT: s_lshr_b32 s6, s0, 8 -; GFX11-NEXT: s_bfe_u32 s9, 8, 0x100000 +; GFX11-NEXT: s_lshr_b32 s7, s0, 16 ; GFX11-NEXT: s_and_b32 s6, s6, 0xff ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX11-NEXT: s_lshr_b32 s7, s0, 16 ; GFX11-NEXT: s_lshr_b32 s8, s0, 24 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_lshl_b32 s6, s6, s9 -; GFX11-NEXT: s_lshr_b32 s10, s1, 8 +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-NEXT: s_lshr_b32 s9, s1, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s6 ; GFX11-NEXT: s_and_b32 s6, s7, 0xff -; GFX11-NEXT: s_and_b32 s7, s10, 0xff +; GFX11-NEXT: s_and_b32 s7, s9, 0xff +; GFX11-NEXT: s_lshr_b32 s9, s4, 8 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_dual_mul_f32 v0, 0x4f7ffffe, v0 :: v_dual_mul_f32 v1, 0x4f7ffffe, v1 -; GFX11-NEXT: s_lshr_b32 s10, s4, 8 -; GFX11-NEXT: s_lshr_b32 s11, s4, 16 -; GFX11-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-NEXT: s_lshr_b32 s10, s4, 16 +; GFX11-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-NEXT: s_and_b32 s11, s4, 
0xff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX11-NEXT: s_and_b32 s12, s4, 0xff -; GFX11-NEXT: s_lshl_b32 s10, s10, s9 -; GFX11-NEXT: s_and_b32 s11, s11, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-NEXT: s_or_b32 s9, s11, s9 ; GFX11-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 -; GFX11-NEXT: s_or_b32 s10, s12, s10 ; GFX11-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 -; GFX11-NEXT: s_bfe_u32 s11, s11, 0x100000 -; GFX11-NEXT: s_bfe_u32 s10, s10, 0x100000 -; GFX11-NEXT: s_lshl_b32 s11, s11, 16 -; GFX11-NEXT: s_lshr_b32 s12, s5, 8 -; GFX11-NEXT: s_or_b32 s10, s10, s11 -; GFX11-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX11-NEXT: s_and_b32 s10, 0xffff, s10 +; GFX11-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX11-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-NEXT: s_lshr_b32 s11, s5, 8 +; GFX11-NEXT: s_or_b32 s9, s9, s10 ; GFX11-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX11-NEXT: s_lshr_b32 s4, s4, 24 -; GFX11-NEXT: s_lshl_b32 s5, s5, s9 -; GFX11-NEXT: s_and_b32 s11, s12, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-NEXT: s_and_b32 s10, s11, 0xff ; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: s_bfe_u32 s5, s11, 0x100000 -; GFX11-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s10 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 ; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX11-NEXT: v_mul_hi_u32 v2, v1, v3 -; GFX11-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: v_mul_hi_u32 v0, s10, v0 -; GFX11-NEXT: s_lshl_b32 s1, s1, s9 -; GFX11-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_lshr_b32 s10, s2, 16 +; GFX11-NEXT: v_mul_hi_u32 v0, s9, v0 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_lshr_b32 s5, s2, 24 ; GFX11-NEXT: s_or_b32 s1, s8, s1 ; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; 
GFX11-NEXT: s_lshr_b32 s8, s2, 8 -; GFX11-NEXT: s_lshr_b32 s5, s2, 24 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff ; GFX11-NEXT: s_and_b32 s8, s8, 0xff ; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX11-NEXT: v_mul_hi_u32 v1, s4, v1 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_lshl_b32 s8, s8, s9 -; GFX11-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX11-NEXT: s_lshl_b32 s8, s8, 8 +; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX11-NEXT: s_or_b32 s2, s2, s8 -; GFX11-NEXT: s_and_b32 s8, s11, 0xff -; GFX11-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX11-NEXT: v_sub_nc_u32_e32 v0, s10, v0 +; GFX11-NEXT: s_and_b32 s8, s10, 0xff +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_and_b32 s8, 0xffff, s8 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, s9, v0 ; GFX11-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX11-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX11-NEXT: s_lshr_b32 s10, s3, 8 +; GFX11-NEXT: s_lshr_b32 s9, s3, 8 ; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s8, 16 ; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX11-NEXT: s_lshl_b32 s8, s8, 16 -; GFX11-NEXT: s_lshl_b32 s3, s3, s9 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s8 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, s4, v1 -; GFX11-NEXT: s_and_b32 s4, s10, 0xff +; GFX11-NEXT: s_and_b32 s4, s9, 0xff ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX11-NEXT: s_or_b32 s2, s2, s8 -; GFX11-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX11-NEXT: s_or_b32 s3, s5, s3 +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX11-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX11-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX11-NEXT: s_lshl_b32 s4, s4, 16 ; GFX11-NEXT: s_lshl_b32 s5, s6, 17 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-NEXT: 
v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX11-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX11-NEXT: s_or_b32 s0, s5, s0 -; GFX11-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX11-NEXT: s_lshl_b32 s1, s1, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 @@ -3250,9 +3241,9 @@ ; GFX6-NEXT: s_and_b32 s3, s2, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s2 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: s_lshl_b32 s0, s0, s2 -; GFX6-NEXT: s_bfe_u32 s2, s3, 0x100000 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s3 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_lshr_b32 s1, s1, s2 ; GFX6-NEXT: s_or_b32 s0, s0, s1 @@ -3262,12 +3253,11 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s3, s2, 15 ; GFX8-NEXT: s_andn2_b32 s2, 15, s2 -; GFX8-NEXT: s_bfe_u32 s4, 1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s4 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s3, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s3 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -3276,12 +3266,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s3, s2, 15 ; GFX9-NEXT: s_andn2_b32 s2, 15, s2 -; GFX9-NEXT: s_bfe_u32 s4, 1, 0x100000 -; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_lshl_b32 s0, 
s0, s2 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_bfe_u32 s2, s3, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s3 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog @@ -3289,12 +3278,11 @@ ; GFX10-LABEL: s_fshr_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s3, s2, 15 -; GFX10-NEXT: s_bfe_u32 s4, 1, 0x100000 ; GFX10-NEXT: s_andn2_b32 s2, 15, s2 -; GFX10-NEXT: s_lshl_b32 s0, s0, s4 -; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_lshr_b32 s1, s1, s3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -3303,12 +3291,11 @@ ; GFX11-LABEL: s_fshr_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s3, s2, 15 -; GFX11-NEXT: s_bfe_u32 s4, 1, 0x100000 ; GFX11-NEXT: s_and_not1_b32 s2, 15, s2 -; GFX11-NEXT: s_lshl_b32 s0, s0, s4 -; GFX11-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX11-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX11-NEXT: s_lshl_b32 s0, s0, s2 ; GFX11-NEXT: s_lshr_b32 s1, s1, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -3328,41 +3315,33 @@ ; ; GFX8-LABEL: s_fshr_i16_4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s2, 12, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, 4, 0x100000 -; GFX8-NEXT: s_lshr_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 12 +; GFX8-NEXT: s_lshr_b32 s1, s1, 4 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_i16_4: ; GFX9: 
; %bb.0: -; GFX9-NEXT: s_bfe_u32 s2, 12, 0x100000 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_bfe_u32 s2, 4, 0x100000 -; GFX9-NEXT: s_lshr_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, 12 +; GFX9-NEXT: s_lshr_b32 s1, s1, 4 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_i16_4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_u32 s2, 12, 0x100000 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s3, 4, 0x100000 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_lshr_b32 s1, s1, s3 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_lshl_b32 s0, s0, 12 +; GFX10-NEXT: s_lshr_b32 s1, s1, 4 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_i16_4: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_bfe_u32 s2, 12, 0x100000 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX11-NEXT: s_bfe_u32 s3, 4, 0x100000 -; GFX11-NEXT: s_lshl_b32 s0, s0, s2 -; GFX11-NEXT: s_lshr_b32 s1, s1, s3 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_lshl_b32 s0, s0, 12 +; GFX11-NEXT: s_lshr_b32 s1, s1, 4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -3380,41 +3359,33 @@ ; ; GFX8-LABEL: s_fshr_i16_5: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s2, 11, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, 5, 0x100000 -; GFX8-NEXT: s_lshr_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 11 +; GFX8-NEXT: s_lshr_b32 s1, s1, 5 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_i16_5: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_bfe_u32 s2, 11, 0x100000 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_bfe_u32 s2, 5, 0x100000 -; GFX9-NEXT: s_lshr_b32 
s1, s1, s2 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, 11 +; GFX9-NEXT: s_lshr_b32 s1, s1, 5 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_i16_5: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_u32 s2, 11, 0x100000 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s3, 5, 0x100000 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_lshr_b32 s1, s1, s3 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_lshl_b32 s0, s0, 11 +; GFX10-NEXT: s_lshr_b32 s1, s1, 5 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_i16_5: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_bfe_u32 s2, 11, 0x100000 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX11-NEXT: s_bfe_u32 s3, 5, 0x100000 -; GFX11-NEXT: s_lshl_b32 s0, s0, s2 -; GFX11-NEXT: s_lshr_b32 s1, s1, s3 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_lshl_b32 s0, s0, 11 +; GFX11-NEXT: s_lshr_b32 s1, s1, 5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -3430,9 +3401,9 @@ ; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_bfe_u32 v2, v3, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3594,9 +3565,9 @@ ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: s_and_b32 s0, s1, 0xffff ; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3606,9 +3577,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: s_bfe_u32 s2, 1, 0x100000 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 ; GFX8-NEXT: v_lshlrev_b16_e64 v0, v0, s0 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3618,9 +3588,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX9-NEXT: s_bfe_u32 s2, 1, 0x100000 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 ; GFX9-NEXT: v_lshlrev_b16_e64 v0, v0, s0 ; GFX9-NEXT: v_lshrrev_b16_e64 v1, v1, s1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3630,8 +3599,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX10-NEXT: s_bfe_u32 s2, 1, 0x100000 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX10-NEXT: v_lshrrev_b16 v0, v0, s1 ; GFX10-NEXT: v_lshlrev_b16 v1, v1, s0 @@ -3642,14 +3610,12 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: v_xor_b32_e32 v1, -1, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX11-NEXT: s_bfe_u32 s2, 1, 0x100000 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_lshl_b32 s0, s0, s2 -; GFX11-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX11-NEXT: v_lshrrev_b16 v0, v0, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b16 v1, v1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-NEXT: ; return to shader part epilog %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 
%amt) @@ -3663,9 +3629,9 @@ ; GFX6-NEXT: s_and_b32 s2, s1, 15 ; GFX6-NEXT: s_andn2_b32 s1, 15, s1 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: s_lshl_b32 s0, s0, s1 -; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s2 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s1, v0 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 @@ -3675,9 +3641,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s2, s1, 15 ; GFX8-NEXT: s_andn2_b32 s1, 15, s1 -; GFX8-NEXT: s_bfe_u32 s3, 1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s3 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, s2, v0 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 @@ -3687,9 +3652,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s2, s1, 15 ; GFX9-NEXT: s_andn2_b32 s1, 15, s1 -; GFX9-NEXT: s_bfe_u32 s3, 1, 0x100000 -; GFX9-NEXT: s_lshl_b32 s0, s0, s3 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshl_b32 s0, s0, s1 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, s2, v0 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 @@ -3698,11 +3662,10 @@ ; GFX10-LABEL: v_fshr_i16_svs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s2, s1, 15 -; GFX10-NEXT: s_bfe_u32 s3, 1, 0x100000 ; GFX10-NEXT: s_andn2_b32 s1, 15, s1 ; GFX10-NEXT: v_lshrrev_b16 v0, s2, v0 -; GFX10-NEXT: s_lshl_b32 s0, s0, s3 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshl_b32 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -3710,11 +3673,10 @@ ; GFX11-LABEL: v_fshr_i16_svs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s2, s1, 15 -; GFX11-NEXT: s_bfe_u32 s3, 1, 0x100000 ; GFX11-NEXT: s_and_not1_b32 s1, 15, s1 ; GFX11-NEXT: v_lshrrev_b16 v0, s2, 
v0 -; GFX11-NEXT: s_lshl_b32 s0, s0, s3 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s0, s0, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -3731,9 +3693,9 @@ ; GFX6-NEXT: s_and_b32 s2, s1, 15 ; GFX6-NEXT: s_andn2_b32 s1, 15, s1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s1, v0 -; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s2 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshr_b32 s0, s0, s1 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 @@ -3745,8 +3707,8 @@ ; GFX8-NEXT: s_andn2_b32 s1, 15, s1 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, s1, v0 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x100000 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s2 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: ; return to shader part epilog @@ -3757,8 +3719,8 @@ ; GFX9-NEXT: s_andn2_b32 s1, 15, s1 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, s1, v0 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x100000 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s2 ; GFX9-NEXT: s_lshr_b32 s0, s0, s1 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -3768,8 +3730,8 @@ ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 ; GFX10-NEXT: s_andn2_b32 s2, 15, s1 ; GFX10-NEXT: s_and_b32 s1, s1, 15 -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: v_lshlrev_b16 v0, s2, v0 ; GFX10-NEXT: s_lshr_b32 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 
@@ -3780,8 +3742,8 @@ ; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 ; GFX11-NEXT: s_and_not1_b32 s2, 15, s1 ; GFX11-NEXT: s_and_b32 s1, s1, 15 -; GFX11-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: v_lshlrev_b16 v0, s2, v0 ; GFX11-NEXT: s_lshr_b32 s0, s0, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -3798,79 +3760,75 @@ ; GFX6-NEXT: s_lshl_b32 s5, s5, 16 ; GFX6-NEXT: s_and_b32 s4, s4, 0xffff ; GFX6-NEXT: s_or_b32 s4, s5, s4 -; GFX6-NEXT: s_bfe_u32 s5, 1, 0x100000 -; GFX6-NEXT: s_lshl_b32 s0, s0, s5 -; GFX6-NEXT: s_bfe_u32 s6, s2, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s7, 14, 0x100000 -; GFX6-NEXT: s_lshl_b32 s1, s1, s5 +; GFX6-NEXT: s_bfe_u32 s5, s2, 0xf0001 +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: s_lshr_b32 s5, s5, 14 +; GFX6-NEXT: s_or_b32 s0, s0, s5 ; GFX6-NEXT: s_bfe_u32 s5, s3, 0xf0001 -; GFX6-NEXT: s_lshr_b32 s6, s6, s7 -; GFX6-NEXT: s_lshr_b32 s5, s5, s7 +; GFX6-NEXT: s_lshl_b32 s1, s1, 1 +; GFX6-NEXT: s_lshr_b32 s5, s5, 14 ; GFX6-NEXT: s_xor_b32 s4, s4, -1 -; GFX6-NEXT: s_or_b32 s0, s0, s6 ; GFX6-NEXT: s_or_b32 s1, s1, s5 ; GFX6-NEXT: s_lshl_b32 s2, s2, 1 ; GFX6-NEXT: s_lshr_b32 s5, s4, 16 ; GFX6-NEXT: s_and_b32 s6, s4, 15 ; GFX6-NEXT: s_andn2_b32 s4, 15, s4 -; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_lshl_b32 s0, s0, s6 ; GFX6-NEXT: s_lshr_b32 s2, s2, s4 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s5, 15 ; GFX6-NEXT: s_lshl_b32 s3, s3, 1 ; GFX6-NEXT: s_andn2_b32 s4, 15, s5 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, s2 ; GFX6-NEXT: s_bfe_u32 s2, s3, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s3, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s3, 0xffff, s4 ; GFX6-NEXT: s_lshr_b32 
s2, s2, s3 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s5, 1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s6, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s7, 15, 0x100000 +; GFX8-NEXT: s_and_b32 s5, 0xffff, s1 ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s1, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, s5 -; GFX8-NEXT: s_lshr_b32 s6, s6, s7 -; GFX8-NEXT: s_or_b32 s0, s0, s6 -; GFX8-NEXT: s_lshl_b32 s3, s3, s5 -; GFX8-NEXT: s_lshr_b32 s6, s4, s7 -; GFX8-NEXT: s_lshl_b32 s1, s1, s5 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_lshr_b32 s5, s5, 15 +; GFX8-NEXT: s_or_b32 s0, s0, s5 +; GFX8-NEXT: s_lshl_b32 s3, s3, 1 +; GFX8-NEXT: s_lshr_b32 s5, s4, 15 +; GFX8-NEXT: s_lshl_b32 s1, s1, 1 ; GFX8-NEXT: s_xor_b32 s2, s2, -1 -; GFX8-NEXT: s_or_b32 s3, s3, s6 -; GFX8-NEXT: s_lshr_b32 s6, s2, 16 -; GFX8-NEXT: s_and_b32 s7, s2, 15 +; GFX8-NEXT: s_or_b32 s3, s3, s5 +; GFX8-NEXT: s_lshr_b32 s5, s2, 16 +; GFX8-NEXT: s_and_b32 s6, s2, 15 ; GFX8-NEXT: s_andn2_b32 s2, 15, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX8-NEXT: s_lshr_b32 s1, s1, s5 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s6 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s6, 15 -; GFX8-NEXT: s_lshl_b32 s4, s4, s5 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_andn2_b32 s2, 15, s6 +; GFX8-NEXT: s_and_b32 s1, s5, 15 +; GFX8-NEXT: s_lshl_b32 s4, s4, 1 +; GFX8-NEXT: s_and_b32 s1, 
0xffff, s1 +; GFX8-NEXT: s_andn2_b32 s2, 15, s5 ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_bfe_u32 s3, s4, 0x100000 -; GFX8-NEXT: s_lshr_b32 s3, s3, s5 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s4 +; GFX8-NEXT: s_lshr_b32 s3, s3, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshr_b32 s2, s3, s2 ; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -3953,15 +3911,13 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000 ; GFX6-NEXT: v_bfe_u32 v5, v2, 1, 15 -; GFX6-NEXT: s_bfe_u32 s5, 14, 0x100000 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, s5, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 14, v5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX6-NEXT: v_bfe_u32 v5, v3, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, s5, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 14, v5 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v4 @@ -3969,9 +3925,9 @@ ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 15 -; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 @@ -3979,10 +3935,10 @@ ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX6-NEXT: 
v_lshlrev_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 15 -; GFX6-NEXT: v_bfe_u32 v3, v4, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4016,8 +3972,8 @@ ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v4, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4068,17 +4024,13 @@ ; GFX6-LABEL: v_fshr_v2i16_4_8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s4, 12, 0x100000 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 15 -; GFX6-NEXT: s_bfe_u32 s4, 3, 0x100000 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2 -; GFX6-NEXT: s_bfe_u32 s4, 8, 0x100000 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 12, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 3, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 15 -; GFX6-NEXT: s_bfe_u32 s4, 7, 0x100000 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 7, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -4093,8 +4045,8 @@ ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; 
GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4136,79 +4088,75 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000 -; GFX6-NEXT: s_bfe_u32 s5, s2, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s6, 14, 0x100000 +; GFX6-NEXT: s_bfe_u32 s4, s2, 0xf0001 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX6-NEXT: s_lshl_b32 s0, s0, s4 -; GFX6-NEXT: s_lshr_b32 s5, s5, s6 +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: s_lshr_b32 s4, s4, 14 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX6-NEXT: s_or_b32 s0, s0, s5 +; GFX6-NEXT: s_or_b32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s2, s2, 1 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 ; GFX6-NEXT: s_bfe_u32 s0, s2, 0xf0001 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 -; GFX6-NEXT: s_lshl_b32 s1, s1, s4 ; GFX6-NEXT: s_bfe_u32 s4, s3, 0xf0001 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v1 ; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX6-NEXT: s_lshr_b32 s4, s4, s6 +; GFX6-NEXT: s_lshl_b32 s1, s1, 1 +; GFX6-NEXT: s_lshr_b32 s4, s4, 14 ; GFX6-NEXT: s_lshl_b32 s3, s3, 1 ; GFX6-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX6-NEXT: s_or_b32 s1, s1, s4 -; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: s_bfe_u32 s0, s3, 0xf0001 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshl_b32_e32 v2, s1, v2 
; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshr_v2i16_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s4, 1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s6, 15, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s1 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, s4 -; GFX8-NEXT: s_lshr_b32 s5, s5, s6 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_lshr_b32 s4, s4, 15 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s5 -; GFX8-NEXT: s_lshl_b32 s1, s1, s4 +; GFX8-NEXT: s_or_b32 s0, s0, s4 +; GFX8-NEXT: s_lshl_b32 s1, s1, 1 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s0 -; GFX8-NEXT: s_bfe_u32 s0, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s1 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX8-NEXT: s_lshr_b32 s0, s0, s4 -; GFX8-NEXT: s_lshr_b32 s5, s3, s6 -; GFX8-NEXT: s_lshl_b32 s3, s3, s4 +; GFX8-NEXT: s_lshr_b32 s0, s0, 1 +; GFX8-NEXT: s_lshr_b32 s4, s3, 15 +; GFX8-NEXT: s_lshl_b32 s3, s3, 1 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 -; GFX8-NEXT: s_lshl_b32 s2, s2, s4 +; GFX8-NEXT: s_lshl_b32 s2, s2, 1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v1 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX8-NEXT: s_bfe_u32 s0, s3, 0x100000 -; GFX8-NEXT: s_or_b32 s2, s2, s5 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s3 +; GFX8-NEXT: s_or_b32 s2, s2, s4 ; GFX8-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX8-NEXT: s_lshr_b32 s0, s0, s4 +; GFX8-NEXT: s_lshr_b32 s0, s0, 1 ; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s2 ; GFX8-NEXT: 
v_lshrrev_b16_e64 v1, v1, s0 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog ; @@ -4262,18 +4210,16 @@ define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <2 x i16> inreg %amt) { ; GFX6-LABEL: v_fshr_v2i16_svs: ; GFX6: ; %bb.0: +; GFX6-NEXT: v_bfe_u32 v2, v0, 1, 15 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_and_b32 s2, s2, 0xffff -; GFX6-NEXT: s_or_b32 s2, s3, s2 -; GFX6-NEXT: s_bfe_u32 s3, 1, 0x100000 -; GFX6-NEXT: v_bfe_u32 v2, v0, 1, 15 -; GFX6-NEXT: s_bfe_u32 s4, 14, 0x100000 -; GFX6-NEXT: s_lshl_b32 s0, s0, s3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2 +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 14, v2 ; GFX6-NEXT: v_bfe_u32 v3, v1, 1, 15 +; GFX6-NEXT: s_or_b32 s2, s3, s2 ; GFX6-NEXT: v_or_b32_e32 v2, s0, v2 -; GFX6-NEXT: s_lshl_b32 s0, s1, s3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, s4, v3 +; GFX6-NEXT: s_lshl_b32 s0, s1, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 14, v3 ; GFX6-NEXT: v_or_b32_e32 v3, s0, v3 ; GFX6-NEXT: s_xor_b32 s0, s2, -1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -4281,35 +4227,34 @@ ; GFX6-NEXT: s_and_b32 s2, s0, 15 ; GFX6-NEXT: s_andn2_b32 s0, 15, s0 ; GFX6-NEXT: v_bfe_u32 v0, v0, 1, 15 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s1, 15 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, s2, v2 ; GFX6-NEXT: s_andn2_b32 s1, 15, s1 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; 
GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, s0, v3 ; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 15 -; GFX6-NEXT: s_bfe_u32 s0, s1, 0x100000 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshr_v2i16_svs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s3, 1, 0x100000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, s3 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 15, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 15 ; GFX8-NEXT: v_or_b32_e32 v1, s0, v1 -; GFX8-NEXT: s_lshl_b32 s0, s2, s3 +; GFX8-NEXT: s_lshl_b32 s0, s2, 1 ; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v2, s0, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v0 @@ -4326,11 +4271,11 @@ ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 1, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, s0, v2 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, s1, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v1, s2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, s2, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog ; @@ -4397,77 +4342,73 @@ ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_and_b32 s2, s2, 0xffff ; GFX6-NEXT: s_or_b32 s2, s3, s2 -; GFX6-NEXT: s_bfe_u32 s3, 1, 0x100000 -; 
GFX6-NEXT: v_lshlrev_b32_e32 v0, s3, v0 -; GFX6-NEXT: s_bfe_u32 s4, s0, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s5, 14, 0x100000 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, s3, v1 +; GFX6-NEXT: s_bfe_u32 s3, s0, 0xf0001 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: s_lshr_b32 s3, s3, 14 +; GFX6-NEXT: v_or_b32_e32 v0, s3, v0 ; GFX6-NEXT: s_bfe_u32 s3, s1, 0xf0001 -; GFX6-NEXT: s_lshr_b32 s4, s4, s5 -; GFX6-NEXT: s_lshr_b32 s3, s3, s5 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: s_lshr_b32 s3, s3, 14 ; GFX6-NEXT: s_xor_b32 s2, s2, -1 -; GFX6-NEXT: v_or_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s3, v1 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: s_lshr_b32 s3, s2, 16 ; GFX6-NEXT: s_and_b32 s4, s2, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s2 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX6-NEXT: s_lshr_b32 s0, s0, s2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s3, 15 ; GFX6-NEXT: s_lshl_b32 s1, s1, 1 ; GFX6-NEXT: s_andn2_b32 s2, 15, s3 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1 ; GFX6-NEXT: s_bfe_u32 s0, s1, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s2 ; GFX6-NEXT: s_lshr_b32 s0, s0, s1 ; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshr_v2i16_vss: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s3, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, 15, 0x100000 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s0 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: 
v_lshlrev_b16_e32 v1, 1, v0 -; GFX8-NEXT: s_lshr_b32 s3, s3, s4 +; GFX8-NEXT: s_lshr_b32 s3, s3, 15 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-NEXT: v_or_b32_e32 v1, s3, v1 ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: s_lshr_b32 s3, s2, s4 -; GFX8-NEXT: v_or_b32_e32 v0, s3, v0 -; GFX8-NEXT: s_bfe_u32 s3, 1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s3 +; GFX8-NEXT: s_lshr_b32 s3, s2, 15 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 ; GFX8-NEXT: s_xor_b32 s1, s1, -1 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 -; GFX8-NEXT: s_and_b32 s5, s1, 15 +; GFX8-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_and_b32 s4, s1, 15 ; GFX8-NEXT: s_andn2_b32 s1, 15, s1 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshr_b32 s0, s0, s3 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: v_lshlrev_b16_e32 v1, s5, v1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshr_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, s4, v1 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 -; GFX8-NEXT: s_lshl_b32 s2, s2, s3 +; GFX8-NEXT: s_lshl_b32 s2, s2, 1 ; GFX8-NEXT: v_or_b32_e32 v1, s0, v1 -; GFX8-NEXT: s_and_b32 s0, s4, 15 -; GFX8-NEXT: s_andn2_b32 s1, 15, s4 +; GFX8-NEXT: s_and_b32 s0, s3, 15 +; GFX8-NEXT: s_andn2_b32 s1, 15, s3 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, s0, v0 -; GFX8-NEXT: s_bfe_u32 s0, s2, 0x100000 -; GFX8-NEXT: s_lshr_b32 s0, s0, s3 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s2 +; GFX8-NEXT: s_lshr_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog ; @@ -4530,113 +4471,109 @@ ; GFX6-NEXT: s_lshl_b32 s7, s7, 16 ; GFX6-NEXT: s_or_b32 s6, s6, s7 ; GFX6-NEXT: s_and_b32 s7, s8, 0xffff -; GFX6-NEXT: s_bfe_u32 s8, 1, 0x100000 -; GFX6-NEXT: s_bfe_u32 s9, s3, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s10, 14, 0x100000 -; GFX6-NEXT: s_lshl_b32 s0, s0, s8 -; GFX6-NEXT: s_lshr_b32 s9, s9, s10 -; GFX6-NEXT: s_or_b32 s0, s0, s9 -; GFX6-NEXT: s_bfe_u32 s9, s4, 0xf0001 -; GFX6-NEXT: s_lshl_b32 s1, s1, s8 -; GFX6-NEXT: s_lshr_b32 s9, s9, s10 +; GFX6-NEXT: s_bfe_u32 s8, s3, 0xf0001 +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: s_lshr_b32 s8, s8, 14 +; GFX6-NEXT: s_or_b32 s0, s0, s8 +; GFX6-NEXT: s_bfe_u32 s8, s4, 0xf0001 +; GFX6-NEXT: s_lshl_b32 s1, s1, 1 +; GFX6-NEXT: s_lshr_b32 s8, s8, 14 ; GFX6-NEXT: s_xor_b32 s6, s6, -1 -; GFX6-NEXT: s_or_b32 s1, s1, s9 +; GFX6-NEXT: s_or_b32 s1, s1, s8 ; GFX6-NEXT: s_lshl_b32 s3, s3, 1 -; GFX6-NEXT: s_lshr_b32 s9, s6, 16 -; GFX6-NEXT: s_and_b32 s11, s6, 15 +; GFX6-NEXT: s_lshr_b32 s8, s6, 16 +; GFX6-NEXT: s_and_b32 s9, s6, 15 ; GFX6-NEXT: s_andn2_b32 s6, 15, s6 -; GFX6-NEXT: s_bfe_u32 s11, s11, 0x100000 +; GFX6-NEXT: s_and_b32 s9, 0xffff, s9 ; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX6-NEXT: s_lshl_b32 s0, s0, s11 +; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX6-NEXT: s_lshl_b32 s0, s0, s9 ; GFX6-NEXT: s_lshr_b32 s3, s3, s6 ; GFX6-NEXT: s_or_b32 s0, s0, s3 -; GFX6-NEXT: s_and_b32 s3, s9, 15 +; GFX6-NEXT: s_and_b32 s3, s8, 15 ; GFX6-NEXT: s_lshl_b32 s4, s4, 1 -; GFX6-NEXT: s_andn2_b32 s6, 15, s9 -; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX6-NEXT: s_andn2_b32 s6, 15, s8 +; GFX6-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX6-NEXT: s_lshl_b32 s1, s1, s3 ; GFX6-NEXT: s_bfe_u32 s3, s4, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s4, s6, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s6 ; GFX6-NEXT: s_lshr_b32 s3, s3, s4 ; GFX6-NEXT: s_or_b32 s1, s1, s3 ; GFX6-NEXT: s_bfe_u32 s3, s5, 
0xf0001 -; GFX6-NEXT: s_lshl_b32 s2, s2, s8 -; GFX6-NEXT: s_lshr_b32 s3, s3, s10 +; GFX6-NEXT: s_lshl_b32 s2, s2, 1 +; GFX6-NEXT: s_lshr_b32 s3, s3, 14 ; GFX6-NEXT: s_xor_b32 s4, s7, -1 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s5, 1 ; GFX6-NEXT: s_and_b32 s5, s4, 15 ; GFX6-NEXT: s_andn2_b32 s4, 15, s4 -; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX6-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_lshl_b32 s2, s2, s5 ; GFX6-NEXT: s_lshr_b32 s3, s3, s4 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_v3i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s8, 1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s10, 15, 0x100000 +; GFX8-NEXT: s_and_b32 s8, 0xffff, s2 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 ; GFX8-NEXT: s_lshr_b32 s7, s2, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, s8 -; GFX8-NEXT: s_lshr_b32 s9, s9, s10 -; GFX8-NEXT: s_or_b32 s0, s0, s9 -; GFX8-NEXT: s_lshl_b32 s6, s6, s8 -; GFX8-NEXT: s_lshr_b32 s9, s7, s10 -; GFX8-NEXT: s_lshl_b32 s2, s2, s8 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_lshr_b32 s8, s8, 15 +; GFX8-NEXT: s_or_b32 s0, s0, s8 +; GFX8-NEXT: s_lshl_b32 s6, s6, 1 +; GFX8-NEXT: s_lshr_b32 s8, s7, 15 +; GFX8-NEXT: s_lshl_b32 s2, s2, 1 ; GFX8-NEXT: s_xor_b32 s4, s4, -1 -; GFX8-NEXT: s_or_b32 s6, s6, s9 -; GFX8-NEXT: s_lshr_b32 s9, s4, 16 -; GFX8-NEXT: s_and_b32 s11, s4, 15 +; GFX8-NEXT: s_or_b32 s6, s6, s8 +; GFX8-NEXT: s_lshr_b32 s8, s4, 16 +; GFX8-NEXT: s_and_b32 s9, s4, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s4 -; GFX8-NEXT: s_bfe_u32 s2, s2, 
0x100000 -; GFX8-NEXT: s_bfe_u32 s11, s11, 0x100000 -; GFX8-NEXT: s_lshr_b32 s2, s2, s8 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s11 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX8-NEXT: s_lshr_b32 s2, s2, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_lshl_b32 s0, s0, s9 ; GFX8-NEXT: s_lshr_b32 s2, s2, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s9, 15 -; GFX8-NEXT: s_lshl_b32 s7, s7, s8 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_andn2_b32 s4, 15, s9 +; GFX8-NEXT: s_and_b32 s2, s8, 15 +; GFX8-NEXT: s_lshl_b32 s7, s7, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_andn2_b32 s4, 15, s8 ; GFX8-NEXT: s_lshl_b32 s2, s6, s2 -; GFX8-NEXT: s_bfe_u32 s6, s7, 0x100000 -; GFX8-NEXT: s_lshr_b32 s6, s6, s8 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s7 +; GFX8-NEXT: s_lshr_b32 s6, s6, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshr_b32 s4, s6, s4 ; GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: s_bfe_u32 s4, s3, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s3 ; GFX8-NEXT: s_and_b32 s5, s5, 0xffff -; GFX8-NEXT: s_lshl_b32 s1, s1, s8 -; GFX8-NEXT: s_lshr_b32 s4, s4, s10 +; GFX8-NEXT: s_lshl_b32 s1, s1, 1 +; GFX8-NEXT: s_lshr_b32 s4, s4, 15 ; GFX8-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NEXT: s_lshl_b32 s3, s3, s8 +; GFX8-NEXT: s_lshl_b32 s3, s3, 1 ; GFX8-NEXT: s_xor_b32 s4, s5, -1 ; GFX8-NEXT: s_and_b32 s5, s4, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s4 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX8-NEXT: s_lshr_b32 s3, s3, s8 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX8-NEXT: s_lshr_b32 s3, s3, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, s5 ; GFX8-NEXT: s_lshr_b32 s3, s3, s4 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: 
s_or_b32 s1, s1, s3 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_v3i16: @@ -4788,15 +4725,13 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_or_b32_e32 v6, v6, v7 ; GFX6-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000 ; GFX6-NEXT: v_bfe_u32 v8, v3, 1, 15 -; GFX6-NEXT: s_bfe_u32 s5, 14, 0x100000 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v8, s5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 14, v8 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v8 ; GFX6-NEXT: v_bfe_u32 v8, v4, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v8, s5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 14, v8 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v6 @@ -4804,9 +4739,9 @@ ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_bfe_u32 v9, v9, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX6-NEXT: v_bfe_u32 v3, v3, 1, 15 -; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v9, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 @@ -4814,24 +4749,24 @@ ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_bfe_u32 v3, v3, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v3, v1 ; GFX6-NEXT: v_bfe_u32 v3, v4, 1, 15 -; GFX6-NEXT: v_bfe_u32 v4, v6, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, 
v3 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: v_bfe_u32 v3, v5, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, s4, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, s5, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 14, v3 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v7 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 15, v4 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_bfe_u32 v5, v5, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX6-NEXT: v_bfe_u32 v3, v3, 1, 15 -; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 @@ -4878,11 +4813,11 @@ ; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, v5, v1 ; GFX8-NEXT: v_lshrrev_b16_e32 v3, v4, v3 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_v3i16: @@ -4959,47 +4894,45 @@ ; GFX6-NEXT: s_lshl_b32 s9, s11, 16 ; GFX6-NEXT: s_and_b32 s10, s10, 0xffff ; GFX6-NEXT: s_or_b32 s9, s9, s10 -; GFX6-NEXT: s_bfe_u32 s10, 1, 0x100000 -; GFX6-NEXT: s_bfe_u32 s11, s4, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s12, 14, 0x100000 -; GFX6-NEXT: s_lshl_b32 s0, s0, s10 -; GFX6-NEXT: s_lshr_b32 s11, s11, s12 -; GFX6-NEXT: s_or_b32 s0, s0, s11 -; GFX6-NEXT: s_bfe_u32 s11, s5, 0xf0001 -; GFX6-NEXT: s_lshl_b32 s1, s1, s10 -; GFX6-NEXT: s_lshr_b32 s11, s11, s12 +; GFX6-NEXT: s_bfe_u32 s10, s4, 0xf0001 +; 
GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: s_lshr_b32 s10, s10, 14 +; GFX6-NEXT: s_or_b32 s0, s0, s10 +; GFX6-NEXT: s_bfe_u32 s10, s5, 0xf0001 +; GFX6-NEXT: s_lshl_b32 s1, s1, 1 +; GFX6-NEXT: s_lshr_b32 s10, s10, 14 ; GFX6-NEXT: s_xor_b32 s8, s8, -1 -; GFX6-NEXT: s_or_b32 s1, s1, s11 +; GFX6-NEXT: s_or_b32 s1, s1, s10 ; GFX6-NEXT: s_lshl_b32 s4, s4, 1 -; GFX6-NEXT: s_lshr_b32 s11, s8, 16 -; GFX6-NEXT: s_and_b32 s13, s8, 15 +; GFX6-NEXT: s_lshr_b32 s10, s8, 16 +; GFX6-NEXT: s_and_b32 s11, s8, 15 ; GFX6-NEXT: s_andn2_b32 s8, 15, s8 -; GFX6-NEXT: s_bfe_u32 s13, s13, 0x100000 +; GFX6-NEXT: s_and_b32 s11, 0xffff, s11 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX6-NEXT: s_lshl_b32 s0, s0, s13 +; GFX6-NEXT: s_and_b32 s8, 0xffff, s8 +; GFX6-NEXT: s_lshl_b32 s0, s0, s11 ; GFX6-NEXT: s_lshr_b32 s4, s4, s8 ; GFX6-NEXT: s_or_b32 s0, s0, s4 -; GFX6-NEXT: s_and_b32 s4, s11, 15 +; GFX6-NEXT: s_and_b32 s4, s10, 15 ; GFX6-NEXT: s_lshl_b32 s5, s5, 1 -; GFX6-NEXT: s_andn2_b32 s8, 15, s11 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_andn2_b32 s8, 15, s10 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, s4 ; GFX6-NEXT: s_bfe_u32 s4, s5, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s5, s8, 0x100000 +; GFX6-NEXT: s_and_b32 s5, 0xffff, s8 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 ; GFX6-NEXT: s_or_b32 s1, s1, s4 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s2, s10 +; GFX6-NEXT: s_lshl_b32 s1, s2, 1 ; GFX6-NEXT: s_bfe_u32 s2, s6, 0xf0001 -; GFX6-NEXT: s_lshr_b32 s2, s2, s12 +; GFX6-NEXT: s_lshr_b32 s2, s2, 14 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_lshl_b32 s2, s3, s10 +; GFX6-NEXT: s_lshl_b32 s2, s3, 1 ; GFX6-NEXT: s_bfe_u32 s3, s7, 0xf0001 -; GFX6-NEXT: s_lshr_b32 s3, s3, s12 +; GFX6-NEXT: s_lshr_b32 s3, s3, 14 ; 
GFX6-NEXT: s_xor_b32 s5, s9, -1 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s6, 1 @@ -5007,98 +4940,96 @@ ; GFX6-NEXT: s_lshr_b32 s6, s5, 16 ; GFX6-NEXT: s_and_b32 s7, s5, 15 ; GFX6-NEXT: s_andn2_b32 s5, 15, s5 -; GFX6-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX6-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX6-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX6-NEXT: s_lshl_b32 s1, s1, s7 ; GFX6-NEXT: s_lshr_b32 s3, s3, s5 ; GFX6-NEXT: s_or_b32 s1, s1, s3 ; GFX6-NEXT: s_and_b32 s3, s6, 15 ; GFX6-NEXT: s_andn2_b32 s5, 15, s6 -; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX6-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX6-NEXT: s_lshl_b32 s2, s2, s3 ; GFX6-NEXT: s_bfe_u32 s3, s4, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s4, s5, 0x100000 +; GFX6-NEXT: s_and_b32 s4, 0xffff, s5 ; GFX6-NEXT: s_lshr_b32 s3, s3, s4 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s8, 1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s10, 15, 0x100000 +; GFX8-NEXT: s_and_b32 s8, 0xffff, s2 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 ; GFX8-NEXT: s_lshr_b32 s7, s2, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, s8 -; GFX8-NEXT: s_lshr_b32 s9, s9, s10 -; GFX8-NEXT: s_or_b32 s0, s0, s9 -; GFX8-NEXT: s_lshl_b32 s6, s6, s8 -; GFX8-NEXT: s_lshr_b32 s9, s7, s10 -; GFX8-NEXT: s_lshl_b32 s2, s2, s8 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_lshr_b32 s8, s8, 15 +; GFX8-NEXT: s_or_b32 s0, s0, s8 +; GFX8-NEXT: s_lshl_b32 s6, s6, 1 +; GFX8-NEXT: s_lshr_b32 s8, s7, 15 +; GFX8-NEXT: s_lshl_b32 s2, s2, 1 ; GFX8-NEXT: s_xor_b32 s4, s4, -1 -; GFX8-NEXT: s_or_b32 s6, s6, s9 -; GFX8-NEXT: s_lshr_b32 s9, s4, 16 -; GFX8-NEXT: 
s_and_b32 s11, s4, 15 +; GFX8-NEXT: s_or_b32 s6, s6, s8 +; GFX8-NEXT: s_lshr_b32 s8, s4, 16 +; GFX8-NEXT: s_and_b32 s9, s4, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s4 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s11, s11, 0x100000 -; GFX8-NEXT: s_lshr_b32 s2, s2, s8 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s11 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX8-NEXT: s_lshr_b32 s2, s2, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_lshl_b32 s0, s0, s9 ; GFX8-NEXT: s_lshr_b32 s2, s2, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s9, 15 -; GFX8-NEXT: s_lshl_b32 s7, s7, s8 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_andn2_b32 s4, 15, s9 +; GFX8-NEXT: s_and_b32 s2, s8, 15 +; GFX8-NEXT: s_lshl_b32 s7, s7, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_andn2_b32 s4, 15, s8 ; GFX8-NEXT: s_lshl_b32 s2, s6, s2 -; GFX8-NEXT: s_bfe_u32 s6, s7, 0x100000 -; GFX8-NEXT: s_lshr_b32 s6, s6, s8 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s7 +; GFX8-NEXT: s_lshr_b32 s6, s6, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshr_b32 s4, s6, s4 ; GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_bfe_u32 s6, s3, 0x100000 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s3 ; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_lshr_b32 s2, s1, 16 ; GFX8-NEXT: s_lshr_b32 s4, s3, 16 -; GFX8-NEXT: s_lshl_b32 s1, s1, s8 -; GFX8-NEXT: s_lshr_b32 s6, s6, s10 +; GFX8-NEXT: s_lshl_b32 s1, s1, 1 +; GFX8-NEXT: s_lshr_b32 s6, s6, 15 ; GFX8-NEXT: s_or_b32 s1, s1, s6 -; GFX8-NEXT: s_lshl_b32 s2, s2, s8 -; GFX8-NEXT: s_lshr_b32 s6, s4, s10 -; GFX8-NEXT: s_lshl_b32 s3, s3, s8 +; GFX8-NEXT: s_lshl_b32 s2, s2, 1 +; GFX8-NEXT: s_lshr_b32 s6, s4, 15 +; GFX8-NEXT: 
s_lshl_b32 s3, s3, 1 ; GFX8-NEXT: s_xor_b32 s5, s5, -1 ; GFX8-NEXT: s_or_b32 s2, s2, s6 ; GFX8-NEXT: s_lshr_b32 s6, s5, 16 ; GFX8-NEXT: s_and_b32 s7, s5, 15 ; GFX8-NEXT: s_andn2_b32 s5, 15, s5 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX8-NEXT: s_lshr_b32 s3, s3, s8 -; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX8-NEXT: s_lshr_b32 s3, s3, 1 +; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX8-NEXT: s_lshl_b32 s1, s1, s7 ; GFX8-NEXT: s_lshr_b32 s3, s3, s5 ; GFX8-NEXT: s_or_b32 s1, s1, s3 ; GFX8-NEXT: s_and_b32 s3, s6, 15 -; GFX8-NEXT: s_lshl_b32 s4, s4, s8 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_lshl_b32 s4, s4, 1 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_andn2_b32 s5, 15, s6 ; GFX8-NEXT: s_lshl_b32 s2, s2, s3 -; GFX8-NEXT: s_bfe_u32 s3, s4, 0x100000 -; GFX8-NEXT: s_lshr_b32 s3, s3, s8 -; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s4 +; GFX8-NEXT: s_lshr_b32 s3, s3, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s5 ; GFX8-NEXT: s_lshr_b32 s3, s3, s4 ; GFX8-NEXT: s_or_b32 s2, s2, s3 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog @@ -5237,15 +5168,13 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v11 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX6-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000 ; GFX6-NEXT: v_bfe_u32 v10, v4, 1, 15 -; GFX6-NEXT: s_bfe_u32 s5, 14, 0x100000 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v10, s5, v10 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 14, v10 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v10 ; GFX6-NEXT: v_bfe_u32 v10, v5, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; 
GFX6-NEXT: v_lshrrev_b32_e32 v10, s5, v10 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 14, v10 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v10 ; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v8 @@ -5253,9 +5182,9 @@ ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 -; GFX6-NEXT: v_bfe_u32 v11, v11, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX6-NEXT: v_bfe_u32 v4, v4, 1, 15 -; GFX6-NEXT: v_bfe_u32 v8, v8, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v11, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 @@ -5263,19 +5192,19 @@ ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v10 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5 ; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 -; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 ; GFX6-NEXT: v_bfe_u32 v4, v5, 1, 15 -; GFX6-NEXT: v_bfe_u32 v5, v8, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX6-NEXT: v_bfe_u32 v4, v6, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, s4, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, s5, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 14, v4 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX6-NEXT: v_bfe_u32 v4, v7, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, s4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, s5, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 14, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v6 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v9 @@ -5284,19 +5213,19 @@ ; GFX6-NEXT: v_and_b32_e32 v8, 15, v6 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 ; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_bfe_u32 v8, v8, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX6-NEXT: v_bfe_u32 v4, v4, 1, 15 
-; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v8, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v7 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v7 ; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_bfe_u32 v4, v5, 1, 15 -; GFX6-NEXT: v_bfe_u32 v5, v6, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -5330,36 +5259,37 @@ ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v6, v0 ; GFX8-NEXT: v_lshrrev_b16_e32 v2, v7, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b16_e32 v4, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 15, v3 -; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX8-NEXT: v_mov_b32_e32 v6, 1 -; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_sdwa v7, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 -; GFX8-NEXT: v_lshlrev_b16_e32 v7, 1, v3 -; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX8-NEXT: v_and_b32_e32 v8, 15, v5 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, 1, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, 15, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, 1 
+; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_sdwa v6, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v3 +; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v4 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v6 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, v7, v2 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v6 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v5 ; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 ; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: v_lshrrev_b16_e32 v7, 1, v7 -; GFX8-NEXT: v_lshlrev_b16_e32 v4, v8, v4 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v7 -; GFX8-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v6 -; GFX8-NEXT: v_xor_b32_e32 v6, -1, v6 -; GFX8-NEXT: v_and_b32_e32 v6, 15, v6 ; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3 -; GFX8-NEXT: v_lshlrev_b16_e32 v1, v5, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, v6, v3 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, v4, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, v5, v3 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_v4i16: @@ -5552,7 +5482,7 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX6-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX6-NEXT: v_not_b32_e32 v4, v4 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 ; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 @@ -5565,7 +5495,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] @@ -5578,7 +5508,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] @@ -5591,7 +5521,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v5, -1, v4 +; GFX10-NEXT: v_not_b32_e32 v5, v4 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX10-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX10-NEXT: v_and_b32_e32 v5, 63, v5 @@ -5605,7 +5535,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_xor_b32_e32 v5, -1, v4 +; GFX11-NEXT: v_not_b32_e32 v5, v4 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX11-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -5758,7 +5688,7 @@ ; GFX6-LABEL: v_fshr_i64_ssv: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_and_b32_e32 v2, 63, v0 -; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX6-NEXT: v_not_b32_e32 v0, v0 ; GFX6-NEXT: v_and_b32_e32 v0, 63, v0 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 ; GFX6-NEXT: v_lshl_b64 v[0:1], s[0:1], v0 @@ -5770,7 +5700,7 @@ ; GFX8-LABEL: v_fshr_i64_ssv: ; GFX8: ; %bb.0: ; GFX8-NEXT: 
v_and_b32_e32 v2, 63, v0 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 63, v0 ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1] @@ -5782,7 +5712,7 @@ ; GFX9-LABEL: v_fshr_i64_ssv: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_and_b32_e32 v2, 63, v0 -; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 63, v0 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1] @@ -5793,7 +5723,7 @@ ; ; GFX10-LABEL: v_fshr_i64_ssv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX10-NEXT: v_not_b32_e32 v1, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 63, v0 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 ; GFX10-NEXT: v_and_b32_e32 v2, 63, v1 @@ -5805,7 +5735,7 @@ ; ; GFX11-LABEL: v_fshr_i64_ssv: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX11-NEXT: v_not_b32_e32 v1, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 63, v0 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -6036,12 +5966,12 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX6-NEXT: v_not_b32_e32 v8, v8 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 ; GFX6-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v8 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v9 -; GFX6-NEXT: v_xor_b32_e32 v8, -1, v10 +; GFX6-NEXT: v_not_b32_e32 v8, v10 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 63, v10 @@ -6057,12 +5987,12 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX8-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX8-NEXT: v_not_b32_e32 v8, v8 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX8-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX8-NEXT: v_lshlrev_b64 
v[0:1], v8, v[0:1] ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5] -; GFX8-NEXT: v_xor_b32_e32 v8, -1, v10 +; GFX8-NEXT: v_not_b32_e32 v8, v10 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 63, v10 @@ -6078,12 +6008,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX9-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX9-NEXT: v_not_b32_e32 v8, v8 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5] -; GFX9-NEXT: v_xor_b32_e32 v8, -1, v10 +; GFX9-NEXT: v_not_b32_e32 v8, v10 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX9-NEXT: v_and_b32_e32 v4, 63, v10 @@ -6099,8 +6029,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v8 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v10 +; GFX10-NEXT: v_not_b32_e32 v9, v8 +; GFX10-NEXT: v_not_b32_e32 v11, v10 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX10-NEXT: v_and_b32_e32 v8, 63, v8 @@ -6121,8 +6051,8 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_xor_b32_e32 v9, -1, v8 -; GFX11-NEXT: v_xor_b32_e32 v11, -1, v10 +; GFX11-NEXT: v_not_b32_e32 v9, v8 +; GFX11-NEXT: v_not_b32_e32 v11, v10 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX11-NEXT: v_and_b32_e32 v8, 63, v8 @@ -6390,7 +6320,7 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX6-NEXT: v_not_b32_e32 v8, v8 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v8 ; GFX6-NEXT: v_lshl_b64 
v[8:9], v[0:1], 1 @@ -6438,7 +6368,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX8-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX8-NEXT: v_not_b32_e32 v8, v8 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v8 ; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] @@ -6486,7 +6416,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX9-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX9-NEXT: v_not_b32_e32 v8, v8 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v8 ; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] @@ -6534,7 +6464,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v8 +; GFX10-NEXT: v_not_b32_e32 v9, v8 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 31, v1 ; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v8 @@ -6583,7 +6513,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_xor_b32_e32 v9, -1, v8 +; GFX11-NEXT: v_not_b32_e32 v9, v8 ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 31, v1 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] @@ -6639,7 +6569,7 @@ ; GFX6-LABEL: v_fshr_i128_ssv: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX6-NEXT: v_not_b32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s9, 0 ; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v0 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 @@ -6691,7 +6621,7 @@ ; GFX8-LABEL: v_fshr_i128_ssv: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: s_mov_b32 s9, 0 ; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v0 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 @@ -6743,7 +6673,7 @@ ; 
GFX9-LABEL: v_fshr_i128_ssv: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: s_mov_b32 s9, 0 ; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v0 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 @@ -6794,7 +6724,7 @@ ; ; GFX10-LABEL: v_fshr_i128_ssv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX10-NEXT: v_not_b32_e32 v1, v0 ; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v0 ; GFX10-NEXT: s_mov_b32 s9, 0 ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 @@ -6842,7 +6772,7 @@ ; ; GFX11-LABEL: v_fshr_i128_ssv: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX11-NEXT: v_not_b32_e32 v1, v0 ; GFX11-NEXT: s_lshr_b32 s8, s1, 31 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 ; GFX11-NEXT: s_mov_b32 s9, 0 @@ -8036,7 +7966,7 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX6-NEXT: v_xor_b32_e32 v16, -1, v16 +; GFX6-NEXT: v_not_b32_e32 v16, v16 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GFX6-NEXT: v_and_b32_e32 v24, 0x7f, v16 ; GFX6-NEXT: v_lshl_b64 v[16:17], v[0:1], 1 @@ -8072,7 +8002,7 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX6-NEXT: v_xor_b32_e32 v8, -1, v20 +; GFX6-NEXT: v_not_b32_e32 v8, v20 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] ; GFX6-NEXT: v_or_b32_e32 v3, v19, v3 @@ -8127,7 +8057,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v16 +; GFX8-NEXT: v_not_b32_e32 v16, v16 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX8-NEXT: v_and_b32_e32 v24, 0x7f, v16 ; GFX8-NEXT: v_lshlrev_b64 v[16:17], 1, v[0:1] @@ -8163,7 +8093,7 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] ; GFX8-NEXT: 
v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX8-NEXT: v_xor_b32_e32 v8, -1, v20 +; GFX8-NEXT: v_not_b32_e32 v8, v20 ; GFX8-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] ; GFX8-NEXT: v_or_b32_e32 v3, v19, v3 @@ -8218,7 +8148,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v16 +; GFX9-NEXT: v_not_b32_e32 v16, v16 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16 ; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[0:1] @@ -8254,7 +8184,7 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX9-NEXT: v_xor_b32_e32 v8, -1, v20 +; GFX9-NEXT: v_not_b32_e32 v8, v20 ; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] ; GFX9-NEXT: v_or_b32_e32 v3, v19, v3 @@ -8309,7 +8239,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v17, -1, v16 +; GFX10-NEXT: v_not_b32_e32 v17, v16 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX10-NEXT: v_and_b32_e32 v26, 0x7f, v16 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] @@ -8343,7 +8273,7 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v22, v22, v3, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v26 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v16, s4 -; GFX10-NEXT: v_xor_b32_e32 v16, -1, v20 +; GFX10-NEXT: v_not_b32_e32 v16, v20 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v17, s4 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v26, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo @@ -8401,7 +8331,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_xor_b32_e32 v17, -1, v16 +; GFX11-NEXT: v_not_b32_e32 v17, v16 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX11-NEXT: v_lshlrev_b64 
v[6:7], 1, v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -8438,7 +8368,7 @@ ; GFX11-NEXT: v_or_b32_e32 v16, v16, v18 ; GFX11-NEXT: v_or_b32_e32 v17, v17, v19 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v16, s0 -; GFX11-NEXT: v_xor_b32_e32 v16, -1, v20 +; GFX11-NEXT: v_not_b32_e32 v16, v20 ; GFX11-NEXT: v_cndmask_b32_e32 v18, v21, v2, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v17, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -283,7 +283,7 @@ ; GFX9-NEXT: s_and_b32 s2, s4, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s2 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s1 -; GFX9-NEXT: v_xor_b32_e32 v3, -1, v0 +; GFX9-NEXT: v_not_b32_e32 v3, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -300,7 +300,7 @@ ; GFX8-NEXT: s_and_b32 s2, s4, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s2 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s1 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v3, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -317,7 +317,7 @@ ; GFX7-NEXT: s_and_b32 s1, s4, 0xffff ; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX7-NEXT: v_not_b32_e32 v0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, s0, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 @@ -334,7 +334,7 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s1 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1 +; GFX10-NEXT: v_not_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -352,7 +352,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_xor_b32_e32 v3, -1, v1 +; GFX11-NEXT: v_not_b32_e32 v3, v1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -375,7 +375,7 @@ ; GFX9-NEXT: s_mov_b32 s1, 0xffff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v1, s1 -; GFX9-NEXT: v_xor_b32_e32 v3, -1, v0 +; GFX9-NEXT: v_not_b32_e32 v3, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -391,7 +391,7 @@ ; GFX8-NEXT: s_mov_b32 s1, 0xffff ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v1, s1 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v3, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -408,7 +408,7 @@ ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v1 -; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX7-NEXT: v_not_b32_e32 v0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, s0, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 @@ -426,7 +426,7 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_and_or_b32 v2, s0, v2, v3 ; GFX10-NEXT: global_store_dword v[0:1], v2, off @@ -443,7 +443,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(VALU_DEP_3) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, v1, v0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX11-NEXT: v_not_b32_e32 v2, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v2, s0, v2, v3 @@ -466,7 +466,7 @@ ; GFX9-NEXT: s_and_b32 s1, s2, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s1 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s0 -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v0 +; GFX9-NEXT: v_not_b32_e32 v4, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -483,7 +483,7 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_not_b32_e32 v1, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v3, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -500,7 +500,7 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 @@ -517,7 +517,7 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s0 -; GFX10-NEXT: v_xor_b32_e32 v4, -1, v1 +; GFX10-NEXT: v_not_b32_e32 v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -535,7 +535,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_xor_b32_e32 v4, -1, v1 +; GFX11-NEXT: v_not_b32_e32 v4, v1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -647,7 +647,7 @@ 
; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s0 -; GFX9-NEXT: v_xor_b32_e32 v3, -1, v0 +; GFX9-NEXT: v_not_b32_e32 v3, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -663,7 +663,7 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_not_b32_e32 v1, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v3, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -680,7 +680,7 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 @@ -696,7 +696,7 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1 +; GFX10-NEXT: v_not_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -715,7 +715,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, v0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX11-NEXT: v_not_b32_e32 v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v2, v4, v2, v3 @@ -1053,7 +1053,7 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_lshlrev_b32_e64 v3, v0, s3 ; GFX9-NEXT: 
v_lshlrev_b32_e64 v0, v0, s2 -; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: v_and_or_b32 v4, v1, v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -1080,7 +1080,7 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s3 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s2 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_or_b32_e32 v4, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1107,7 +1107,7 @@ ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX7-NEXT: v_not_b32_e32 v0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_or_b32_e32 v4, v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -1130,7 +1130,7 @@ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s2 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s1 ; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo @@ -1161,7 +1161,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v1, s2 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 -; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX11-NEXT: v_not_b32_e32 v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_or_b32 v5, v5, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 @@ -1192,7 +1192,7 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX9-NEXT: v_not_b32_e32 v1, v1 ; GFX9-NEXT: v_and_or_b32 v4, v3, v1, v0 ; GFX9-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -1218,7 +1218,7 @@ ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_not_b32_e32 v1, v1 ; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 ; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1245,7 +1245,7 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 ; GFX7-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -1267,7 +1267,7 @@ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX10-NEXT: v_not_b32_e32 v3, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo @@ -1296,7 +1296,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff ; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v0 -; GFX11-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX11-NEXT: v_not_b32_e32 v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_and_or_b32 v5, v5, v3, v2 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -1327,7 +1327,7 @@ ; GFX9-NEXT: v_lshlrev_b32_e64 v6, v2, s1 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 ; GFX9-NEXT: 
v_mov_b32_e32 v4, 0 @@ -1350,7 +1350,7 @@ ; GFX8-NEXT: v_lshlrev_b32_e64 v6, v2, s1 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 @@ -1373,7 +1373,7 @@ ; GFX7-NEXT: v_lshl_b32_e32 v6, s0, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 ; GFX7-NEXT: v_mov_b32_e32 v4, 0 @@ -1397,7 +1397,7 @@ ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v3, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v5 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v4 +; GFX10-NEXT: v_not_b32_e32 v3, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo ; GFX10-NEXT: v_and_or_b32 v4, v4, v3, v2 @@ -1421,7 +1421,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v3, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v5 -; GFX11-NEXT: v_xor_b32_e32 v3, -1, v4 +; GFX11-NEXT: v_not_b32_e32 v3, v4 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -1569,7 +1569,7 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: v_lshlrev_b32_e64 v3, v3, s0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX9-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX9-NEXT: v_not_b32_e32 v3, v3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 @@ -1591,7 +1591,7 @@ ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_e64 
v3, v3, s0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX8-NEXT: v_not_b32_e32 v3, v3 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 @@ -1614,7 +1614,7 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX7-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX7-NEXT: v_not_b32_e32 v3, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, 0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 ; GFX7-NEXT: v_mov_b32_e32 v5, 0 @@ -1637,7 +1637,7 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v6 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX10-NEXT: v_not_b32_e32 v3, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo ; GFX10-NEXT: v_and_or_b32 v4, v4, v3, v2 @@ -1662,7 +1662,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX11-NEXT: v_not_b32_e32 v3, v5 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo ; GFX11-NEXT: v_and_or_b32 v4, v4, v3, v2 @@ -2222,7 +2222,7 @@ ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] -; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: v_and_or_b32 v6, v1, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 @@ -2259,7 +2259,7 @@ ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s5 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 
; GFX8-NEXT: v_or_b32_e32 v6, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s8 @@ -2296,7 +2296,7 @@ ; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] -; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX7-NEXT: v_not_b32_e32 v0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_or_b32_e32 v5, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, s8 @@ -2327,7 +2327,7 @@ ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 -; GFX10-NEXT: v_xor_b32_e32 v5, -1, v2 +; GFX10-NEXT: v_not_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s9 ; GFX10-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo @@ -2366,7 +2366,7 @@ ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_xor_b32_e32 v5, -1, v2 +; GFX11-NEXT: v_not_b32_e32 v5, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 ; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 @@ -2408,7 +2408,7 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX9-NEXT: v_not_b32_e32 v1, v1 ; GFX9-NEXT: v_and_or_b32 v6, v2, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 @@ -2444,7 +2444,7 @@ ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_not_b32_e32 v1, v1 ; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v6, v1, v0 ; GFX8-NEXT: 
v_mov_b32_e32 v0, s4 @@ -2481,7 +2481,7 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2512,7 +2512,7 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v5, -1, v3 +; GFX10-NEXT: v_not_b32_e32 v5, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo @@ -2551,7 +2551,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e32 v4, v2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0 -; GFX11-NEXT: v_xor_b32_e32 v5, -1, v3 +; GFX11-NEXT: v_not_b32_e32 v5, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -2588,7 +2588,7 @@ ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX9-NEXT: v_not_b32_e32 v1, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 @@ -2617,7 +2617,7 @@ ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_not_b32_e32 v1, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v8, 0 @@ -2649,7 +2649,7 @@ ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 ; GFX7-NEXT: 
v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2678,7 +2678,7 @@ ; GFX10-NEXT: v_lshlrev_b32_e64 v7, v0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 -; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX10-NEXT: v_not_b32_e32 v7, v7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 @@ -2708,7 +2708,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e64 v7, v0, 0xffff ; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, s1 ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 -; GFX11-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX11-NEXT: v_not_b32_e32 v7, v7 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2895,7 +2895,7 @@ ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX9-NEXT: v_not_b32_e32 v1, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 @@ -2923,7 +2923,7 @@ ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_not_b32_e32 v1, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 @@ -2955,7 +2955,7 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: 
s_waitcnt vmcnt(0) @@ -2983,7 +2983,7 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 ; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v8 +; GFX10-NEXT: v_not_b32_e32 v2, v8 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: v_mov_b32_e32 v9, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3017,7 +3017,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e32 v0, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 -; GFX11-NEXT: v_xor_b32_e32 v2, -1, v8 +; GFX11-NEXT: v_not_b32_e32 v2, v8 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: v_mov_b32_e32 v9, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3587,7 +3587,7 @@ ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] -; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: v_and_or_b32 v9, v1, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -3647,7 +3647,7 @@ ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s5 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_or_b32_e32 v9, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s16 @@ -3707,7 +3707,7 @@ ; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] -; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX7-NEXT: v_not_b32_e32 v0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_or_b32_e32 v9, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, s16 @@ -3754,7 +3754,7 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, 
s9 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v2 +; GFX10-NEXT: v_not_b32_e32 v9, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s11, s1 @@ -3808,7 +3808,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s9 ; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 -; GFX11-NEXT: v_xor_b32_e32 v9, -1, v2 +; GFX11-NEXT: v_not_b32_e32 v9, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s10, s0 @@ -3880,7 +3880,7 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s20 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX9-NEXT: v_not_b32_e32 v1, v1 ; GFX9-NEXT: v_and_or_b32 v9, v2, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 @@ -3939,7 +3939,7 @@ ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s20 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_not_b32_e32 v1, v1 ; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v9, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s12 @@ -3999,7 +3999,7 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_or_b32_e32 v9, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s12 @@ -4046,7 +4046,7 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s9 -; 
GFX10-NEXT: v_xor_b32_e32 v9, -1, v3 +; GFX10-NEXT: v_not_b32_e32 v9, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, s1 @@ -4100,7 +4100,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, s9 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, v1, v0 -; GFX11-NEXT: v_xor_b32_e32 v9, -1, v3 +; GFX11-NEXT: v_not_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0 @@ -4160,7 +4160,7 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX9-NEXT: v_not_b32_e32 v1, v1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc @@ -4208,7 +4208,7 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_not_b32_e32 v1, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc @@ -4257,7 +4257,7 @@ ; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 ; GFX7-NEXT: s_mov_b32 s18, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) @@ -4305,7 +4305,7 @@ ; GFX10-NEXT: v_lshlrev_b32_e64 v11, v2, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, s5 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v11 +; GFX10-NEXT: v_not_b32_e32 v11, v11 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX10-NEXT: 
v_cndmask_b32_e64 v1, v1, v5, s0 @@ -4351,7 +4351,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e64 v11, v2, 0xffff ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v2, s5 ; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 -; GFX11-NEXT: v_xor_b32_e32 v11, -1, v11 +; GFX11-NEXT: v_not_b32_e32 v11, v11 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4556,7 +4556,7 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX9-NEXT: v_not_b32_e32 v1, v1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc @@ -4603,7 +4603,7 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_not_b32_e32 v1, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc @@ -4652,7 +4652,7 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 ; GFX7-NEXT: s_mov_b32 s18, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) @@ -4699,7 +4699,7 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e64 v12, v3, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v12 +; GFX10-NEXT: v_not_b32_e32 v3, v12 ; GFX10-NEXT: v_mov_b32_e32 v12, 0 ; GFX10-NEXT: v_mov_b32_e32 v13, 0 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -4745,7 +4745,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; 
GFX11-NEXT: v_lshlrev_b32_e64 v12, v3, 0xffff ; GFX11-NEXT: v_lshlrev_b32_e32 v2, v3, v2 -; GFX11-NEXT: v_xor_b32_e32 v3, -1, v12 +; GFX11-NEXT: v_not_b32_e32 v3, v12 ; GFX11-NEXT: v_mov_b32_e32 v12, 0 ; GFX11-NEXT: v_mov_b32_e32 v13, 0 ; GFX11-NEXT: s_waitcnt vmcnt(1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -1077,7 +1077,7 @@ ; GFX9-NEXT: s_and_b32 s2, s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s2 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s1 -; GFX9-NEXT: v_xor_b32_e32 v3, -1, v0 +; GFX9-NEXT: v_not_b32_e32 v3, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1094,7 +1094,7 @@ ; GFX8-NEXT: s_and_b32 s2, s4, 0xff ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s2 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s1 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v3, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -1111,7 +1111,7 @@ ; GFX7-NEXT: s_and_b32 s1, s4, 0xff ; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX7-NEXT: v_not_b32_e32 v0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1129,7 +1129,7 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xff ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s1 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1 +; GFX10-NEXT: v_not_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1147,7 +1147,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e64 v1, v0, 0xff ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | 
instid1(VALU_DEP_3) -; GFX11-NEXT: v_xor_b32_e32 v3, -1, v1 +; GFX11-NEXT: v_not_b32_e32 v3, v1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1170,7 +1170,7 @@ ; GFX9-NEXT: s_movk_i32 s1, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v1, s1 -; GFX9-NEXT: v_xor_b32_e32 v3, -1, v0 +; GFX9-NEXT: v_not_b32_e32 v3, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1186,7 +1186,7 @@ ; GFX8-NEXT: s_movk_i32 s1, 0xff ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v1, s1 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v3, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -1203,7 +1203,7 @@ ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 @@ -1222,7 +1222,7 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_and_or_b32 v2, s0, v2, v3 ; GFX10-NEXT: global_store_dword v[0:1], v2, off @@ -1239,7 +1239,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, v1, v0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX11-NEXT: v_not_b32_e32 v2, v2 ; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v2, s0, v2, v3 @@ -1262,7 +1262,7 @@ ; GFX9-NEXT: s_and_b32 s1, s2, 0xff ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s1 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s0 -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v0 +; GFX9-NEXT: v_not_b32_e32 v4, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1279,7 +1279,7 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_not_b32_e32 v1, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v3, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -1299,7 +1299,7 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 @@ -1315,7 +1315,7 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xff ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s0 -; GFX10-NEXT: v_xor_b32_e32 v4, -1, v1 +; GFX10-NEXT: v_not_b32_e32 v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1333,7 +1333,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e64 v1, v0, 0xff ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_xor_b32_e32 v4, -1, v1 +; GFX11-NEXT: v_not_b32_e32 v4, v1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1447,7 +1447,7 @@ ; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s0 -; GFX9-NEXT: 
v_xor_b32_e32 v3, -1, v0 +; GFX9-NEXT: v_not_b32_e32 v3, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1463,7 +1463,7 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_not_b32_e32 v1, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v3, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -1483,7 +1483,7 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 @@ -1498,7 +1498,7 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xff ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1 +; GFX10-NEXT: v_not_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1517,7 +1517,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, v0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX11-NEXT: v_not_b32_e32 v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v2, v4, v2, v3 @@ -1935,7 +1935,7 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_lshlrev_b32_e64 v3, v0, s3 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s2 -; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: v_and_or_b32 v4, v1, v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: 
v_mov_b32_e32 v1, s1 @@ -1962,7 +1962,7 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s3 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s2 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_or_b32_e32 v4, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1989,7 +1989,7 @@ ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX7-NEXT: v_not_b32_e32 v0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_or_b32_e32 v3, v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -2013,7 +2013,7 @@ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s2 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s1 ; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo @@ -2044,7 +2044,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v1, s2 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 -; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX11-NEXT: v_not_b32_e32 v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_or_b32 v5, v5, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 @@ -2075,7 +2075,7 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX9-NEXT: v_not_b32_e32 v1, v1 ; GFX9-NEXT: v_and_or_b32 v4, v3, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -2101,7 +2101,7 @@ ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: 
v_lshlrev_b32_e64 v1, v1, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_not_b32_e32 v1, v1 ; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 ; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -2128,7 +2128,7 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 ; GFX7-NEXT: v_or_b32_e32 v3, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -2151,7 +2151,7 @@ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, 0xff ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX10-NEXT: v_not_b32_e32 v3, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo @@ -2180,7 +2180,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v2, 0xff ; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v0 -; GFX11-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX11-NEXT: v_not_b32_e32 v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_and_or_b32 v5, v5, v3, v2 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -2211,7 +2211,7 @@ ; GFX9-NEXT: v_lshlrev_b32_e64 v6, v2, s1 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 @@ -2234,7 +2234,7 @@ ; GFX8-NEXT: v_lshlrev_b32_e64 v6, v2, s1 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; 
GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 @@ -2260,7 +2260,7 @@ ; GFX7-NEXT: v_lshl_b32_e32 v4, s0, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xff, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2283,7 +2283,7 @@ ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xff ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v3, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v5 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v4 +; GFX10-NEXT: v_not_b32_e32 v3, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo ; GFX10-NEXT: v_and_or_b32 v4, v4, v3, v2 @@ -2307,7 +2307,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xff ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v3, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v5 -; GFX11-NEXT: v_xor_b32_e32 v3, -1, v4 +; GFX11-NEXT: v_not_b32_e32 v3, v4 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -2457,7 +2457,7 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshlrev_b32_e64 v3, v3, s0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX9-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX9-NEXT: v_not_b32_e32 v3, v3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 @@ -2479,7 +2479,7 @@ ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v3, s0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX8-NEXT: v_not_b32_e32 v3, v3 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 ; GFX8-NEXT: 
v_mov_b32_e32 v5, 0 @@ -2505,7 +2505,7 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_lshl_b32_e32 v3, 0xff, v3 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX7-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX7-NEXT: v_not_b32_e32 v3, v3 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2527,7 +2527,7 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v6 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xff ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX10-NEXT: v_not_b32_e32 v3, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo ; GFX10-NEXT: v_and_or_b32 v4, v4, v3, v2 @@ -2552,7 +2552,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xff ; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX11-NEXT: v_not_b32_e32 v3, v5 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo ; GFX11-NEXT: v_and_or_b32 v4, v4, v3, v2 @@ -3112,7 +3112,7 @@ ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] -; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: v_and_or_b32 v6, v1, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 @@ -3149,7 +3149,7 @@ ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s5 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_or_b32_e32 v6, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s8 @@ -3186,7 +3186,7 @@ ; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] -; 
GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX7-NEXT: v_not_b32_e32 v0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_or_b32_e32 v5, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, s8 @@ -3217,7 +3217,7 @@ ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 -; GFX10-NEXT: v_xor_b32_e32 v5, -1, v2 +; GFX10-NEXT: v_not_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s9 ; GFX10-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo @@ -3256,7 +3256,7 @@ ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_xor_b32_e32 v5, -1, v2 +; GFX11-NEXT: v_not_b32_e32 v5, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 ; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 @@ -3298,7 +3298,7 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX9-NEXT: v_not_b32_e32 v1, v1 ; GFX9-NEXT: v_and_or_b32 v6, v2, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 @@ -3334,7 +3334,7 @@ ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_not_b32_e32 v1, v1 ; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v6, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 @@ -3371,7 +3371,7 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: 
v_not_b32_e32 v1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -3402,7 +3402,7 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, 0xff ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_xor_b32_e32 v5, -1, v3 +; GFX10-NEXT: v_not_b32_e32 v5, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo @@ -3441,7 +3441,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e32 v4, v2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0 -; GFX11-NEXT: v_xor_b32_e32 v5, -1, v3 +; GFX11-NEXT: v_not_b32_e32 v5, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -3478,7 +3478,7 @@ ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX9-NEXT: v_not_b32_e32 v1, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 @@ -3507,7 +3507,7 @@ ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_not_b32_e32 v1, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v8, 0 @@ -3539,7 +3539,7 @@ ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: 
s_waitcnt vmcnt(0) @@ -3568,7 +3568,7 @@ ; GFX10-NEXT: v_lshlrev_b32_e64 v7, v0, 0xff ; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 -; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX10-NEXT: v_not_b32_e32 v7, v7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 @@ -3598,7 +3598,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e64 v7, v0, 0xff ; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, s1 ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 -; GFX11-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX11-NEXT: v_not_b32_e32 v7, v7 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3785,7 +3785,7 @@ ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX9-NEXT: v_not_b32_e32 v1, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 @@ -3813,7 +3813,7 @@ ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_not_b32_e32 v1, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 @@ -3845,7 +3845,7 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3873,7 +3873,7 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 ; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, 0xff ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; 
GFX10-NEXT: v_xor_b32_e32 v2, -1, v8 +; GFX10-NEXT: v_not_b32_e32 v2, v8 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: v_mov_b32_e32 v9, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3907,7 +3907,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e32 v0, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 -; GFX11-NEXT: v_xor_b32_e32 v2, -1, v8 +; GFX11-NEXT: v_not_b32_e32 v2, v8 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: v_mov_b32_e32 v9, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-abs.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-abs.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-abs.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-abs.mir @@ -77,7 +77,7 @@ ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: %src0:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: %zero:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX6-NEXT: %ineg:vgpr_32, dead %4:sreg_64_xexec = V_SUB_CO_U32_e64 %zero, %src0, 0, implicit $exec + ; GFX6-NEXT: %ineg:vgpr_32, dead %4:sreg_64 = V_SUB_CO_U32_e64 %zero, %src0, 0, implicit $exec ; GFX6-NEXT: %smax:vgpr_32 = V_MAX_I32_e64 %src0, %ineg, implicit $exec ; GFX6-NEXT: S_ENDPGM 0, implicit %smax ; GFX9-LABEL: name: smax_neg_abs_pattern_s32_vv diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir @@ -22,10 +22,10 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], [[COPY1]], implicit-def $scc - ; GFX6-NEXT: %7:vgpr_32, dead %12:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[S_ADD_I32_]], 0, implicit $exec - ; GFX6-NEXT: %8:vgpr_32, dead %11:sreg_64_xexec = 
V_ADD_CO_U32_e64 [[S_ADD_I32_]], %7, 0, implicit $exec - ; GFX6-NEXT: %9:vgpr_32, dead %10:sreg_64_xexec = V_ADD_CO_U32_e64 %8, [[COPY2]], 0, implicit $exec - ; GFX6-NEXT: S_ENDPGM 0, implicit [[S_ADD_I32_]], implicit %7, implicit %8, implicit %9 + ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY2]], [[S_ADD_I32_]], 0, implicit $exec + ; GFX6-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[S_ADD_I32_]], [[V_ADD_CO_U32_e64_]], 0, implicit $exec + ; GFX6-NEXT: [[V_ADD_CO_U32_e64_4:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_5:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[V_ADD_CO_U32_e64_2]], [[COPY2]], 0, implicit $exec + ; GFX6-NEXT: S_ENDPGM 0, implicit [[S_ADD_I32_]], implicit [[V_ADD_CO_U32_e64_]], implicit [[V_ADD_CO_U32_e64_2]], implicit [[V_ADD_CO_U32_e64_4]] ; GFX9-LABEL: name: add_s32 ; GFX9: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr3_vgpr4 ; GFX9-NEXT: {{ $}} @@ -103,8 +103,8 @@ ; GFX6: liveins: $vgpr0 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6-NEXT: %2:vgpr_32, dead %3:sreg_64 = V_SUB_CO_U32_e64 [[COPY]], 64, 0, implicit $exec - ; GFX6-NEXT: S_ENDPGM 0, implicit %2 + ; GFX6-NEXT: [[V_SUB_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_SUB_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_SUB_CO_U32_e64 [[COPY]], 64, 0, implicit $exec + ; GFX6-NEXT: S_ENDPGM 0, implicit [[V_SUB_CO_U32_e64_]] ; GFX9-LABEL: name: add_neg_inline_const_64_to_sub_s32_v ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -164,8 +164,8 @@ ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec - ; GFX6-NEXT: %2:vgpr_32, dead %3:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6-NEXT: S_ENDPGM 0, implicit %2 + ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = 
V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_]] ; GFX9-LABEL: name: add_neg_inline_const_16_to_sub_s32_v ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir @@ -62,8 +62,9 @@ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX10-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec - ; GFX10-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_NC_U16_e64_]], 0, 16, implicit $exec - ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]] + ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_ADD_NC_U16_e64_]], implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -125,8 +126,9 @@ ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[V_SUB_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_NC_U16_e64 0, [[COPY]], 0, 64, 0, 0, implicit $exec - ; GFX10-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_SUB_NC_U16_e64_]], 0, 16, implicit $exec - ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]] + ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_SUB_NC_U16_e64_]], implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s16) = G_CONSTANT i16 -64 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-and.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-and.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-and.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-and.mir @@ -22,7 +22,7 @@ ; WAVE64-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; WAVE64-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec ; WAVE64-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec - ; WAVE64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc + ; WAVE64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def $scc ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_AND_B64_]] ; WAVE32-LABEL: name: and_s1_vcc_vcc_vcc ; WAVE32: liveins: $vgpr0, $vgpr1 @@ -32,7 +32,7 @@ ; WAVE32-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; WAVE32-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec ; WAVE32-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec - ; WAVE32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc + ; WAVE32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def $scc ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_AND_B32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 @@ -401,7 +401,7 @@ ; WAVE64-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE64-NEXT: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec ; WAVE64-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_1]], 
implicit $exec - ; WAVE64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc + ; WAVE64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_AND_B64_]] ; WAVE32-LABEL: name: and_s1_vcc_copy_to_vcc ; WAVE32: liveins: $vgpr0, $vgpr1 @@ -412,7 +412,7 @@ ; WAVE32-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE32-NEXT: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec ; WAVE32-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_1]], implicit $exec - ; WAVE32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc + ; WAVE32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_AND_B32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 @@ -446,7 +446,7 @@ ; WAVE64-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE64-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc ; WAVE64-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc + ; WAVE64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_AND_B64_]] ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY1]] ; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave32 @@ -458,7 +458,7 @@ ; WAVE32-NEXT: 
[[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc ; WAVE32-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE32-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc + ; WAVE32-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_AND_B32_1]] ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY1]] %1:vgpr(s32) = COPY $vgpr0 @@ -494,7 +494,7 @@ ; WAVE64-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE64-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc ; WAVE64-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc + ; WAVE64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_AND_B64_]] ; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave64 ; WAVE32: liveins: $vgpr0, $sgpr0 @@ -505,7 +505,7 @@ ; WAVE32-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc ; WAVE32-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE32-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc + ; WAVE32-NEXT: 
[[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_AND_B32_1]] ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY1]] %1:vgpr(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-anyext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-anyext.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-anyext.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-anyext.mir @@ -36,8 +36,8 @@ ; GCN-LABEL: name: anyext_sgpr_s32_to_sgpr_s64 ; GCN: liveins: $sgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY $sgpr0 + ; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0 = IMPLICIT_DEF ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[DEF]], %subreg.sub1 ; GCN-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] %0:sgpr(s32) = COPY $sgpr0 @@ -100,13 +100,6 @@ bb.0: liveins: $vgpr0 - ; GCN-LABEL: name: anyext_vgpr_s16_to_vgpr_s64 - ; GCN: liveins: $vgpr0 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[DEF]], %subreg.sub1 - ; GCN-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s64) = G_ANYEXT %1 @@ -143,8 +136,9 @@ ; GCN: liveins: $sgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[S_BFE_U32_:%[0-9]+]]:sreg_32 = S_BFE_U32 [[COPY]], 1048576, implicit-def $scc - ; GCN-NEXT: $sgpr0 = COPY [[S_BFE_U32_]] + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_MOV_B32_]], [[COPY]], implicit-def $scc + ; GCN-NEXT: $sgpr0 = COPY [[S_AND_B32_]] 
%0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s1) = G_TRUNC %0 %2:sgpr(s16) = G_ANYEXT %1 @@ -207,8 +201,9 @@ ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[COPY]], 0, 16, implicit $exec - ; GCN-NEXT: $vgpr0 = COPY [[V_BFE_U32_e64_]] + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec + ; GCN-NEXT: $vgpr0 = COPY [[V_AND_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s1) = G_TRUNC %0 %2:vgpr(s16) = G_ANYEXT %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir @@ -15,7 +15,6 @@ # ERR-NOT: remark # ERR: remark: :0:0: cannot select: %4:sgpr(s16) = G_ASHR %2:sgpr, %3:sgpr(s16) (in function: ashr_s16_s16_ss) # ERR-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_ASHR %2:vgpr, %1:vgpr(s32) (in function: ashr_s16_s32_vv) -# ERR-NEXT: remark: :0:0: cannot select: %5:vgpr(s64) = G_ZEXT %4:vgpr(s16) (in function: ashr_s16_vv_zext_to_s64) # ERR-NEXT: remark: :0:0: cannot select: %3:sgpr(s16) = G_ASHR %2:sgpr, %1:sgpr(s32) (in function: ashr_s16_s32_ss) # ERR-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_ASHR %2:sgpr, %1:vgpr(s32) (in function: ashr_s16_s32_sv) # ERR-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_ASHR %2:vgpr, %1:sgpr(s32) (in function: ashr_s16_s32_vs) @@ -240,16 +239,18 @@ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX10-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ASHRREV_I16_e64_]], 0, 16, implicit $exec - ; GFX10-NEXT: 
S_ENDPGM 0, implicit [[V_BFE_U32_e64_]] + ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_ASHRREV_I16_e64_]], implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] ; GFX11-LABEL: name: ashr_s16_s16_vv_zext_to_s32 ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: [[V_ASHRREV_I16_t16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_t16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX11-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ASHRREV_I16_t16_e64_]], 0, 16, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]] + ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_ASHRREV_I16_t16_e64_]], implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -271,43 +272,51 @@ ; GFX8-LABEL: name: ashr_s16_vv_zext_to_s64 ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GFX8-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX8-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) - ; GFX8-NEXT: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[TRUNC1]](s16) - ; GFX8-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[ASHR]](s16) - ; GFX8-NEXT: S_ENDPGM 0, implicit [[ZEXT]](s64) + ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX8-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = 
S_MOV_B32 65535 + ; GFX8-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_ASHRREV_I16_e64_]], implicit $exec + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX8-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] ; GFX9-LABEL: name: ashr_s16_vv_zext_to_s64 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) - ; GFX9-NEXT: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[TRUNC1]](s16) - ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[ASHR]](s16) - ; GFX9-NEXT: S_ENDPGM 0, implicit [[ZEXT]](s64) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX9-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GFX9-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_ASHRREV_I16_e64_]], implicit $exec + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX9-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] ; GFX10-LABEL: name: ashr_s16_vv_zext_to_s64 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) - ; GFX10-NEXT: 
[[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[TRUNC1]](s16) - ; GFX10-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[ASHR]](s16) - ; GFX10-NEXT: S_ENDPGM 0, implicit [[ZEXT]](s64) + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_ASHRREV_I16_e64_]], implicit $exec + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX10-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] ; GFX11-LABEL: name: ashr_s16_vv_zext_to_s64 ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) - ; GFX11-NEXT: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[TRUNC1]](s16) - ; GFX11-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[ASHR]](s16) - ; GFX11-NEXT: S_ENDPGM 0, implicit [[ZEXT]](s64) + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_ASHRREV_I16_t16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_t16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_ASHRREV_I16_t16_e64_]], implicit $exec + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; 
GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX11-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-brcond.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-brcond.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-brcond.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-brcond.mir @@ -243,8 +243,8 @@ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec - ; GCN-NEXT: %5:sreg_64_xexec = nofpexcept V_CMP_EQ_F32_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], %5, implicit-def dead $scc + ; GCN-NEXT: [[V_CMP_EQ_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_EQ_F32_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_F32_e64_]], implicit-def $scc ; GCN-NEXT: $vcc = COPY [[S_AND_B64_]] ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; GCN-NEXT: {{ $}} @@ -283,7 +283,7 @@ ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[COPY2]], implicit-def $scc ; GCN-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec ; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_NE_U32_e64_]], implicit-def $scc ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 
[[S_AND_B64_]], $exec, implicit-def $scc ; GCN-NEXT: $vcc = COPY [[S_AND_B64_1]] ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc @@ -321,7 +321,7 @@ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1 - ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], [[S_MOV_B64_]], implicit-def dead $scc + ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], [[S_MOV_B64_]], implicit-def $scc ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_XOR_B64_]], $exec, implicit-def $scc ; GCN-NEXT: $vcc = COPY [[S_AND_B64_]] ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir @@ -572,9 +572,11 @@ ; GFX9PLUS-LABEL: name: test_build_vector_trunc_s_v2s16_zext_impdef_zext_constant ; GFX9PLUS: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GFX9PLUS-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123 - ; GFX9PLUS-NEXT: [[S_BFE_U32_:%[0-9]+]]:sreg_32 = S_BFE_U32 [[DEF]], 1048576, implicit-def $scc - ; GFX9PLUS-NEXT: [[S_BFE_U32_1:%[0-9]+]]:sreg_32 = S_BFE_U32 [[S_MOV_B32_]], 1048576, implicit-def $scc - ; GFX9PLUS-NEXT: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_BFE_U32_]], [[S_BFE_U32_1]] + ; GFX9PLUS-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GFX9PLUS-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_MOV_B32_1]], [[DEF]], implicit-def $scc + ; GFX9PLUS-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GFX9PLUS-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_MOV_B32_2]], 
[[S_MOV_B32_]], implicit-def $scc + ; GFX9PLUS-NEXT: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_AND_B32_]], [[S_AND_B32_1]] ; GFX9PLUS-NEXT: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] %0:sgpr(s16) = G_IMPLICIT_DEF %1:sgpr(s16) = G_CONSTANT i16 123 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ctpop.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ctpop.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ctpop.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ctpop.mir @@ -79,8 +79,9 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], [[COPY1]], implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_BCNT_U32_B32_e64_]] + ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[V_BCNT_U32_B32_e64_]], [[COPY1]], 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = G_CTPOP %0 @@ -103,8 +104,9 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], [[COPY1]], implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_BCNT_U32_B32_e64_]] + ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY1]], [[V_BCNT_U32_B32_e64_]], 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = G_CTPOP 
%0 @@ -153,8 +155,9 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], [[COPY1]], implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_BCNT_U32_B32_e64_]] + ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[V_BCNT_U32_B32_e64_]], [[COPY1]], 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:sgpr(s32) = COPY $sgpr0 %2:vgpr(s32) = G_CTPOP %0 @@ -178,8 +181,9 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY1]], [[COPY]], implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_BCNT_U32_B32_e64_]] + ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY1]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[V_BCNT_U32_B32_e64_]], [[COPY]], 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:sgpr(s32) = COPY $sgpr0 %2:vgpr(s32) = G_CTPOP %1 @@ -203,8 +207,9 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], [[COPY1]], implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_BCNT_U32_B32_e64_]] + ; CHECK-NEXT: [[S_BCNT1_I32_B32_:%[0-9]+]]:sreg_32 = S_BCNT1_I32_B32 [[COPY]], implicit-def $scc + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 
= V_ADD_CO_U32_e64 [[S_BCNT1_I32_B32_]], [[COPY1]], 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_]] %0:sgpr(s32) = COPY $sgpr0 %1:vgpr(s32) = COPY $vgpr0 %2:sgpr(s32) = G_CTPOP %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir @@ -13,7 +13,6 @@ # ERR-NOT: remark # ERR: remark: :0:0: cannot select: %4:sgpr(s16) = G_LSHR %2:sgpr, %3:sgpr(s16) (in function: lshr_s16_s16_ss) # ERR-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_LSHR %2:vgpr, %1:vgpr(s32) (in function: lshr_s16_s32_vv) -# ERR-NEXT: remark: :0:0: cannot select: %5:vgpr(s64) = G_ZEXT %4:vgpr(s16) (in function: lshr_s16_vv_zext_to_s64) # ERR-NEXT: remark: :0:0: cannot select: %3:sgpr(s16) = G_LSHR %2:sgpr, %1:sgpr(s32) (in function: lshr_s16_s32_ss) # ERR-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_LSHR %2:sgpr, %1:vgpr(s32) (in function: lshr_s16_s32_sv) # ERR-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_LSHR %2:vgpr, %1:sgpr(s32) (in function: lshr_s16_s32_vs) @@ -238,16 +237,18 @@ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX10-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_LSHRREV_B16_e64_]], 0, 16, implicit $exec - ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]] + ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_LSHRREV_B16_e64_]], implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] ; GFX11-LABEL: name: lshr_s16_s16_vv_zext_to_s32 ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 
= COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: [[V_LSHRREV_B16_t16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_t16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX11-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_LSHRREV_B16_t16_e64_]], 0, 16, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]] + ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_LSHRREV_B16_t16_e64_]], implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -269,43 +270,51 @@ ; GFX8-LABEL: name: lshr_s16_vv_zext_to_s64 ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GFX8-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX8-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) - ; GFX8-NEXT: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[TRUNC1]](s16) - ; GFX8-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[LSHR]](s16) - ; GFX8-NEXT: S_ENDPGM 0, implicit [[ZEXT]](s64) + ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX8-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GFX8-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHRREV_B16_e64_]], implicit $exec + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX8-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] ; GFX9-LABEL: name: lshr_s16_vv_zext_to_s64 ; GFX9: 
liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[TRUNC1]](s16) - ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[LSHR]](s16) - ; GFX9-NEXT: S_ENDPGM 0, implicit [[ZEXT]](s64) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX9-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GFX9-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHRREV_B16_e64_]], implicit $exec + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX9-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] ; GFX10-LABEL: name: lshr_s16_vv_zext_to_s64 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) - ; GFX10-NEXT: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[TRUNC1]](s16) - ; GFX10-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[LSHR]](s16) - ; GFX10-NEXT: S_ENDPGM 0, implicit [[ZEXT]](s64) + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10-NEXT: 
[[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHRREV_B16_e64_]], implicit $exec + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX10-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] ; GFX11-LABEL: name: lshr_s16_vv_zext_to_s64 ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) - ; GFX11-NEXT: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[TRUNC1]](s16) - ; GFX11-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[LSHR]](s16) - ; GFX11-NEXT: S_ENDPGM 0, implicit [[ZEXT]](s64) + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_LSHRREV_B16_t16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_t16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHRREV_B16_t16_e64_]], implicit $exec + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX11-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-or.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-or.mir --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-or.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-or.mir @@ -22,7 +22,7 @@ ; WAVE64-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; WAVE64-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec ; WAVE64-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec - ; WAVE64-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc + ; WAVE64-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def $scc ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_OR_B64_]] ; WAVE32-LABEL: name: or_s1_vcc_vcc_vcc ; WAVE32: liveins: $vgpr0, $vgpr1 @@ -32,7 +32,7 @@ ; WAVE32-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; WAVE32-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec ; WAVE32-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec - ; WAVE32-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_OR_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc + ; WAVE32-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_OR_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def $scc ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_OR_B32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 @@ -401,7 +401,7 @@ ; WAVE64-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE64-NEXT: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec ; WAVE64-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_1]], implicit $exec - ; WAVE64-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = 
S_OR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc + ; WAVE64-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_OR_B64_]] ; WAVE32-LABEL: name: or_s1_vcc_copy_to_vcc ; WAVE32: liveins: $vgpr0, $vgpr1 @@ -412,7 +412,7 @@ ; WAVE32-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE32-NEXT: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec ; WAVE32-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_1]], implicit $exec - ; WAVE32-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_OR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc + ; WAVE32-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_OR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_OR_B32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 @@ -446,7 +446,7 @@ ; WAVE64-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE64-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc ; WAVE64-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE64-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc + ; WAVE64-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_OR_B64_]] ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY1]] ; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave32 @@ -458,7 +458,7 @@ ; WAVE32-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; 
WAVE32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc ; WAVE32-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE32-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_OR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc + ; WAVE32-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_OR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_OR_B32_]] ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY1]] %1:vgpr(s32) = COPY $vgpr0 @@ -494,7 +494,7 @@ ; WAVE64-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE64-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc ; WAVE64-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE64-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc + ; WAVE64-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_OR_B64_]] ; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave64 ; WAVE32: liveins: $vgpr0, $sgpr0 @@ -505,7 +505,7 @@ ; WAVE32-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc ; WAVE32-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE32-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_OR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc + ; WAVE32-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_OR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc ; WAVE32-NEXT: 
[[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_OR_B32_]] ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY1]] %1:vgpr(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-add3.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-add3.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-add3.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-add3.mir @@ -56,9 +56,9 @@ ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX8-NEXT: %3:vgpr_32, dead %6:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec - ; GFX8-NEXT: %4:vgpr_32, dead %5:sreg_64_xexec = V_ADD_CO_U32_e64 %3, [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: S_ENDPGM 0, implicit %4 + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[V_ADD_CO_U32_e64_]], [[COPY2]], 0, implicit $exec + ; GFX8-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_2]] ; GFX9-LABEL: name: add_s32_vgpr_vgpr_vgpr ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -91,9 +91,9 @@ ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX8-NEXT: %3:vgpr_32, dead %6:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec - ; GFX8-NEXT: %4:vgpr_32, dead %5:sreg_64_xexec = V_ADD_CO_U32_e64 %3, [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: S_ENDPGM 0, implicit %4, implicit %3 + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead 
[[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[V_ADD_CO_U32_e64_]], [[COPY2]], 0, implicit $exec + ; GFX8-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_2]], implicit [[V_ADD_CO_U32_e64_]] ; GFX9-LABEL: name: add_s32_vgpr_vgpr_vgpr_multi_use ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -128,9 +128,9 @@ ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX8-NEXT: %3:vgpr_32, dead %6:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec - ; GFX8-NEXT: %4:vgpr_32, dead %5:sreg_64_xexec = V_ADD_CO_U32_e64 %3, [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: S_ENDPGM 0, implicit %4 + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_ADD_CO_U32_e64_]], [[COPY2]], 0, implicit $exec + ; GFX8-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_2]] ; GFX9-LABEL: name: add_p3_vgpr_vgpr_vgpr ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -164,9 +164,9 @@ ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX8-NEXT: %3:vgpr_32, dead %6:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec - ; GFX8-NEXT: %4:vgpr_32, dead %5:sreg_64_xexec = V_ADD_CO_U32_e64 %3, [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: S_ENDPGM 0, implicit %4 + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_ADD_CO_U32_e64_]], [[COPY2]], 0, implicit 
$exec + ; GFX8-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_2]] ; GFX9-LABEL: name: add_p5_vgpr_vgpr_vgpr ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -200,9 +200,9 @@ ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX8-NEXT: %3:vgpr_32, dead %6:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec - ; GFX8-NEXT: %4:vgpr_32, dead %5:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], %3, 0, implicit $exec - ; GFX8-NEXT: S_ENDPGM 0, implicit %4 + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[V_ADD_CO_U32_e64_]], 0, implicit $exec + ; GFX8-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_2]] ; GFX9-LABEL: name: add_p3_s32_vgpr_vgpr_vgpr ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -237,9 +237,9 @@ ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX8-NEXT: %3:vgpr_32, dead %6:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec - ; GFX8-NEXT: %4:vgpr_32, dead %5:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], %3, 0, implicit $exec - ; GFX8-NEXT: S_ENDPGM 0, implicit %4 + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[V_ADD_CO_U32_e64_]], 0, implicit $exec + ; GFX8-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_2]] ; GFX9-LABEL: name: add_p5_s32_vgpr_vgpr_vgpr ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 ; 
GFX9-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext.mir @@ -15,8 +15,9 @@ ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; GCN-NEXT: [[S_BFE_I32_:%[0-9]+]]:sreg_32 = S_BFE_I32 [[COPY]], 65536, implicit-def $scc - ; GCN-NEXT: [[S_BFE_U32_:%[0-9]+]]:sreg_32 = S_BFE_U32 [[S_BFE_I32_]], 1048576, implicit-def $scc - ; GCN-NEXT: $sgpr0 = COPY [[S_BFE_U32_]] + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_MOV_B32_]], [[S_BFE_I32_]], implicit-def $scc + ; GCN-NEXT: $sgpr0 = COPY [[S_AND_B32_]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s1) = G_TRUNC %0 %2:sgpr(s16) = G_SEXT %1 @@ -165,8 +166,9 @@ ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[COPY]], 0, 1, implicit $exec - ; GCN-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_BFE_I32_e64_]], 0, 16, implicit $exec - ; GCN-NEXT: $vgpr0 = COPY [[V_BFE_U32_e64_]] + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_BFE_I32_e64_]], implicit $exec + ; GCN-NEXT: $vgpr0 = COPY [[V_AND_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s1) = G_TRUNC %0 %2:vgpr(s16) = G_SEXT %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir @@ -13,7 +13,6 @@ # ERR-NOT: remark # ERR: remark: :0:0: cannot select: %4:sgpr(s16) = G_SHL %2:sgpr, %3:sgpr(s16) (in function: shl_s16_s16_ss) # ERR-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_SHL 
%2:vgpr, %1:vgpr(s32) (in function: shl_s16_s32_vv) -# ERR-NEXT: remark: :0:0: cannot select: %5:vgpr(s64) = G_ZEXT %4:vgpr(s16) (in function: shl_s16_vv_zext_to_s64) # ERR-NEXT: remark: :0:0: cannot select: %3:sgpr(s16) = G_SHL %2:sgpr, %1:sgpr(s32) (in function: shl_s16_s32_ss) # ERR-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_SHL %2:sgpr, %1:vgpr(s32) (in function: shl_s16_s32_sv) # ERR-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_SHL %2:vgpr, %1:sgpr(s32) (in function: shl_s16_s32_vs) @@ -238,16 +237,18 @@ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX10-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX10-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_LSHLREV_B16_e64_]], 0, 16, implicit $exec - ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]] + ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_LSHLREV_B16_e64_]], implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] ; GFX11-LABEL: name: shl_s16_s16_vv_zext_to_s32 ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: [[V_LSHLREV_B16_t16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_t16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX11-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_LSHLREV_B16_t16_e64_]], 0, 16, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]] + ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_LSHLREV_B16_t16_e64_]], implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -269,43 
+270,51 @@ ; GFX8-LABEL: name: shl_s16_vv_zext_to_s64 ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GFX8-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX8-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) - ; GFX8-NEXT: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16) - ; GFX8-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[SHL]](s16) - ; GFX8-NEXT: S_ENDPGM 0, implicit [[ZEXT]](s64) + ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX8-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GFX8-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHLREV_B16_e64_]], implicit $exec + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX8-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] ; GFX9-LABEL: name: shl_s16_vv_zext_to_s64 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) - ; GFX9-NEXT: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16) - ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[SHL]](s16) - ; GFX9-NEXT: S_ENDPGM 0, implicit [[ZEXT]](s64) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit 
$exec + ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX9-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GFX9-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHLREV_B16_e64_]], implicit $exec + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX9-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] ; GFX10-LABEL: name: shl_s16_vv_zext_to_s64 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) - ; GFX10-NEXT: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16) - ; GFX10-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[SHL]](s16) - ; GFX10-NEXT: S_ENDPGM 0, implicit [[ZEXT]](s64) + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHLREV_B16_e64_]], implicit $exec + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX10-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] ; GFX11-LABEL: name: shl_s16_vv_zext_to_s64 ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GFX11-NEXT: 
[[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) - ; GFX11-NEXT: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16) - ; GFX11-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[SHL]](s16) - ; GFX11-NEXT: S_ENDPGM 0, implicit [[ZEXT]](s64) + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_LSHLREV_B16_t16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_t16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHLREV_B16_t16_e64_]], implicit $exec + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX11-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir @@ -861,15 +861,25 @@ ; GFX6-LABEL: name: store_atomic_global_s32 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 - ; GFX6-NEXT: G_STORE [[COPY1]](s32), [[COPY]](p1) :: (store monotonic (s32), addrspace 1) + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store monotonic (s32), addrspace 1) ; GFX7-LABEL: name: store_atomic_global_s32 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s32), addrspace 1) + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX7-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store monotonic (s32), addrspace 1) ; GFX7-FLAT-LABEL: name: store_atomic_global_s32 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-FLAT-NEXT: {{ $}} @@ -914,15 +924,25 @@ ; GFX6-LABEL: name: store_atomic_global_s64 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 - ; GFX6-NEXT: G_STORE [[COPY1]](s64), [[COPY]](p1) :: (store monotonic (s64), addrspace 1) + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX6-NEXT: 
[[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store monotonic (s64), addrspace 1) ; GFX7-LABEL: name: store_atomic_global_s64 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s64), addrspace 1) + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX7-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store monotonic (s64), addrspace 1) ; GFX7-FLAT-LABEL: name: store_atomic_global_s64 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-FLAT-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sub.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sub.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sub.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sub.mir @@ -23,10 +23,10 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX6-NEXT: 
[[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 [[COPY]], [[COPY1]], implicit-def $scc - ; GFX6-NEXT: %7:vgpr_32, dead %12:sreg_64_xexec = V_SUB_CO_U32_e64 [[COPY2]], [[S_SUB_I32_]], 0, implicit $exec - ; GFX6-NEXT: %8:vgpr_32, dead %11:sreg_64_xexec = V_SUB_CO_U32_e64 [[S_SUB_I32_]], %7, 0, implicit $exec - ; GFX6-NEXT: %9:vgpr_32, dead %10:sreg_64_xexec = V_SUB_CO_U32_e64 %8, [[COPY2]], 0, implicit $exec - ; GFX6-NEXT: S_ENDPGM 0, implicit %9 + ; GFX6-NEXT: [[V_SUB_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_SUB_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_SUB_CO_U32_e64 [[COPY2]], [[S_SUB_I32_]], 0, implicit $exec + ; GFX6-NEXT: [[V_SUB_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_SUB_CO_U32_e64_3:%[0-9]+]]:sreg_64 = V_SUB_CO_U32_e64 [[S_SUB_I32_]], [[V_SUB_CO_U32_e64_]], 0, implicit $exec + ; GFX6-NEXT: [[V_SUB_CO_U32_e64_4:%[0-9]+]]:vgpr_32, dead [[V_SUB_CO_U32_e64_5:%[0-9]+]]:sreg_64 = V_SUB_CO_U32_e64 [[V_SUB_CO_U32_e64_2]], [[COPY2]], 0, implicit $exec + ; GFX6-NEXT: S_ENDPGM 0, implicit [[V_SUB_CO_U32_e64_4]] ; GFX9-LABEL: name: sub_s32 ; GFX9: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr3_vgpr4 ; GFX9-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-xor.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-xor.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-xor.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-xor.mir @@ -22,7 +22,7 @@ ; WAVE64-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; WAVE64-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec ; WAVE64-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec - ; WAVE64-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc + ; WAVE64-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], 
[[V_CMP_EQ_U32_e64_1]], implicit-def $scc ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_XOR_B64_]] ; WAVE32-LABEL: name: xor_s1_vcc_vcc_vcc ; WAVE32: liveins: $vgpr0, $vgpr1 @@ -32,7 +32,7 @@ ; WAVE32-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; WAVE32-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec ; WAVE32-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec - ; WAVE32-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc + ; WAVE32-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def $scc ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_XOR_B32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 @@ -402,7 +402,7 @@ ; WAVE64-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE64-NEXT: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec ; WAVE64-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_1]], implicit $exec - ; WAVE64-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc + ; WAVE64-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_XOR_B64_]] ; WAVE32-LABEL: name: xor_s1_vcc_copy_to_vcc ; WAVE32: liveins: $vgpr0, $vgpr1 @@ -413,7 +413,7 @@ ; WAVE32-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE32-NEXT: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec ; WAVE32-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_1]], 
implicit $exec - ; WAVE32-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc + ; WAVE32-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_XOR_B32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 @@ -447,7 +447,7 @@ ; WAVE64-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE64-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc ; WAVE64-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE64-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc + ; WAVE64-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_XOR_B64_]] ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY1]] ; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave32 @@ -459,7 +459,7 @@ ; WAVE32-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc ; WAVE32-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE32-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc + ; WAVE32-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_XOR_B32_]] ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY1]] %1:vgpr(s32) = COPY $vgpr0 @@ -495,7 +495,7 @@ ; WAVE64-NEXT: 
[[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE64-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc ; WAVE64-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE64-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc + ; WAVE64-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_XOR_B64_]] ; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave64 ; WAVE32: liveins: $vgpr0, $sgpr0 @@ -506,7 +506,7 @@ ; WAVE32-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc ; WAVE32-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE32-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc + ; WAVE32-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_XOR_B32_]] ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY1]] %1:vgpr(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zext.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zext.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zext.mir @@ -81,8 +81,9 @@ ; GCN: liveins: $sgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[S_BFE_U32_:%[0-9]+]]:sreg_32 = S_BFE_U32 [[COPY]], 1048576, implicit-def $scc - ; GCN-NEXT: $sgpr0 = COPY [[S_BFE_U32_]] + ; GCN-NEXT: 
[[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_MOV_B32_]], [[COPY]], implicit-def $scc + ; GCN-NEXT: $sgpr0 = COPY [[S_AND_B32_]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s16) = G_TRUNC %0 %2:sgpr(s32) = G_ZEXT %1 @@ -126,8 +127,8 @@ ; GCN-LABEL: name: zext_sgpr_s32_to_sgpr_s64 ; GCN: liveins: $sgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY $sgpr0 + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0 ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[REG_SEQUENCE]] %0:sgpr(s32) = COPY $sgpr0 @@ -208,8 +209,9 @@ ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[COPY]], 0, 16, implicit $exec - ; GCN-NEXT: $vgpr0 = COPY [[V_BFE_U32_e64_]] + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec + ; GCN-NEXT: $vgpr0 = COPY [[V_AND_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_ZEXT %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll @@ -10,13 +10,6 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 ; GFX906-NEXT: s_setpc_b64 s[30:31] -; -; GFX10PLUS-LABEL: v_fdot2: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 -; 
GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false) ret float %r } @@ -44,13 +37,6 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX906-NEXT: s_setpc_b64 s[30:31] -; -; GFX10PLUS-LABEL: v_fdot2_neg_a: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %a %r = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %b, float %c, i1 false) ret float %r @@ -62,13 +48,6 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX906-NEXT: s_setpc_b64 s[30:31] -; -; GFX10PLUS-LABEL: v_fdot2_neg_b: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %neg.b = fneg <2 x half> %b %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg.b, float %c, i1 false) ret float %r @@ -80,13 +59,6 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_dot2_f32_f16 v0, v1, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0] ; GFX906-NEXT: s_setpc_b64 s[30:31] -; -; GFX10PLUS-LABEL: v_fdot2_neg_a_neg_b: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v1, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0] -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %b %neg.b = fneg <2 x half> %b %r = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %neg.b, float %c, i1 false) @@ -100,14 +72,6 @@ ; GFX906-NEXT: 
v_xor_b32_e32 v2, 0x80000000, v2 ; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 ; GFX906-NEXT: s_setpc_b64 s[30:31] -; -; GFX10PLUS-LABEL: v_fdot2_neg_c: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10PLUS-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %neg.c = fneg float %c %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false) ret float %r @@ -119,13 +83,6 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_dot2_f32_f16 v0, 2.0, v0, v1 op_sel_hi:[0,1,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] -; -; GFX10PLUS-LABEL: v_fdot2_inline_literal_a: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, 2.0, v0, v1 op_sel_hi:[0,1,1] -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> , <2 x half> %b, float %c, i1 false) ret float %ret } @@ -136,13 +93,6 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_dot2_f32_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] -; -; GFX10PLUS-LABEL: v_fdot2_inline_literal_b: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1] -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> , float %c, i1 false) ret float %ret } @@ -153,13 +103,6 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, 1.0 ; GFX906-NEXT: s_setpc_b64 s[30:31] -; -; GFX10PLUS-LABEL: v_fdot2_inline_literal_c: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, 1.0 -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float 1.0, i1 false) ret float %ret } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll @@ -448,27 +448,28 @@ define amdgpu_ps float @atomic_add_i32_2d(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t) { ; GFX9-LABEL: atomic_add_i32_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, 0x5040100 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_perm_b32 v1, v2, v1, s8 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 unorm glc a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 @@ -485,27 +486,28 @@ define amdgpu_ps float @atomic_add_i32_3d(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t, i16 %r) { ; GFX9-LABEL: atomic_add_i32_3d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_and_b32_e32 
v1, 0xffff, v1 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, 0x5040100 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_perm_b32 v2, v2, v1, s8 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v1 ; GFX9-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 unorm glc a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_3d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_perm_b32 v2, v2, v1, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 @@ -522,27 +524,28 @@ define amdgpu_ps float @atomic_add_i32_cube(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t, i16 %face) { ; GFX9-LABEL: atomic_add_i32_cube: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, 0x5040100 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_perm_b32 v2, v2, v1, s8 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v1 ; GFX9-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 unorm glc a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_cube: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_perm_b32 v2, v2, v1, 
0x5040100 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 @@ -559,27 +562,28 @@ define amdgpu_ps float @atomic_add_i32_1darray(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %slice) { ; GFX9-LABEL: atomic_add_i32_1darray: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, 0x5040100 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_perm_b32 v1, v2, v1, s8 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 unorm glc a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_1darray: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 @@ -596,27 +600,28 @@ define amdgpu_ps float @atomic_add_i32_2darray(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t, i16 %slice) { ; GFX9-LABEL: atomic_add_i32_2darray: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, 0x5040100 ; GFX9-NEXT: 
s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_perm_b32 v2, v2, v1, s8 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v1 ; GFX9-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 unorm glc a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_2darray: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_perm_b32 v2, v2, v1, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 @@ -633,27 +638,28 @@ define amdgpu_ps float @atomic_add_i32_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t, i16 %fragid) { ; GFX9-LABEL: atomic_add_i32_2dmsaa: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, 0x5040100 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_perm_b32 v2, v2, v1, s8 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v1 ; GFX9-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 unorm glc a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_2dmsaa: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_perm_b32 v2, v2, v1, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; 
GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 @@ -670,28 +676,31 @@ define amdgpu_ps float @atomic_add_i32_2darraymsaa(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t, i16 %slice, i16 %fragid) { ; GFX9-LABEL: atomic_add_i32_2darraymsaa: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, 0x5040100 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_perm_b32 v1, v2, v1, s8 -; GFX9-NEXT: v_perm_b32 v2, v4, v3, s8 +; GFX9-NEXT: v_lshl_or_b32 v2, v4, 16, v2 ; GFX9-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_2darraymsaa: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 -; GFX10-NEXT: v_perm_b32 v2, v4, v3, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_lshl_or_b32 v2, v4, 16, v3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -1185,27 +1194,28 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2d(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t) { ; GFX9-LABEL: atomic_add_i64_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; 
GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, 0x5040100 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_perm_b32 v2, v3, v2, s8 +; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX9-NEXT: image_atomic_add v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 @@ -1222,27 +1232,28 @@ define amdgpu_ps <2 x float> @atomic_add_i64_3d(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t, i16 %r) { ; GFX9-LABEL: atomic_add_i64_3d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, 0x5040100 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_perm_b32 v3, v3, v2, s8 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX9-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 unorm glc a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_3d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_perm_b32 v3, v3, v2, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 
; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 @@ -1259,27 +1270,28 @@ define amdgpu_ps <2 x float> @atomic_add_i64_cube(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t, i16 %face) { ; GFX9-LABEL: atomic_add_i64_cube: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, 0x5040100 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_perm_b32 v3, v3, v2, s8 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX9-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 unorm glc a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_cube: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_perm_b32 v3, v3, v2, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 @@ -1296,27 +1308,28 @@ define amdgpu_ps <2 x float> @atomic_add_i64_1darray(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %slice) { ; GFX9-LABEL: atomic_add_i64_1darray: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, 0x5040100 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 
s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_perm_b32 v2, v3, v2, s8 +; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX9-NEXT: image_atomic_add v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_1darray: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 @@ -1333,27 +1346,28 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2darray(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t, i16 %slice) { ; GFX9-LABEL: atomic_add_i64_2darray: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, 0x5040100 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_perm_b32 v3, v3, v2, s8 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX9-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 unorm glc a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_2darray: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_perm_b32 v3, v3, v2, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: 
s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 @@ -1370,27 +1384,28 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2dmsaa(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t, i16 %fragid) { ; GFX9-LABEL: atomic_add_i64_2dmsaa: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, 0x5040100 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_perm_b32 v3, v3, v2, s8 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX9-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 unorm glc a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_2dmsaa: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_perm_b32 v3, v3, v2, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 @@ -1407,28 +1422,31 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2darraymsaa(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t, i16 %slice, i16 %fragid) { ; GFX9-LABEL: atomic_add_i64_2darraymsaa: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v4 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, 0x5040100 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 
s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_perm_b32 v2, v3, v2, s8 -; GFX9-NEXT: v_perm_b32 v3, v5, v4, s8 +; GFX9-NEXT: v_lshl_or_b32 v3, v5, 16, v3 ; GFX9-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_2darraymsaa: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 -; GFX10-NEXT: v_perm_b32 v3, v5, v4, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v3, v5, 16, v4 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll @@ -9,19 +9,19 @@ ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s12 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: 
s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -32,6 +32,7 @@ ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -43,7 +44,7 @@ ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10NSA-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -59,19 +60,19 @@ ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_perm_b32 v1, v1, v0, s12 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -82,6 +83,7 @@ ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 
s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -93,7 +95,7 @@ ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 +; GFX10NSA-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -109,19 +111,19 @@ ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_perm_b32 v1, v1, v0, s12 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -132,6 +134,7 @@ ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -143,7 +146,7 @@ ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 +; GFX10NSA-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[1:2], 
s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -159,19 +162,19 @@ ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_perm_b32 v1, v2, v1, s12 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -182,6 +185,7 @@ ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -193,7 +197,7 @@ ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 +; GFX10NSA-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -209,19 +213,19 @@ ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 
s10, s12 -; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_perm_b32 v1, v1, v0, s12 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -232,6 +236,7 @@ ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -243,7 +248,7 @@ ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 +; GFX10NSA-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -260,20 +265,20 @@ ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, 
s10 ; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: v_perm_b32 v2, v2, v4, s12 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_c_cl v[0:3], v[1:3], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -284,6 +289,7 @@ ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -295,7 +301,7 @@ ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 +; GFX10NSA-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -311,19 +317,19 @@ ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_perm_b32 v1, v2, v1, s12 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: 
s_waitcnt vmcnt(0) @@ -334,6 +340,7 @@ ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -345,7 +352,7 @@ ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 +; GFX10NSA-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -361,19 +368,19 @@ ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_perm_b32 v2, v3, v2, s12 +; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -384,6 +391,7 @@ ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -395,7 +403,7 @@ ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; 
GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10NSA-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -412,20 +420,20 @@ ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: v_perm_b32 v2, v2, v4, s12 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_b_cl v[0:3], v[1:3], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -436,6 +444,7 @@ ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -447,7 +456,7 @@ ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 +; GFX10NSA-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] 
dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -464,20 +473,20 @@ ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_mov_b32_e32 v3, v4 -; GFX9-NEXT: v_perm_b32 v2, v5, v2, s12 +; GFX9-NEXT: v_lshl_or_b32 v2, v5, 16, v2 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -488,6 +497,7 @@ ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -499,7 +509,7 @@ ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10NSA-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -512,31 +522,32 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %lod) { ; GFX9-LABEL: gather4_l_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; 
GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s12, 0x5040100 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_perm_b32 v1, v1, v0, s12 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX9-NEXT: image_gather4_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_l_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 +; GFX10NSA-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 ; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s6, s8 @@ -556,33 +567,34 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %lod) { ; GFX9-LABEL: gather4_c_l_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; 
GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: v_perm_b32 v2, v2, v4, s12 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v0 ; GFX9-NEXT: image_gather4_c_l v[0:3], v[1:3], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_c_l_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 +; GFX10NSA-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 ; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s6, s8 @@ -602,31 +614,32 @@ define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { ; GFX9-LABEL: gather4_lz_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s12, 0x5040100 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s12 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_lz_2d: ; GFX10NSA: ; 
%bb.0: ; %main_body -; GFX10NSA-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10NSA-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 ; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s6, s8 @@ -646,31 +659,32 @@ define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) { ; GFX9-LABEL: gather4_c_lz_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s12, 0x5040100 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_perm_b32 v1, v2, v1, s12 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_c_lz_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 +; GFX10NSA-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 ; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s6, s8 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll @@ -6,28 +6,31 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %slice, i16 %fragid) { ; GFX9-LABEL: load_2darraymsaa_v4f32_xyzw: ; GFX9: ; %bb.0: +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, 0x5040100 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s8 -; GFX9-NEXT: v_perm_b32 v1, v3, v2, s8 +; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: load_2darraymsaa_v4f32_xyzw: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX10PLUS-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10PLUS-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10PLUS-NEXT: s_mov_b32 s0, s2 ; GFX10PLUS-NEXT: s_mov_b32 s1, s3 ; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10PLUS-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; GFX10PLUS-NEXT: s_mov_b32 s3, s5 ; GFX10PLUS-NEXT: s_mov_b32 s4, s6 ; GFX10PLUS-NEXT: s_mov_b32 s5, s7 @@ -43,22 +46,23 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr 
addrspace(1) inreg %out, i16 %s, i16 %t, i16 %slice, i16 %fragid) { ; GFX9-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, 0x5040100 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_perm_b32 v10, v1, v0, s8 -; GFX9-NEXT: v_perm_b32 v11, v3, v2, s8 +; GFX9-NEXT: v_lshl_or_b32 v11, v3, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_mov_b32_e32 v8, v5 ; GFX9-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-NEXT: v_mov_b32_e32 v2, v7 @@ -73,14 +77,16 @@ ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_perm_b32 v10, v1, v0, 0x5040100 -; GFX10-NEXT: v_perm_b32 v11, v3, v2, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v8, v5 ; GFX10-NEXT: v_mov_b32_e32 v9, v5 +; GFX10-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v11, v3, 16, v2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 @@ -101,14 +107,16 @@ ; GFX11-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v5, 0 -; GFX11-NEXT: v_perm_b32 v10, v1, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v3, v2, 0x5040100 +; GFX11-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-NEXT: v_mov_b32_e32 v7, v5 ; GFX11-NEXT: v_mov_b32_e32 v8, v5 ; GFX11-NEXT: v_mov_b32_e32 v9, v5 +; GFX11-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v11, v3, 16, v2 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 @@ -135,22 +143,23 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i16 %s, i16 %t, i16 %slice, i16 %fragid) { ; GFX9-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, 0x5040100 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_perm_b32 v10, v1, v0, s8 -; GFX9-NEXT: v_perm_b32 v11, v3, v2, s8 +; GFX9-NEXT: v_lshl_or_b32 v11, v3, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_mov_b32_e32 v8, v5 ; GFX9-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-NEXT: v_mov_b32_e32 v2, v7 @@ -165,14 +174,16 @@ ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_perm_b32 v10, v1, v0, 0x5040100 -; GFX10-NEXT: v_perm_b32 v11, v3, v2, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: 
s_mov_b32 s1, s3 ; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v8, v5 ; GFX10-NEXT: v_mov_b32_e32 v9, v5 +; GFX10-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v11, v3, 16, v2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 @@ -193,14 +204,16 @@ ; GFX11-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v5, 0 -; GFX11-NEXT: v_perm_b32 v10, v1, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v3, v2, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-NEXT: v_mov_b32_e32 v7, v5 ; GFX11-NEXT: v_mov_b32_e32 v8, v5 ; GFX11-NEXT: v_mov_b32_e32 v9, v5 +; GFX11-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v11, v3, 16, v2 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll @@ -6,27 +6,28 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %r) { ; GFX9-LABEL: load_3d_v4f32_xyzw: ; GFX9: ; %bb.0: +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, 0x5040100 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_perm_b32 v1, v1, v0, s8 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 
16, v0 ; GFX9-NEXT: image_load v[0:3], v[1:2], s[0:7] dmask:0xf unorm a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: load_3d_v4f32_xyzw: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10PLUS-NEXT: s_mov_b32 s0, s2 ; GFX10PLUS-NEXT: s_mov_b32 s1, s3 ; GFX10PLUS-NEXT: s_mov_b32 s2, s4 ; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10PLUS-NEXT: s_mov_b32 s4, s6 ; GFX10PLUS-NEXT: s_mov_b32 s5, s7 ; GFX10PLUS-NEXT: s_mov_b32 s6, s8 @@ -41,22 +42,22 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i16 %s, i16 %t, i16 %r) { ; GFX9-LABEL: load_3d_v4f32_xyzw_tfe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, 0x5040100 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-NEXT: v_perm_b32 v5, v1, v0, s8 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v8, v7 ; GFX9-NEXT: v_mov_b32_e32 v9, v7 ; GFX9-NEXT: v_mov_b32_e32 v10, v7 ; GFX9-NEXT: v_mov_b32_e32 v11, v7 ; GFX9-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: v_mov_b32_e32 v1, v8 ; GFX9-NEXT: v_mov_b32_e32 v2, v9 @@ -71,14 +72,15 @@ ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v7, 0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, v2 -; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x5040100 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: v_mov_b32_e32 v8, v7 ; GFX10-NEXT: 
v_mov_b32_e32 v9, v7 ; GFX10-NEXT: v_mov_b32_e32 v10, v7 ; GFX10-NEXT: v_mov_b32_e32 v11, v7 +; GFX10-NEXT: v_lshl_or_b32 v5, v1, 16, v0 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 @@ -99,7 +101,7 @@ ; GFX11-LABEL: load_3d_v4f32_xyzw_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, 0 -; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 @@ -107,6 +109,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v11, v7 ; GFX11-NEXT: v_mov_b32_e32 v10, v7 ; GFX11-NEXT: v_mov_b32_e32 v8, v7 +; GFX11-NEXT: v_lshl_or_b32 v5, v1, 16, v0 ; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 @@ -130,22 +133,22 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i16 %s, i16 %t, i16 %r) { ; GFX9-LABEL: load_3d_v4f32_xyzw_tfe_lwe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, 0x5040100 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-NEXT: v_perm_b32 v5, v1, v0, s8 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v8, v7 ; GFX9-NEXT: v_mov_b32_e32 v9, v7 ; GFX9-NEXT: v_mov_b32_e32 v10, v7 ; GFX9-NEXT: v_mov_b32_e32 v11, v7 ; GFX9-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: v_mov_b32_e32 v1, v8 ; GFX9-NEXT: v_mov_b32_e32 v2, v9 @@ -160,14 +163,15 @@ ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe_lwe: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v7, 0 +; 
GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, v2 -; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x5040100 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: v_mov_b32_e32 v8, v7 ; GFX10-NEXT: v_mov_b32_e32 v9, v7 ; GFX10-NEXT: v_mov_b32_e32 v10, v7 ; GFX10-NEXT: v_mov_b32_e32 v11, v7 +; GFX10-NEXT: v_lshl_or_b32 v5, v1, 16, v0 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 @@ -188,7 +192,7 @@ ; GFX11-LABEL: load_3d_v4f32_xyzw_tfe_lwe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, 0 -; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 @@ -196,6 +200,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v11, v7 ; GFX11-NEXT: v_mov_b32_e32 v10, v7 ; GFX11-NEXT: v_mov_b32_e32 v8, v7 +; GFX11-NEXT: v_lshl_or_b32 v5, v1, 16, v0 ; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll @@ -15,8 +15,10 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: 
s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -39,8 +41,10 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 -; GFX10-NEXT: v_perm_b32 v2, v4, v3, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_lshl_or_b32 v2, v4, 16, v3 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -63,8 +67,10 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -88,10 +94,11 @@ ; GFX10-LABEL: sample_c_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_perm_b32 v3, v8, v1, 0x5040100 -; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v1 ; GFX10-NEXT: 
image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll @@ -22,16 +22,20 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: sample_d_2d: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; GFX11-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog @@ -44,17 +48,21 @@ ; GFX10-LABEL: sample_d_3d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_perm_b32 v2, v1, v0, 0x5040100 -; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff, 
v9 +; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v9 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: sample_d_3d: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v4, v3, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; GFX11-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v1, v5, v[6:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog @@ -83,16 +91,20 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 -; GFX10-NEXT: v_perm_b32 v2, v4, v3, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_lshl_or_b32 v2, v4, 16, v3 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: sample_c_d_2d: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v4, v3, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-NEXT: v_lshl_or_b32 v2, v4, 16, v3 ; GFX11-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; 
return to shader part epilog @@ -121,16 +133,20 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: sample_d_cl_2d: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; GFX11-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog @@ -160,18 +176,21 @@ ; GFX10-LABEL: sample_c_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_perm_b32 v3, v8, v1, 0x5040100 -; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v1 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: sample_c_d_cl_2d: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: 
v_perm_b32 v1, v2, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v4, v3, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-NEXT: v_lshl_or_b32 v2, v4, 16, v3 ; GFX11-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v5, v[6:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog @@ -185,19 +204,22 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; GFX10-NEXT: v_mov_b32_e32 v10, v3 -; GFX10-NEXT: v_mov_b32_e32 v11, v4 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_perm_b32 v4, v10, v9, 0x5040100 -; GFX10-NEXT: v_perm_b32 v5, v5, v11, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 +; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: sample_c_d_o_2darray_V1: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v5, v4, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX11-NEXT: v_lshl_or_b32 v3, v5, 16, v4 ; GFX11-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v3, v[6:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog @@ -211,19 +233,22 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; GFX10-NEXT: v_mov_b32_e32 v10, v3 -; GFX10-NEXT: v_mov_b32_e32 v11, v4 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_perm_b32 v4, v10, v9, 0x5040100 -; GFX10-NEXT: v_perm_b32 v5, v5, v11, 
0x5040100 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 +; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: sample_c_d_o_2darray_V2: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v5, v4, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX11-NEXT: v_lshl_or_b32 v3, v5, 16, v4 ; GFX11-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v3, v[6:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -64,10 +64,13 @@ ; ; GFX11-LABEL: image_bvh_intersect_ray_a16: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_perm_b32 v9, v5, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v5, v7, 0x7060302 -; GFX11-NEXT: v_perm_b32 v11, v6, v8, 0x5040100 -; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, v[2:4], v[9:11]], s[0:3] a16 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v7 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshl_or_b32 v8, v5, 16, v9 +; GFX11-NEXT: v_perm_b32 v9, v5, v7, 0x7060302 +; GFX11-NEXT: v_lshl_or_b32 v10, v6, 16, v10 +; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, v[2:4], v[8:10]], s[0:3] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog %v = 
call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) @@ -124,10 +127,13 @@ ; ; GFX11-LABEL: image_bvh64_intersect_ray_a16: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_perm_b32 v10, v6, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v6, v8, 0x7060302 -; GFX11-NEXT: v_perm_b32 v12, v7, v9, 0x5040100 -; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[0:1], v2, v[3:5], v[10:12]], s[0:3] a16 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v8 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshl_or_b32 v9, v6, 16, v10 +; GFX11-NEXT: v_perm_b32 v10, v6, v8, 0x7060302 +; GFX11-NEXT: v_lshl_or_b32 v11, v7, 16, v11 +; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[0:1], v2, v[3:5], v[9:11]], s[0:3] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) @@ -327,12 +333,14 @@ ; GFX11-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 +; GFX11-NEXT: v_dual_mov_b32 v15, v4 :: v_dual_and_b32 v0, 0xffff, v7 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v8 ; GFX11-NEXT: v_dual_mov_b32 v13, v2 :: v_dual_mov_b32 v14, v3 -; GFX11-NEXT: v_mov_b32_e32 v15, v4 -; GFX11-NEXT: v_perm_b32 v4, v5, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v5, v7, 0x7060302 -; GFX11-NEXT: v_perm_b32 v6, v6, v8, 0x5040100 ; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshl_or_b32 v4, v5, 16, v0 +; GFX11-NEXT: v_perm_b32 v5, v5, v7, 0x7060302 +; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v1 ; GFX11-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: 
v_readfirstlane_b32 s4, v9 ; GFX11-NEXT: v_readfirstlane_b32 s5, v10 @@ -558,11 +566,14 @@ ; GFX11-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v17, v0 :: v_dual_mov_b32 v18, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; GFX11-NEXT: v_dual_mov_b32 v19, v2 :: v_dual_mov_b32 v14, v3 ; GFX11-NEXT: v_dual_mov_b32 v15, v4 :: v_dual_mov_b32 v16, v5 -; GFX11-NEXT: v_perm_b32 v4, v6, v8, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v4, v6, 16, v0 ; GFX11-NEXT: v_perm_b32 v5, v6, v8, 0x7060302 -; GFX11-NEXT: v_perm_b32 v6, v7, v9, 0x5040100 +; GFX11-NEXT: v_lshl_or_b32 v6, v7, 16, v1 ; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v10 @@ -713,41 +724,23 @@ ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; GFX1030-NEXT: s_movk_i32 s9, 0x4600 -; GFX1030-NEXT: s_movk_i32 s8, 0x4700 -; GFX1030-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200 +; GFX1030-NEXT: v_mov_b32_e32 v6, 0x46004500 +; GFX1030-NEXT: v_mov_b32_e32 v7, 0x48004700 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s0 ; GFX1030-NEXT: v_mov_b32_e32 v1, s1 ; GFX1030-NEXT: v_mov_b32_e32 v2, s2 ; GFX1030-NEXT: v_mov_b32_e32 v3, s3 -; GFX1030-NEXT: s_movk_i32 s1, 0x4400 ; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX1030-NEXT: s_movk_i32 s2, 0x4200 +; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1030-NEXT: flat_load_dword v0, v[0:1] ; GFX1030-NEXT: flat_load_dword v1, v[2:3] -; GFX1030-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX1030-NEXT: s_movk_i32 s3, 0x4800 -; GFX1030-NEXT: s_bfe_u32 s2, s2, 0x100000 -; 
GFX1030-NEXT: s_lshl_b32 s1, s1, 16 -; GFX1030-NEXT: s_movk_i32 s0, 0x4500 -; GFX1030-NEXT: s_or_b32 s1, s2, s1 -; GFX1030-NEXT: s_bfe_u32 s2, s9, 0x100000 -; GFX1030-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX1030-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX1030-NEXT: s_lshl_b32 s2, s2, 16 -; GFX1030-NEXT: s_lshl_b32 s3, s3, 16 -; GFX1030-NEXT: s_or_b32 s0, s0, s2 -; GFX1030-NEXT: s_or_b32 s2, s8, s3 ; GFX1030-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0 -; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 -; GFX1030-NEXT: v_mov_b32_e32 v5, s1 -; GFX1030-NEXT: v_mov_b32_e32 v6, s0 -; GFX1030-NEXT: v_mov_b32_e32 v7, s2 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16 ; GFX1030-NEXT: s_waitcnt vmcnt(0) @@ -758,41 +751,23 @@ ; GFX1013: ; %bb.0: ; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; GFX1013-NEXT: s_movk_i32 s9, 0x4600 -; GFX1013-NEXT: s_movk_i32 s8, 0x4700 -; GFX1013-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) ; GFX1013-NEXT: v_mov_b32_e32 v0, s0 ; GFX1013-NEXT: v_mov_b32_e32 v1, s1 ; GFX1013-NEXT: v_mov_b32_e32 v2, s2 ; GFX1013-NEXT: v_mov_b32_e32 v3, s3 -; GFX1013-NEXT: s_movk_i32 s1, 0x4400 ; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX1013-NEXT: s_movk_i32 s2, 0x4200 +; GFX1013-NEXT: v_mov_b32_e32 v6, 0x46004500 ; GFX1013-NEXT: flat_load_dword v0, v[4:5] ; GFX1013-NEXT: flat_load_dword v1, v[2:3] -; GFX1013-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX1013-NEXT: s_movk_i32 s3, 0x4800 -; GFX1013-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX1013-NEXT: s_lshl_b32 s1, s1, 16 -; GFX1013-NEXT: s_movk_i32 s0, 0x4500 -; GFX1013-NEXT: s_or_b32 s1, s2, s1 -; GFX1013-NEXT: s_bfe_u32 s2, s9, 0x100000 -; GFX1013-NEXT: 
s_bfe_u32 s3, s3, 0x100000 -; GFX1013-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX1013-NEXT: s_lshl_b32 s2, s2, 16 -; GFX1013-NEXT: s_lshl_b32 s3, s3, 16 -; GFX1013-NEXT: s_or_b32 s0, s0, s2 -; GFX1013-NEXT: s_or_b32 s2, s8, s3 ; GFX1013-NEXT: v_mov_b32_e32 v2, 0 ; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 -; GFX1013-NEXT: v_mov_b32_e32 v5, s1 -; GFX1013-NEXT: v_mov_b32_e32 v6, s0 -; GFX1013-NEXT: v_mov_b32_e32 v7, s2 +; GFX1013-NEXT: v_mov_b32_e32 v5, 0x44004200 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) @@ -967,38 +942,20 @@ ; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX1030-NEXT: s_movk_i32 s6, 0x4200 -; GFX1030-NEXT: s_movk_i32 s7, 0x4800 -; GFX1030-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX1030-NEXT: s_movk_i32 s9, 0x4600 -; GFX1030-NEXT: s_movk_i32 s8, 0x4700 -; GFX1030-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX1030-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX1030-NEXT: s_lshl_b32 s7, s7, 16 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX1030-NEXT: v_mov_b32_e32 v6, 0x44004200 +; GFX1030-NEXT: v_mov_b32_e32 v7, 0x46004500 +; GFX1030-NEXT: v_mov_b32_e32 v8, 0x48004700 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; GFX1030-NEXT: s_movk_i32 s5, 0x4400 -; GFX1030-NEXT: s_movk_i32 s4, 0x4500 -; GFX1030-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX1030-NEXT: s_lshl_b32 s5, s5, 16 -; GFX1030-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX1030-NEXT: s_or_b32 s5, s6, s5 ; GFX1030-NEXT: flat_load_dword v2, v[0:1] -; GFX1030-NEXT: s_bfe_u32 s6, s9, 0x100000 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6 -; 
GFX1030-NEXT: s_lshl_b32 s6, s6, 16 ; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102 -; GFX1030-NEXT: s_or_b32 s4, s4, s6 -; GFX1030-NEXT: s_or_b32 s6, s8, s7 -; GFX1030-NEXT: v_mov_b32_e32 v6, s5 -; GFX1030-NEXT: v_mov_b32_e32 v7, s4 -; GFX1030-NEXT: v_mov_b32_e32 v8, s6 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 ; GFX1030-NEXT: s_waitcnt vmcnt(0) @@ -1011,38 +968,20 @@ ; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX1013-NEXT: s_movk_i32 s1, 0x4400 -; GFX1013-NEXT: s_movk_i32 s9, 0x4600 -; GFX1013-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX1013-NEXT: s_movk_i32 s0, 0x4500 -; GFX1013-NEXT: s_lshl_b32 s1, s1, 16 -; GFX1013-NEXT: s_movk_i32 s8, 0x4700 -; GFX1013-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX1013-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX1013-NEXT: v_mov_b32_e32 v6, 0x44004200 +; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500 +; GFX1013-NEXT: v_mov_b32_e32 v8, 0x48004700 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) ; GFX1013-NEXT: v_mov_b32_e32 v0, s2 ; GFX1013-NEXT: v_mov_b32_e32 v1, s3 -; GFX1013-NEXT: s_movk_i32 s2, 0x4200 -; GFX1013-NEXT: s_movk_i32 s3, 0x4800 -; GFX1013-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX1013-NEXT: s_or_b32 s1, s2, s1 -; GFX1013-NEXT: s_bfe_u32 s2, s9, 0x100000 -; GFX1013-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX1013-NEXT: flat_load_dword v2, v[0:1] -; GFX1013-NEXT: s_lshl_b32 s2, s2, 16 -; GFX1013-NEXT: s_lshl_b32 s3, s3, 16 -; GFX1013-NEXT: s_or_b32 s0, s0, s2 -; GFX1013-NEXT: s_or_b32 s2, s8, s3 ; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6 ; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102 -; GFX1013-NEXT: v_mov_b32_e32 v6, s1 -; GFX1013-NEXT: v_mov_b32_e32 v7, s0 -; 
GFX1013-NEXT: v_mov_b32_e32 v8, s2 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll @@ -124,10 +124,10 @@ ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: @@ -183,10 +183,10 @@ ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; 
CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll @@ -83,10 +83,10 @@ ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; 
CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: @@ -146,10 +146,10 @@ ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll @@ -166,10 +166,10 @@ ; GFX908-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec - ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: @@ -217,10 +217,10 @@ ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec - ; GFX90A-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = 
S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX90A-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX90A-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3: @@ -272,10 +272,10 @@ ; GFX908-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec - ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec - ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; 
GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: @@ -321,10 +321,10 @@ ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec - ; GFX90A-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX90A-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec - ; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX90A-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll @@ -174,10 +174,10 @@ ; PACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec - ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec - ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.3: @@ -224,10 +224,10 @@ ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec - ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec - ; UNPACKED-NEXT: 
[[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll @@ -124,10 +124,10 @@ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, 
implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll @@ -73,7 +73,7 @@ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: @@ -125,10 +125,10 @@ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec - ; CHECK-NEXT: 
[[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: @@ -523,7 +523,7 @@ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: @@ -575,7 +575,7 @@ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; 
CHECK-NEXT: bb.3: @@ -709,7 +709,7 @@ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -854,7 +854,7 @@ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: @@ -893,7 +893,7 @@ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, 
implicit $exec + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -910,7 +910,7 @@ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll @@ -183,7 +183,7 @@ ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec - ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], 
[[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.3: @@ -231,7 +231,7 @@ ; PACKED-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec - ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.3: @@ -417,7 +417,7 @@ ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; UNPACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec @@ -438,7 +438,7 @@ ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; PACKED-NEXT: 
[[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; PACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; PACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 7) ; PACKED-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 @@ -465,7 +465,7 @@ ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec + ; UNPACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY4]], implicit $exec @@ -488,7 +488,7 @@ ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec - ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], 
[[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.3: @@ -522,7 +522,7 @@ ; PACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; PACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; PACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec + ; PACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: @@ -539,7 +539,7 @@ ; PACKED-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec - ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll @@ -139,7 +139,7 @@ ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: @@ -263,7 +263,7 @@ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec ; CHECK-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 7) ; CHECK-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 @@ -293,7 +293,7 @@ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY8]], 
[[COPY10]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY8]], [[COPY10]], 0, implicit $exec ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -310,7 +310,7 @@ ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll @@ -75,7 +75,7 @@ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: @@ -170,10 +170,10 @@ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: @@ -524,7 +524,7 @@ ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 
[[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: @@ -638,7 +638,7 @@ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 7) ; CHECK-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 @@ -739,7 +739,7 @@ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 7) ; CHECK-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 @@ -764,7 +764,7 @@ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY 
[[S_MOV_B32_]] - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -781,7 +781,7 @@ ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: @@ -835,7 +835,7 @@ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; 
CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll @@ -172,10 +172,10 @@ ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec - ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec - ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.3: @@ -222,10 +222,10 @@ ; PACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec 
- ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec - ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll @@ -123,10 +123,10 @@ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec - ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; CHECK-NEXT: 
[[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec - ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll @@ -151,7 +151,7 @@ ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec - ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.3: @@ -197,7 +197,7 @@ ; PACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec - ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.3: @@ -249,10 +249,10 @@ ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec - ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.3: @@ -298,10 +298,10 @@ ; PACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec - ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.3: @@ -354,10 +354,10 @@ ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec - ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; UNPACKED-NEXT: 
[[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.3: @@ -404,10 +404,10 @@ ; PACKED-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec - ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll @@ -66,7 +66,7 @@ ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec - ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.3: @@ -112,7 +112,7 @@ ; PACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec - ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.3: @@ -164,10 +164,10 @@ ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], 
[[COPY8]], implicit $exec - ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.3: @@ -213,10 +213,10 @@ ; PACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec - ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; PACKED-NEXT: 
[[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.3: @@ -269,10 +269,10 @@ ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec - ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.3: @@ -319,10 +319,10 @@ ; PACKED-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec - ; PACKED-NEXT: 
[[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll @@ -143,7 +143,7 @@ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec - ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, 
implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: @@ -195,10 +195,10 @@ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec - ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: @@ -251,10 +251,10 @@ ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec - ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: @@ -613,7 +613,7 @@ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec - ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: @@ -669,7 +669,7 @@ ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec - ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: 
[[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll @@ -2713,7 +2713,7 @@ ; GFX6-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX6-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec ; GFX6-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec - ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.3: @@ -2760,7 +2760,7 @@ ; GFX7-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX7-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec ; GFX7-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec - ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX7-NEXT: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.3: @@ -2807,7 +2807,7 @@ ; GFX8-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec - ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.3: @@ -2858,7 +2858,7 @@ ; GFX6-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX6-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec ; GFX6-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec - ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.3: @@ -2903,7 +2903,7 @@ ; GFX7-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX7-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec ; GFX7-NEXT: 
[[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec - ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.3: @@ -2948,7 +2948,7 @@ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec - ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.3: @@ -3004,7 +3004,7 @@ ; GFX6-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX6-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec ; GFX6-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec - ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = 
S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.3: @@ -3053,7 +3053,7 @@ ; GFX7-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX7-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec ; GFX7-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec - ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.3: @@ -3102,7 +3102,7 @@ ; GFX8-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec - ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.3: @@ -3154,7 +3154,7 @@ ; GFX6-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX6-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY6]], [[COPY4]], implicit $exec ; GFX6-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], 
[[COPY5]], implicit $exec - ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.3: @@ -3199,7 +3199,7 @@ ; GFX7-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX7-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY6]], [[COPY4]], implicit $exec ; GFX7-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec - ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.3: @@ -3244,7 +3244,7 @@ ; GFX8-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY6]], [[COPY4]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec - ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, 
implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.3: @@ -3297,7 +3297,7 @@ ; GFX6-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX6-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec ; GFX6-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec - ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.3: @@ -3344,7 +3344,7 @@ ; GFX7-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX7-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec ; GFX7-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec - ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.3: @@ -3389,7 +3389,7 @@ ; GFX8-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY6]], [[COPY4]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec - ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = 
S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.3: @@ -3441,7 +3441,7 @@ ; GFX6-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX6-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec ; GFX6-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec - ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.3: @@ -3503,7 +3503,7 @@ ; GFX7-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX7-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec ; GFX7-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec - ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.3: @@ -3565,7 +3565,7 @@ ; 
GFX8-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec - ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.3: @@ -3639,7 +3639,7 @@ ; GFX6-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX6-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec ; GFX6-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec - ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.3: @@ -3705,7 +3705,7 @@ ; GFX7-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX7-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec ; GFX7-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec - ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + 
; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.3: @@ -3771,7 +3771,7 @@ ; GFX8-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec - ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.3: @@ -3843,7 +3843,7 @@ ; GFX6-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX6-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec ; GFX6-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec - ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.3: @@ -3909,7 +3909,7 @@ ; GFX7-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX7-NEXT: 
[[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec ; GFX7-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec - ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.3: @@ -3975,7 +3975,7 @@ ; GFX8-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec - ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.3: @@ -4044,7 +4044,7 @@ ; GFX6-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX6-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec ; GFX6-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec - ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], 
[[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.3: @@ -4107,7 +4107,7 @@ ; GFX7-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX7-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec ; GFX7-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec - ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.3: @@ -4170,7 +4170,7 @@ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec - ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.3: @@ -4239,7 +4239,7 @@ ; GFX6-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX6-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], 
implicit $exec ; GFX6-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec - ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.3: @@ -4302,7 +4302,7 @@ ; GFX7-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX7-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec ; GFX7-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec - ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.3: @@ -4365,7 +4365,7 @@ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec - ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX8-NEXT: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.3: @@ -4434,7 +4434,7 @@ ; GFX6-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX6-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec ; GFX6-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec - ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.3: @@ -4497,7 +4497,7 @@ ; GFX7-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX7-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec ; GFX7-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec - ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.3: @@ -4560,7 +4560,7 @@ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec ; GFX8-NEXT: 
[[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec - ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.3: @@ -4628,7 +4628,7 @@ ; GFX6-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX6-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY6]], [[COPY4]], implicit $exec ; GFX6-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec - ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.3: @@ -4690,7 +4690,7 @@ ; GFX7-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX7-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY6]], [[COPY4]], implicit $exec ; GFX7-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec - ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = 
S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.3: @@ -4752,7 +4752,7 @@ ; GFX8-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY6]], [[COPY4]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec - ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.3: @@ -4900,7 +4900,7 @@ ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec + ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] @@ -4917,7 +4917,7 @@ ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead 
[[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec + ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] @@ -4934,7 +4934,7 @@ ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] @@ -4958,7 +4958,7 @@ ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec + ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX6-NEXT: 
[[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] @@ -4975,7 +4975,7 @@ ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec + ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] @@ -4992,7 +4992,7 @@ ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] @@ -5073,7 +5073,7 @@ ; GFX6-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX6-NEXT: 
[[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec + ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -5090,7 +5090,7 @@ ; GFX7-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec + ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -5107,7 +5107,7 @@ ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead 
[[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll @@ -14,7 +14,8 @@ ; GFX908-LABEL: v_sdot2: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX908-NEXT: v_dot2c_i32_i16_e32 v2, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2: @@ -60,9 +61,9 @@ ; ; GFX908-LABEL: v_sdot2_sgpr_sgpr_sgpr: ; GFX908: ; %bb.0: -; GFX908-NEXT: v_mov_b32_e32 v0, s1 -; GFX908-NEXT: v_mov_b32_e32 v1, s2 -; GFX908-NEXT: v_dot2_i32_i16 v0, s0, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v1, s1 +; GFX908-NEXT: v_mov_b32_e32 v0, s2 +; GFX908-NEXT: v_dot2c_i32_i16_e32 v0, s0, v1 ; GFX908-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_sdot2_sgpr_sgpr_sgpr: @@ -85,7 +86,8 @@ ; GFX908-LABEL: v_sdot2_inline_literal_a: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_dot2_i32_i16 v0, 4, v0, v1 op_sel_hi:[0,1,1] +; GFX908-NEXT: v_dot2c_i32_i16_e32 v1, 0x40004, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_inline_literal_a: @@ -108,7 +110,8 @@ ; GFX908-LABEL: v_sdot2_inline_literal_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_dot2_i32_i16 v0, v0, 4, v1 op_sel_hi:[1,0,1] +; GFX908-NEXT: v_dot2c_i32_i16_e32 v1, 0x40004, v0 +; 
GFX908-NEXT: v_mov_b32_e32 v0, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_inline_literal_b: @@ -131,7 +134,9 @@ ; GFX908-LABEL: v_sdot2_inline_literal_a_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_dot2_i32_i16 v0, 8, 4, v1 op_sel_hi:[0,0,1] +; GFX908-NEXT: v_mov_b32_e32 v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v1, 0x40004 +; GFX908-NEXT: v_dot2c_i32_i16_e32 v0, 0x80008, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_inline_literal_a_b: @@ -154,7 +159,9 @@ ; GFX908-LABEL: v_sdot2_inline_literal_a_b_c: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_dot2_i32_i16 v0, 8, 4, 8 op_sel_hi:[0,0,1] +; GFX908-NEXT: v_mov_b32_e32 v1, 0x40004 +; GFX908-NEXT: v_mov_b32_e32 v0, 8 +; GFX908-NEXT: v_dot2c_i32_i16_e32 v0, 0x80008, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_inline_literal_a_b_c: @@ -177,7 +184,9 @@ ; GFX908-LABEL: v_sdot2_inline_literal_c: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, 7 +; GFX908-NEXT: v_mov_b32_e32 v2, 7 +; GFX908-NEXT: v_dot2c_i32_i16_e32 v2, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_inline_literal_c: @@ -200,7 +209,9 @@ ; GFX908-LABEL: v_sdot2_fneg_a: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX908-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX908-NEXT: v_dot2c_i32_i16_e32 v2, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_fneg_a: @@ -225,7 +236,9 @@ ; GFX908-LABEL: v_sdot2_fneg_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX908-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX908-NEXT: 
v_dot2c_i32_i16_e32 v2, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_fneg_b: @@ -252,7 +265,8 @@ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX908-NEXT: v_dot2c_i32_i16_e32 v2, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_fnegf32_c: @@ -280,7 +294,8 @@ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX908-NEXT: v_dot2c_i32_i16_e32 v2, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_fnegv2f16_c: @@ -308,7 +323,8 @@ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX908-NEXT: v_dot2c_i32_i16_e32 v2, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_shuffle10_a: @@ -335,7 +351,8 @@ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_alignbit_b32 v1, v1, v1, 16 -; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX908-NEXT: v_dot2c_i32_i16_e32 v2, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_shuffle10_b: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll @@ -14,7 +14,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 +; GFX10-NEXT: v_dot4c_i32_i8_e32 v2, v0, v1 +; 
GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c, i1 false) ret i32 %r @@ -80,7 +81,8 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX10-NEXT: v_or3_b32 v1, v3, v4, v5 -; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v8 +; GFX10-NEXT: v_dot4c_i32_i8_e32 v8, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] %a.cast = bitcast <4 x i8> %a to i32 %b.cast = bitcast <4 x i8> %b to i32 @@ -101,7 +103,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 +; GFX10-NEXT: v_dot4c_i32_i8_e32 v2, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg float %a %cast.neg.a = bitcast float %neg.a to i32 @@ -122,7 +125,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 +; GFX10-NEXT: v_dot4c_i32_i8_e32 v2, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %a %cast.neg.a = bitcast <2 x half> %neg.a to i32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll @@ -134,10 +134,10 @@ ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec - ; CHECK-NEXT: 
[[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: @@ -196,10 +196,10 @@ ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], 
implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll @@ -90,10 +90,10 @@ ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY8]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: @@ -156,10 +156,10 @@ ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec 
= V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY8]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll @@ -180,10 +180,10 @@ ; GFX908-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec - ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 
[[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec - ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: @@ -234,10 +234,10 @@ ; GFX90A-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec - ; GFX90A-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX90A-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec - ; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX90A-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, 
implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3: @@ -292,10 +292,10 @@ ; GFX908-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec - ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: @@ -343,10 +343,10 @@ ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec - ; GFX90A-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX90A-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX90A-NEXT: 
[[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX90A-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll @@ -187,10 +187,10 @@ ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec - ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def 
dead $scc + ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.3: @@ -261,10 +261,10 @@ ; PACKED-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec - ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll @@ -133,10 +133,10 @@ ; 
CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll @@ -204,10 +204,10 @@ ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], 
[[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll @@ -166,10 +166,10 @@ ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec - ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec - ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.3: @@ -220,10 +220,10 @@ ; PACKED-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec - ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec - ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll @@ -128,10 +128,10 @@ ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll @@ -133,10 +133,10 @@ ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 ; CHECK-NEXT: 
[[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll @@ -227,10 +227,10 @@ ; PACKED-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec - ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; PACKED-NEXT: 
[[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.3: @@ -283,10 +283,10 @@ ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec - ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 
killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll @@ -156,10 +156,10 @@ ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll @@ 
-662,13 +662,6 @@ ; SI-NEXT: s_mov_b64 s[0:1], exec ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 -; SI-NEXT: s_movk_i32 s2, 0x3c00 -; SI-NEXT: s_bfe_u32 s3, 0, 0x100000 -; SI-NEXT: s_bfe_u32 s2, s2, 0x100000 -; SI-NEXT: s_lshl_b32 s4, s3, 16 -; SI-NEXT: s_or_b32 s4, s2, s4 -; SI-NEXT: s_lshl_b32 s2, s2, 16 -; SI-NEXT: s_or_b32 s5, s3, s2 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc ; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -677,8 +670,8 @@ ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 .LBB6_7 ; SI-NEXT: ; %bb.2: ; %.demote0 -; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] -; SI-NEXT: s_and_b64 exec, exec, s[6:7] +; SI-NEXT: s_wqm_b64 s[4:5], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[4:5] ; SI-NEXT: .LBB6_3: ; %.continue0 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_mov_b64 s[2:3], s[0:1] @@ -693,8 +686,8 @@ ; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 ; SI-NEXT: s_and_b64 s[2:3], s[0:1], vcc ; SI-NEXT: s_xor_b64 s[2:3], s[2:3], -1 -; SI-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] -; SI-NEXT: s_xor_b64 s[2:3], exec, s[6:7] +; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB6_6 ; SI-NEXT: ; %bb.4: ; %.demote1 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec @@ -703,8 +696,8 @@ ; SI-NEXT: s_mov_b64 exec, 0 ; SI-NEXT: .LBB6_6: ; %.continue1 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v0, 0x3c00 +; SI-NEXT: v_bfrev_b32_e32 v1, 60 ; SI-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB6_7: @@ -892,13 +885,6 @@ ; SI-NEXT: s_mov_b64 s[0:1], exec ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 -; SI-NEXT: s_movk_i32 s2, 0x3c00 -; SI-NEXT: s_bfe_u32 s3, 0, 0x100000 -; SI-NEXT: s_bfe_u32 s2, s2, 0x100000 -; SI-NEXT: s_lshl_b32 s4, s3, 16 -; SI-NEXT: s_or_b32 s6, s2, s4 -; SI-NEXT: s_lshl_b32 s2, s2, 16 -; 
SI-NEXT: s_or_b32 s7, s3, s2 ; SI-NEXT: s_mov_b32 s4, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc @@ -908,8 +894,8 @@ ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 .LBB7_9 ; SI-NEXT: ; %bb.2: ; %.demote0 -; SI-NEXT: s_wqm_b64 s[8:9], s[0:1] -; SI-NEXT: s_and_b64 exec, exec, s[8:9] +; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[6:7] ; SI-NEXT: .LBB7_3: ; %.continue0.preheader ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_mov_b64 s[2:3], 0 @@ -936,8 +922,8 @@ ; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], s[0:1], vcc ; SI-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; SI-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; SI-NEXT: s_xor_b64 s[4:5], exec, s[8:9] +; SI-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SI-NEXT: s_xor_b64 s[4:5], exec, s[6:7] ; SI-NEXT: s_cbranch_execz .LBB7_4 ; SI-NEXT: ; %bb.6: ; %.demote1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -945,14 +931,14 @@ ; SI-NEXT: s_cbranch_scc0 .LBB7_9 ; SI-NEXT: ; %bb.7: ; %.demote1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; SI-NEXT: s_wqm_b64 s[8:9], s[0:1] -; SI-NEXT: s_and_b64 exec, exec, s[8:9] +; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[6:7] ; SI-NEXT: s_branch .LBB7_4 ; SI-NEXT: .LBB7_8: ; %.return ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_and_b64 exec, exec, s[0:1] -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v0, 0x3c00 +; SI-NEXT: v_bfrev_b32_e32 v1, 60 ; SI-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB7_9: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -125,7 +125,7 @@ ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10PLUS-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10PLUS-NEXT: 
v_bfe_u32 v0, v0, 0, 16 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = mul i16 %num, %den ret i16 %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll @@ -99,7 +99,7 @@ ; GCN-LABEL: v_orn2_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_xor_b32_e32 v1, -1, v1 +; GCN-NEXT: v_not_b32_e32 v1, v1 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -107,7 +107,7 @@ ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10PLUS-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10PLUS-NEXT: v_not_b32_e32 v1, v1 ; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor i32 %src1, -1 @@ -118,13 +118,13 @@ define amdgpu_ps float @v_orn2_i32_sv(i32 inreg %src0, i32 %src1) { ; GCN-LABEL: v_orn2_i32_sv: ; GCN: ; %bb.0: -; GCN-NEXT: v_xor_b32_e32 v0, -1, v0 +; GCN-NEXT: v_not_b32_e32 v0, v0 ; GCN-NEXT: v_or_b32_e32 v0, s2, v0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: v_orn2_i32_sv: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX10PLUS-NEXT: v_not_b32_e32 v0, v0 ; GFX10PLUS-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor i32 %src1, -1 @@ -248,8 +248,8 @@ ; GCN-LABEL: v_orn2_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_xor_b32_e32 v2, -1, v2 -; GCN-NEXT: v_xor_b32_e32 v3, -1, v3 +; GCN-NEXT: v_not_b32_e32 v2, v2 +; GCN-NEXT: v_not_b32_e32 v3, v3 ; GCN-NEXT: v_or_b32_e32 v0, v0, v2 ; GCN-NEXT: v_or_b32_e32 v1, v1, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -258,8 +258,8 @@ ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10PLUS-NEXT: s_waitcnt_vscnt 
null, 0x0 -; GFX10PLUS-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX10PLUS-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX10PLUS-NEXT: v_not_b32_e32 v2, v2 +; GFX10PLUS-NEXT: v_not_b32_e32 v3, v3 ; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10PLUS-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] @@ -271,16 +271,16 @@ define amdgpu_ps <2 x float> @v_orn2_i64_sv(i64 inreg %src0, i64 %src1) { ; GCN-LABEL: v_orn2_i64_sv: ; GCN: ; %bb.0: -; GCN-NEXT: v_xor_b32_e32 v0, -1, v0 -; GCN-NEXT: v_xor_b32_e32 v1, -1, v1 +; GCN-NEXT: v_not_b32_e32 v0, v0 +; GCN-NEXT: v_not_b32_e32 v1, v1 ; GCN-NEXT: v_or_b32_e32 v0, s2, v0 ; GCN-NEXT: v_or_b32_e32 v1, s3, v1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: v_orn2_i64_sv: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX10PLUS-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10PLUS-NEXT: v_not_b32_e32 v0, v0 +; GFX10PLUS-NEXT: v_not_b32_e32 v1, v1 ; GFX10PLUS-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX10PLUS-NEXT: v_or_b32_e32 v1, s3, v1 ; GFX10PLUS-NEXT: ; return to shader part epilog @@ -466,14 +466,14 @@ ; GCN: ; %bb.0: ; GCN-NEXT: v_xor_b32_e32 v0, -1, v0 ; GCN-NEXT: v_or_b32_e32 v0, s2, v0 -; GCN-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: v_orn2_i16_sv: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX10PLUS-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX10PLUS-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %or = or i16 %src0, %not.src1 @@ -487,14 +487,14 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_xor_b32 s0, s2, -1 ; GCN-NEXT: v_or_b32_e32 v0, s0, v0 -; GCN-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: v_orn2_i16_vs: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_xor_b32 s0, s2, -1 ; GFX10PLUS-NEXT: v_or_b32_e32 v0, s0, 
v0 -; GFX10PLUS-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %or = or i16 %src0, %not.src1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -75,31 +75,29 @@ ; ; GFX8-LABEL: s_saddsat_i7: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s2, 9, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_sext_i32_i16 s3, s0 -; GFX8-NEXT: s_sext_i32_i16 s4, 0 -; GFX8-NEXT: s_max_i32 s5, s3, s4 -; GFX8-NEXT: s_min_i32 s3, s3, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, s2 -; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 -; GFX8-NEXT: s_sext_i32_i16 s3, s3 +; GFX8-NEXT: s_lshl_b32 s0, s0, 9 +; GFX8-NEXT: s_sext_i32_i16 s2, s0 +; GFX8-NEXT: s_sext_i32_i16 s3, 0 +; GFX8-NEXT: s_max_i32 s4, s2, s3 +; GFX8-NEXT: s_min_i32 s2, s2, s3 +; GFX8-NEXT: s_lshl_b32 s1, s1, 9 +; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 +; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 -; GFX8-NEXT: s_max_i32 s1, s3, s1 +; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 +; GFX8-NEXT: s_max_i32 s1, s2, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s3, s5 -; GFX8-NEXT: s_min_i32 s1, s1, s3 +; GFX8-NEXT: s_sext_i32_i16 s2, s4 +; GFX8-NEXT: s_min_i32 s1, s1, s2 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_ashr_i32 s0, s0, s2 +; GFX8-NEXT: s_ashr_i32 s0, s0, 9 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_saddsat_i7: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_bfe_u32 s2, 9, 0x100000 -; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b32 s1, s1, 9 +; GFX9-NEXT: s_lshl_b32 s0, s0, 9 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 9, v0 @@ 
-108,9 +106,8 @@ ; ; GFX10PLUS-LABEL: s_saddsat_i7: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_bfe_u32 s2, 9, 0x100000 -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s2 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9 ; GFX10PLUS-NEXT: v_add_nc_i16 v0, s0, s1 clamp ; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 9, v0 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 @@ -189,31 +186,29 @@ ; ; GFX8-LABEL: s_saddsat_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_sext_i32_i16 s3, s0 -; GFX8-NEXT: s_sext_i32_i16 s4, 0 -; GFX8-NEXT: s_max_i32 s5, s3, s4 -; GFX8-NEXT: s_min_i32 s3, s3, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, s2 -; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 -; GFX8-NEXT: s_sext_i32_i16 s3, s3 +; GFX8-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NEXT: s_sext_i32_i16 s2, s0 +; GFX8-NEXT: s_sext_i32_i16 s3, 0 +; GFX8-NEXT: s_max_i32 s4, s2, s3 +; GFX8-NEXT: s_min_i32 s2, s2, s3 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 +; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 -; GFX8-NEXT: s_max_i32 s1, s3, s1 +; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 +; GFX8-NEXT: s_max_i32 s1, s2, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s3, s5 -; GFX8-NEXT: s_min_i32 s1, s1, s3 +; GFX8-NEXT: s_sext_i32_i16 s2, s4 +; GFX8-NEXT: s_min_i32 s1, s1, s2 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_ashr_i32 s0, s0, s2 +; GFX8-NEXT: s_ashr_i32 s0, s0, 8 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_saddsat_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b32 s1, s1, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0 @@ 
-222,9 +217,8 @@ ; ; GFX10PLUS-LABEL: s_saddsat_i8: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s2 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8 ; GFX10PLUS-NEXT: v_add_nc_i16 v0, s0, s1 clamp ; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 8, v0 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 @@ -300,9 +294,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp @@ -318,9 +313,11 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp @@ -335,8 +332,10 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 
+; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_i16 v0, v0, v1 clamp @@ -387,45 +386,44 @@ ; ; GFX8-LABEL: s_saddsat_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s4, 8, 0x100000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 -; GFX8-NEXT: s_lshl_b32 s0, s0, s4 -; GFX8-NEXT: s_sext_i32_i16 s5, s0 -; GFX8-NEXT: s_sext_i32_i16 s6, 0 -; GFX8-NEXT: s_max_i32 s7, s5, s6 -; GFX8-NEXT: s_min_i32 s5, s5, s6 +; GFX8-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NEXT: s_sext_i32_i16 s4, s0 +; GFX8-NEXT: s_sext_i32_i16 s5, 0 +; GFX8-NEXT: s_max_i32 s6, s4, s5 +; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8 -; GFX8-NEXT: s_lshl_b32 s1, s1, s4 -; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5 -; GFX8-NEXT: s_sext_i32_i16 s5, s5 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 +; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s7, 0x7fff, s7 -; GFX8-NEXT: s_max_i32 s1, s5, s1 +; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6 +; GFX8-NEXT: s_max_i32 s1, s4, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s5, s7 -; GFX8-NEXT: s_min_i32 s1, s1, s5 +; GFX8-NEXT: s_sext_i32_i16 s4, s6 +; GFX8-NEXT: s_min_i32 s1, s1, s4 ; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s2, s4 -; GFX8-NEXT: s_lshl_b32 s2, s3, s4 +; GFX8-NEXT: s_lshl_b32 s1, s2, 8 +; GFX8-NEXT: s_lshl_b32 s2, s3, 8 ; GFX8-NEXT: s_sext_i32_i16 s3, s1 -; GFX8-NEXT: s_max_i32 s5, s3, s6 -; GFX8-NEXT: s_min_i32 s3, s3, s6 +; GFX8-NEXT: s_max_i32 s4, s3, s5 +; GFX8-NEXT: s_min_i32 s3, s3, s5 ; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 +; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 ; GFX8-NEXT: s_max_i32 s2, s3, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sext_i32_i16 s3, s5 +; GFX8-NEXT: s_sext_i32_i16 
s3, s4 ; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: s_add_i32 s1, s1, s2 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_ashr_i32 s1, s1, s4 -; GFX8-NEXT: s_ashr_i32 s0, s0, s4 +; GFX8-NEXT: s_ashr_i32 s1, s1, 8 +; GFX8-NEXT: s_ashr_i32 s0, s0, 8 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_and_b32 s0, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s1, s1, s4 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; @@ -628,14 +626,15 @@ ; GFX9-LABEL: v_saddsat_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-NEXT: v_perm_b32 v2, v2, v0, s4 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v0 ; GFX9-NEXT: v_alignbit_b32 v0, v3, v0, 16 -; GFX9-NEXT: v_perm_b32 v3, v4, v1, s4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX9-NEXT: v_alignbit_b32 v1, v5, v1, 16 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] @@ -662,15 +661,17 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX10-NEXT: v_perm_b32 v2, v2, v0, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX10-NEXT: v_alignbit_b32 v0, v3, v0, 16 -; GFX10-NEXT: v_perm_b32 v3, v4, v1, 0x5040100 -; GFX10-NEXT: v_alignbit_b32 v1, v5, v1, 16 +; GFX10-NEXT: v_lshl_or_b32 v2, 
v2, 16, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 24 -; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] +; GFX10-NEXT: v_lshl_or_b32 v3, v5, 16, v6 +; GFX10-NEXT: v_alignbit_b32 v1, v7, v1, 16 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_i16 v2, v2, v3 clamp @@ -692,12 +693,14 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX11-NEXT: v_perm_b32 v2, v2, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v3, v1, 0x5040100 -; GFX11-NEXT: v_alignbit_b32 v0, v4, v0, 16 -; GFX11-NEXT: v_alignbit_b32 v1, v5, v1, 16 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v4 +; GFX11-NEXT: v_lshl_or_b32 v3, v3, 16, v5 +; GFX11-NEXT: v_alignbit_b32 v0, v6, v0, 16 +; GFX11-NEXT: v_alignbit_b32 v1, v7, v1, 16 ; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] @@ -785,47 +788,46 @@ ; ; GFX8-LABEL: s_saddsat_v4i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s8, 8, 0x100000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24 -; GFX8-NEXT: s_lshl_b32 s0, s0, s8 -; GFX8-NEXT: s_sext_i32_i16 s9, s0 -; GFX8-NEXT: s_sext_i32_i16 s10, 0 -; GFX8-NEXT: s_max_i32 s11, s9, s10 -; GFX8-NEXT: s_min_i32 s9, s9, s10 +; GFX8-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NEXT: s_sext_i32_i16 s8, s0 +; GFX8-NEXT: s_sext_i32_i16 s9, 0 +; GFX8-NEXT: s_max_i32 s10, s8, s9 +; GFX8-NEXT: s_min_i32 s8, s8, s9 ; GFX8-NEXT: 
s_lshr_b32 s5, s1, 8 ; GFX8-NEXT: s_lshr_b32 s6, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 -; GFX8-NEXT: s_lshl_b32 s1, s1, s8 -; GFX8-NEXT: s_sub_i32 s9, 0xffff8000, s9 -; GFX8-NEXT: s_sext_i32_i16 s9, s9 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_sub_i32 s8, 0xffff8000, s8 +; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s11, 0x7fff, s11 -; GFX8-NEXT: s_max_i32 s1, s9, s1 +; GFX8-NEXT: s_sub_i32 s10, 0x7fff, s10 +; GFX8-NEXT: s_max_i32 s1, s8, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s9, s11 -; GFX8-NEXT: s_min_i32 s1, s1, s9 +; GFX8-NEXT: s_sext_i32_i16 s8, s10 +; GFX8-NEXT: s_min_i32 s1, s1, s8 ; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s2, s8 -; GFX8-NEXT: s_lshl_b32 s2, s5, s8 +; GFX8-NEXT: s_lshl_b32 s1, s2, 8 +; GFX8-NEXT: s_lshl_b32 s2, s5, 8 ; GFX8-NEXT: s_sext_i32_i16 s5, s1 -; GFX8-NEXT: s_max_i32 s9, s5, s10 -; GFX8-NEXT: s_min_i32 s5, s5, s10 +; GFX8-NEXT: s_max_i32 s8, s5, s9 +; GFX8-NEXT: s_min_i32 s5, s5, s9 ; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s9, 0x7fff, s9 +; GFX8-NEXT: s_sub_i32 s8, 0x7fff, s8 ; GFX8-NEXT: s_max_i32 s2, s5, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sext_i32_i16 s5, s9 +; GFX8-NEXT: s_sext_i32_i16 s5, s8 ; GFX8-NEXT: s_min_i32 s2, s2, s5 ; GFX8-NEXT: s_add_i32 s1, s1, s2 -; GFX8-NEXT: s_lshl_b32 s2, s3, s8 +; GFX8-NEXT: s_lshl_b32 s2, s3, 8 ; GFX8-NEXT: s_sext_i32_i16 s5, s2 -; GFX8-NEXT: s_lshl_b32 s3, s6, s8 -; GFX8-NEXT: s_max_i32 s6, s5, s10 -; GFX8-NEXT: s_min_i32 s5, s5, s10 +; GFX8-NEXT: s_lshl_b32 s3, s6, 8 +; GFX8-NEXT: s_max_i32 s6, s5, s9 +; GFX8-NEXT: s_min_i32 s5, s5, s9 ; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 @@ -835,11 +837,11 @@ ; GFX8-NEXT: s_sext_i32_i16 s5, s6 ; GFX8-NEXT: s_min_i32 s3, s3, s5 ; GFX8-NEXT: s_add_i32 s2, s2, s3 -; 
GFX8-NEXT: s_lshl_b32 s3, s4, s8 +; GFX8-NEXT: s_lshl_b32 s3, s4, 8 ; GFX8-NEXT: s_sext_i32_i16 s5, s3 -; GFX8-NEXT: s_max_i32 s6, s5, s10 -; GFX8-NEXT: s_min_i32 s5, s5, s10 -; GFX8-NEXT: s_lshl_b32 s4, s7, s8 +; GFX8-NEXT: s_max_i32 s6, s5, s9 +; GFX8-NEXT: s_min_i32 s5, s5, s9 +; GFX8-NEXT: s_lshl_b32 s4, s7, 8 ; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 @@ -847,21 +849,21 @@ ; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6 ; GFX8-NEXT: s_max_i32 s4, s5, s4 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_ashr_i32 s1, s1, s8 +; GFX8-NEXT: s_ashr_i32 s1, s1, 8 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s6 -; GFX8-NEXT: s_ashr_i32 s0, s0, s8 +; GFX8-NEXT: s_ashr_i32 s0, s0, 8 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_ashr_i32 s2, s2, s8 +; GFX8-NEXT: s_ashr_i32 s2, s2, 8 ; GFX8-NEXT: s_add_i32 s3, s3, s4 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s2, 0xff -; GFX8-NEXT: s_ashr_i32 s3, s3, s8 +; GFX8-NEXT: s_ashr_i32 s3, s3, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s3, 0xff @@ -2911,8 +2913,8 @@ ; GFX8-NEXT: s_sext_i32_i16 s3, s4 ; GFX8-NEXT: s_min_i32 s1, s1, s3 ; GFX8-NEXT: s_add_i32 s2, s2, s1 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -3301,12 +3303,12 @@ ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: s_add_i32 s5, s5, s2 -; GFX8-NEXT: s_bfe_u32 s2, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX8-NEXT: s_and_b32 
s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s2, s5, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s5 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog @@ -3658,16 +3660,16 @@ ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_min_i32 s3, s3, s4 ; GFX8-NEXT: s_add_i32 s8, s8, s3 -; GFX8-NEXT: s_bfe_u32 s3, s6, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s6 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s3 -; GFX8-NEXT: s_bfe_u32 s3, s7, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s3 -; GFX8-NEXT: s_bfe_u32 s3, s8, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s8 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_or_b32 s2, s2, s3 ; GFX8-NEXT: ; return to shader part epilog @@ -4105,20 +4107,20 @@ ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_add_i32 s11, s11, s4 -; GFX8-NEXT: s_bfe_u32 s4, s8, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s8 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s4 -; GFX8-NEXT: s_bfe_u32 s4, s9, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s9 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NEXT: s_bfe_u32 s4, s10, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s10 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; 
GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: s_bfe_u32 s4, s11, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s11 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -2905,19 +2905,20 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: s_xor_b32 s4, s11, s7 +; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 ; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 -; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 ; GFX9-NEXT: v_subrev_u32_e32 v2, s10, v2 ; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 +; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_subrev_u32_e32 v3, s11, v3 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: global_store_dword v2, v1, s[2:3] @@ -2938,7 +2939,6 @@ ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1 ; GFX10-NEXT: s_sub_i32 s6, 0, s2 -; GFX10-NEXT: s_sub_i32 s7, 0, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -2946,14 +2946,15 @@ ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: 
v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, s7, v1 +; GFX10-NEXT: s_sub_i32 s6, 0, s1 +; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 ; GFX10-NEXT: s_sext_i32_i16 s6, s0 ; GFX10-NEXT: s_bfe_i32 s0, s0, 0x100010 ; GFX10-NEXT: s_ashr_i32 s9, s6, 31 ; GFX10-NEXT: s_ashr_i32 s10, s0, 31 +; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: s_add_i32 s6, s6, s9 ; GFX10-NEXT: s_add_i32 s0, s0, s10 -; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: s_xor_b32 s6, s6, s9 ; GFX10-NEXT: s_xor_b32 s0, s0, s10 @@ -2962,43 +2963,45 @@ ; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2 -; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 +; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s1, v3 +; GFX10-NEXT: 
v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 +; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s2, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo ; GFX10-NEXT: s_xor_b32 s1, s9, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: s_xor_b32 s0, s10, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 ; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s0 ; GFX10-NEXT: v_xor_b32_e32 v2, s9, v2 -; GFX10-NEXT: v_xor_b32_e32 v3, s10, v3 +; GFX10-NEXT: s_xor_b32 s0, s10, s8 +; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1 +; GFX10-NEXT: v_xor_b32_e32 v3, s10, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s9, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s10, v3 -; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: global_store_dword v1, v2, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/select-to-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/select-to-fmin-fmax.ll --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/select-to-fmin-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/select-to-fmin-fmax.ll @@ -46,19 +46,20 @@ ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GCN-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc ; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v0, s6 src0_sel:WORD_1 src1_sel:DWORD +; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc ; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[4:5] -; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v1 -; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v1, s6 src0_sel:WORD_1 src1_sel:DWORD ; GCN-NEXT: v_cndmask_b32_e64 v2, v1, 0, vcc +; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v1, s6 src0_sel:WORD_1 src1_sel:DWORD ; GCN-NEXT: v_cndmask_b32_e64 v1, v3, 0, s[4:5] -; GCN-NEXT: s_mov_b32 s4, 0x5040100 -; GCN-NEXT: v_perm_b32 v0, v0, v4, s4 -; GCN-NEXT: v_perm_b32 v1, v1, v2, s4 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_lshl_or_b32 v0, v0, 16, v3 +; GCN-NEXT: v_lshl_or_b32 v1, v1, 16, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] entry: %fcmp = fcmp olt <4 x half> %a, zeroinitializer @@ -71,31 +72,34 @@ ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GCN-NEXT: v_cndmask_b32_e64 v8, v0, 0, vcc ; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v0, s6 src0_sel:WORD_1 src1_sel:DWORD +; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GCN-NEXT: v_cndmask_b32_e64 v8, v0, 0, vcc ; GCN-NEXT: v_cndmask_b32_e64 v0, v4, 0, s[4:5] -; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc ; GCN-NEXT: v_cmp_lt_f16_sdwa 
s[4:5], v1, s6 src0_sel:WORD_1 src1_sel:DWORD +; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v2 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GCN-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc ; GCN-NEXT: v_cndmask_b32_e64 v1, v5, 0, s[4:5] -; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v2 +; GCN-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc ; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v2, s6 src0_sel:WORD_1 src1_sel:DWORD +; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GCN-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc ; GCN-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] -; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v3 -; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v3, s6 src0_sel:WORD_1 src1_sel:DWORD ; GCN-NEXT: v_cndmask_b32_e64 v6, v3, 0, vcc +; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v3, s6 src0_sel:WORD_1 src1_sel:DWORD +; GCN-NEXT: v_lshl_or_b32 v1, v1, 16, v4 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; GCN-NEXT: v_cndmask_b32_e64 v3, v7, 0, s[4:5] -; GCN-NEXT: s_mov_b32 s4, 0x5040100 -; GCN-NEXT: v_perm_b32 v0, v0, v8, s4 -; GCN-NEXT: v_perm_b32 v1, v1, v4, s4 -; GCN-NEXT: v_perm_b32 v2, v2, v5, s4 -; GCN-NEXT: v_perm_b32 v3, v3, v6, s4 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GCN-NEXT: v_lshl_or_b32 v2, v2, 16, v4 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GCN-NEXT: v_lshl_or_b32 v0, v0, 16, v7 +; GCN-NEXT: v_lshl_or_b32 v3, v3, 16, v4 ; GCN-NEXT: s_setpc_b64 s[30:31] entry: %fcmp = fcmp olt <8 x half> %a, zeroinitializer diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll @@ -49,24 +49,21 @@ ; ; GFX8-LABEL: s_sext_inreg_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s1, 3, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 3 ; GFX8-NEXT: s_sext_i32_i8 s0, s0 ; GFX8-NEXT: s_ashr_i32 s0, s0, 3 ; GFX8-NEXT: ; return to shader part epilog ; ; 
GFX9-LABEL: s_sext_inreg_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_bfe_u32 s1, 3, 0x100000 -; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, 3 ; GFX9-NEXT: s_sext_i32_i8 s0, s0 ; GFX9-NEXT: s_ashr_i32 s0, s0, 3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i8: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_bfe_u32 s1, 3, 0x100000 -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 3 ; GFX10PLUS-NEXT: s_sext_i32_i8 s0, s0 ; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 3 ; GFX10PLUS-NEXT: ; return to shader part epilog @@ -83,24 +80,21 @@ ; ; GFX8-LABEL: s_sext_inreg_i8_6: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s1, 6, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 6 ; GFX8-NEXT: s_sext_i32_i8 s0, s0 ; GFX8-NEXT: s_ashr_i32 s0, s0, 6 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_sext_inreg_i8_6: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_bfe_u32 s1, 6, 0x100000 -; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, 6 ; GFX9-NEXT: s_sext_i32_i8 s0, s0 ; GFX9-NEXT: s_ashr_i32 s0, s0, 6 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i8_6: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_bfe_u32 s1, 6, 0x100000 -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 6 ; GFX10PLUS-NEXT: s_sext_i32_i8 s0, s0 ; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 6 ; GFX10PLUS-NEXT: ; return to shader part epilog @@ -612,24 +606,21 @@ ; ; GFX8-LABEL: s_sext_inreg_i16_9: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s1, 9, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 9 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_ashr_i32 s0, s0, 9 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_sext_inreg_i16_9: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_bfe_u32 s1, 9, 0x100000 -; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, 9 ; GFX9-NEXT: s_sext_i32_i16 s0, s0 ; GFX9-NEXT: s_ashr_i32 s0, s0, 9 
; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i16_9: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_bfe_u32 s1, 9, 0x100000 -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9 ; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 ; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 9 ; GFX10PLUS-NEXT: ; return to shader part epilog @@ -646,24 +637,21 @@ ; ; GFX8-LABEL: s_sext_inreg_i16_15: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s1, 15, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 15 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_ashr_i32 s0, s0, 15 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_sext_inreg_i16_15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_bfe_u32 s1, 15, 0x100000 -; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, 15 ; GFX9-NEXT: s_sext_i32_i16 s0, s0 ; GFX9-NEXT: s_ashr_i32 s0, s0, 15 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i16_15: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_bfe_u32 s1, 15, 0x100000 -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 15 ; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 ; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 15 ; GFX10PLUS-NEXT: ; return to shader part epilog @@ -762,9 +750,8 @@ ; GFX8-LABEL: s_sext_inreg_v2i16_11: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_bfe_u32 s2, 11, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, 11 +; GFX8-NEXT: s_lshl_b32 s1, s1, 11 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_ashr_i32 s0, s0, 11 @@ -889,8 +876,8 @@ ; ; GFX8-LABEL: s_sext_inreg_v4i16_14: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s0, -1, 0x100000 -; GFX8-NEXT: s_mov_b32 s1, s0 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_mov_b32 s1, 0xffff ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_sext_inreg_v4i16_14: @@ -978,11 +965,10 @@ ; 
GFX8-LABEL: v_sext_inreg_v8i16_11: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s4, -1, 0x100000 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX8-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sext_inreg_v8i16_11: @@ -1041,32 +1027,31 @@ ; GFX8-LABEL: s_sext_inreg_v8i16_5: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_bfe_u32 s8, 10, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s4, s8 +; GFX8-NEXT: s_lshl_b32 s4, s4, 10 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, s8 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_lshl_b32 s5, s5, s8 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_lshl_b32 s0, s0, 10 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_lshl_b32 s5, s5, 10 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 -; GFX8-NEXT: s_lshl_b32 s1, s1, s8 +; GFX8-NEXT: s_lshl_b32 s1, s1, 10 ; GFX8-NEXT: s_or_b32 s0, s0, s4 -; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000 -; GFX8-NEXT: s_lshl_b32 s6, s6, s8 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s5 +; GFX8-NEXT: s_lshl_b32 s6, s6, 10 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_lshr_b32 s7, s3, 16 -; GFX8-NEXT: s_lshl_b32 s2, s2, s8 +; GFX8-NEXT: s_lshl_b32 s2, s2, 10 ; GFX8-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NEXT: s_bfe_u32 s4, s6, 0x100000 -; GFX8-NEXT: s_lshl_b32 s7, s7, s8 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_lshl_b32 s7, s7, 10 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_lshl_b32 s3, s3, s8 +; GFX8-NEXT: s_lshl_b32 s3, s3, 10 
; GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: s_bfe_u32 s4, s7, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: ; return to shader part epilog @@ -1550,251 +1535,18 @@ } define amdgpu_ps i65 @s_sext_inreg_i65_33(i65 inreg %value) { -; GFX6-LABEL: s_sext_inreg_i65_33: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s0, 0 -; GFX6-NEXT: s_mov_b32 s1, 0 -; GFX6-NEXT: s_mov_b32 s2, 0 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_sext_inreg_i65_33: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s0, 1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, 2, 0x100000 -; GFX8-NEXT: s_lshr_b32 s0, 0, s0 -; GFX8-NEXT: s_lshr_b32 s1, 0, s1 -; GFX8-NEXT: s_bfe_u32 s2, 3, 0x100000 -; GFX8-NEXT: s_and_b32 s0, s0, 1 -; GFX8-NEXT: s_and_b32 s1, s1, 1 -; GFX8-NEXT: s_lshr_b32 s2, 0, s2 -; GFX8-NEXT: s_lshl_b32 s0, s0, 17 -; GFX8-NEXT: s_lshl_b32 s1, s1, 18 -; GFX8-NEXT: s_bfe_u32 s3, 4, 0x100000 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s2, 1 -; GFX8-NEXT: s_lshr_b32 s3, 0, s3 -; GFX8-NEXT: s_lshl_b32 s1, s1, 19 -; GFX8-NEXT: s_bfe_u32 s4, 5, 0x100000 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, 1 -; GFX8-NEXT: s_lshr_b32 s4, 0, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, 20 -; GFX8-NEXT: s_bfe_u32 s5, 6, 0x100000 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s4, 1 -; GFX8-NEXT: s_lshr_b32 s5, 0, s5 -; GFX8-NEXT: s_lshl_b32 s1, s1, 21 -; GFX8-NEXT: s_bfe_u32 s6, 7, 0x100000 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s5, 1 -; GFX8-NEXT: s_lshr_b32 s6, 0, s6 -; GFX8-NEXT: s_lshl_b32 s1, s1, 22 -; GFX8-NEXT: s_bfe_u32 s7, 8, 0x100000 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s6, 1 -; GFX8-NEXT: s_lshr_b32 s7, 0, s7 -; GFX8-NEXT: s_lshl_b32 s1, s1, 23 -; GFX8-NEXT: s_bfe_u32 s8, 9, 0x100000 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 
s1, s7, 1 -; GFX8-NEXT: s_lshr_b32 s8, 0, s8 -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 -; GFX8-NEXT: s_bfe_u32 s9, 10, 0x100000 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s8, 1 -; GFX8-NEXT: s_lshr_b32 s9, 0, s9 -; GFX8-NEXT: s_lshl_b32 s1, s1, 25 -; GFX8-NEXT: s_bfe_u32 s10, 11, 0x100000 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s9, 1 -; GFX8-NEXT: s_lshr_b32 s10, 0, s10 -; GFX8-NEXT: s_lshl_b32 s1, s1, 26 -; GFX8-NEXT: s_bfe_u32 s11, 12, 0x100000 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s10, 1 -; GFX8-NEXT: s_lshr_b32 s11, 0, s11 -; GFX8-NEXT: s_lshl_b32 s1, s1, 27 -; GFX8-NEXT: s_bfe_u32 s12, 13, 0x100000 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s11, 1 -; GFX8-NEXT: s_lshr_b32 s12, 0, s12 -; GFX8-NEXT: s_lshl_b32 s1, s1, 28 -; GFX8-NEXT: s_bfe_u32 s13, 14, 0x100000 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s12, 1 -; GFX8-NEXT: s_lshr_b32 s13, 0, s13 -; GFX8-NEXT: s_lshl_b32 s1, s1, 29 -; GFX8-NEXT: s_bfe_u32 s14, 15, 0x100000 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s13, 1 -; GFX8-NEXT: s_lshr_b32 s14, 0, s14 -; GFX8-NEXT: s_lshl_b32 s1, s1, 30 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s14, 1 -; GFX8-NEXT: s_lshl_b32 s1, s1, 31 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_mov_b32 s1, s0 -; GFX8-NEXT: s_mov_b32 s2, 0 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_sext_inreg_i65_33: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_bfe_u32 s0, 1, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, 2, 0x100000 -; GFX9-NEXT: s_lshr_b32 s0, 0, s0 -; GFX9-NEXT: s_lshr_b32 s1, 0, s1 -; GFX9-NEXT: s_bfe_u32 s2, 3, 0x100000 -; GFX9-NEXT: s_and_b32 s0, s0, 1 -; GFX9-NEXT: s_and_b32 s1, s1, 1 -; GFX9-NEXT: s_lshr_b32 s2, 0, s2 -; GFX9-NEXT: s_lshl_b32 s0, s0, 17 -; GFX9-NEXT: s_lshl_b32 s1, s1, 18 -; GFX9-NEXT: s_bfe_u32 s3, 4, 0x100000 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s2, 1 -; GFX9-NEXT: s_lshr_b32 s3, 0, s3 -; 
GFX9-NEXT: s_lshl_b32 s1, s1, 19 -; GFX9-NEXT: s_bfe_u32 s4, 5, 0x100000 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s3, 1 -; GFX9-NEXT: s_lshr_b32 s4, 0, s4 -; GFX9-NEXT: s_lshl_b32 s1, s1, 20 -; GFX9-NEXT: s_bfe_u32 s5, 6, 0x100000 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s4, 1 -; GFX9-NEXT: s_lshr_b32 s5, 0, s5 -; GFX9-NEXT: s_lshl_b32 s1, s1, 21 -; GFX9-NEXT: s_bfe_u32 s6, 7, 0x100000 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s5, 1 -; GFX9-NEXT: s_lshr_b32 s6, 0, s6 -; GFX9-NEXT: s_lshl_b32 s1, s1, 22 -; GFX9-NEXT: s_bfe_u32 s7, 8, 0x100000 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s6, 1 -; GFX9-NEXT: s_lshr_b32 s7, 0, s7 -; GFX9-NEXT: s_lshl_b32 s1, s1, 23 -; GFX9-NEXT: s_bfe_u32 s8, 9, 0x100000 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s7, 1 -; GFX9-NEXT: s_lshr_b32 s8, 0, s8 -; GFX9-NEXT: s_lshl_b32 s1, s1, 24 -; GFX9-NEXT: s_bfe_u32 s9, 10, 0x100000 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s8, 1 -; GFX9-NEXT: s_lshr_b32 s9, 0, s9 -; GFX9-NEXT: s_lshl_b32 s1, s1, 25 -; GFX9-NEXT: s_bfe_u32 s10, 11, 0x100000 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s9, 1 -; GFX9-NEXT: s_lshr_b32 s10, 0, s10 -; GFX9-NEXT: s_lshl_b32 s1, s1, 26 -; GFX9-NEXT: s_bfe_u32 s11, 12, 0x100000 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s10, 1 -; GFX9-NEXT: s_lshr_b32 s11, 0, s11 -; GFX9-NEXT: s_lshl_b32 s1, s1, 27 -; GFX9-NEXT: s_bfe_u32 s12, 13, 0x100000 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s11, 1 -; GFX9-NEXT: s_lshr_b32 s12, 0, s12 -; GFX9-NEXT: s_lshl_b32 s1, s1, 28 -; GFX9-NEXT: s_bfe_u32 s13, 14, 0x100000 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s12, 1 -; GFX9-NEXT: s_lshr_b32 s13, 0, s13 -; GFX9-NEXT: s_lshl_b32 s1, s1, 29 -; GFX9-NEXT: s_bfe_u32 s14, 15, 0x100000 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s13, 1 -; GFX9-NEXT: s_lshr_b32 s14, 0, s14 -; 
GFX9-NEXT: s_lshl_b32 s1, s1, 30 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s14, 1 -; GFX9-NEXT: s_lshl_b32 s1, s1, 31 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_mov_b32 s1, s0 -; GFX9-NEXT: s_mov_b32 s2, 0 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_sext_inreg_i65_33: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_mov_b32 s1, 0 +; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i65_33: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_bfe_u32 s0, 1, 0x100000 -; GFX10PLUS-NEXT: s_bfe_u32 s1, 2, 0x100000 -; GFX10PLUS-NEXT: s_lshr_b32 s0, 0, s0 -; GFX10PLUS-NEXT: s_bfe_u32 s2, 3, 0x100000 -; GFX10PLUS-NEXT: s_lshr_b32 s1, 0, s1 -; GFX10PLUS-NEXT: s_lshr_b32 s2, 0, s2 -; GFX10PLUS-NEXT: s_bfe_u32 s3, 4, 0x100000 -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 1 -; GFX10PLUS-NEXT: s_and_b32 s1, s1, 1 -; GFX10PLUS-NEXT: s_bfe_u32 s4, 5, 0x100000 -; GFX10PLUS-NEXT: s_lshr_b32 s3, 0, s3 -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 17 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 18 -; GFX10PLUS-NEXT: s_and_b32 s2, s2, 1 -; GFX10PLUS-NEXT: s_lshr_b32 s4, 0, s4 -; GFX10PLUS-NEXT: s_bfe_u32 s5, 6, 0x100000 -; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 19 -; GFX10PLUS-NEXT: s_and_b32 s2, s3, 1 -; GFX10PLUS-NEXT: s_bfe_u32 s6, 7, 0x100000 -; GFX10PLUS-NEXT: s_lshr_b32 s5, 0, s5 -; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 20 -; GFX10PLUS-NEXT: s_and_b32 s2, s4, 1 -; GFX10PLUS-NEXT: s_lshr_b32 s6, 0, s6 -; GFX10PLUS-NEXT: s_bfe_u32 s7, 8, 0x100000 -; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 21 -; GFX10PLUS-NEXT: s_and_b32 s2, s5, 1 -; GFX10PLUS-NEXT: s_bfe_u32 s8, 9, 0x100000 -; GFX10PLUS-NEXT: s_lshr_b32 s7, 0, s7 -; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 22 -; GFX10PLUS-NEXT: s_and_b32 s2, s6, 1 -; GFX10PLUS-NEXT: s_lshr_b32 s8, 0, s8 -; GFX10PLUS-NEXT: s_bfe_u32 s9, 
10, 0x100000 -; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 23 -; GFX10PLUS-NEXT: s_and_b32 s2, s7, 1 -; GFX10PLUS-NEXT: s_bfe_u32 s10, 11, 0x100000 -; GFX10PLUS-NEXT: s_lshr_b32 s9, 0, s9 -; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 24 -; GFX10PLUS-NEXT: s_and_b32 s2, s8, 1 -; GFX10PLUS-NEXT: s_lshr_b32 s10, 0, s10 -; GFX10PLUS-NEXT: s_bfe_u32 s11, 12, 0x100000 -; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 25 -; GFX10PLUS-NEXT: s_and_b32 s2, s9, 1 -; GFX10PLUS-NEXT: s_bfe_u32 s12, 13, 0x100000 -; GFX10PLUS-NEXT: s_lshr_b32 s11, 0, s11 -; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 26 -; GFX10PLUS-NEXT: s_and_b32 s2, s10, 1 -; GFX10PLUS-NEXT: s_lshr_b32 s12, 0, s12 -; GFX10PLUS-NEXT: s_bfe_u32 s13, 14, 0x100000 -; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 27 -; GFX10PLUS-NEXT: s_and_b32 s2, s11, 1 -; GFX10PLUS-NEXT: s_bfe_u32 s14, 15, 0x100000 -; GFX10PLUS-NEXT: s_lshr_b32 s13, 0, s13 -; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 28 -; GFX10PLUS-NEXT: s_and_b32 s2, s12, 1 -; GFX10PLUS-NEXT: s_lshr_b32 s14, 0, s14 -; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 29 -; GFX10PLUS-NEXT: s_and_b32 s2, s13, 1 -; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 30 -; GFX10PLUS-NEXT: s_and_b32 s2, s14, 1 -; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 31 +; GFX10PLUS-NEXT: s_mov_b32 s0, 0 +; GFX10PLUS-NEXT: s_mov_b32 s1, 0 ; GFX10PLUS-NEXT: s_mov_b32 s2, 0 -; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 -; GFX10PLUS-NEXT: s_mov_b32 s1, s0 ; GFX10PLUS-NEXT: ; return to shader part epilog %shl = shl i65 %value, 33 %ashr = shl i65 %shl, 33 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll 
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -664,7 +664,7 @@ ; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0x3fff, v0 ; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 2, v0 -; GFX10PLUS-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %and = and i16 %x, 16383 %ext = zext i16 %and to i32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -93,27 +93,14 @@ } define amdgpu_ps i8 @s_shl_i8_7(i8 inreg %value) { -; GFX6-LABEL: s_shl_i8_7: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 7 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_shl_i8_7: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s1, 7, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_shl_i8_7: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_bfe_u32 s1, 7, 0x100000 -; GFX9-NEXT: s_lshl_b32 s0, s0, s1 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_shl_i8_7: +; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s0, s0, 7 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_shl_i8_7: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_bfe_u32 s1, 7, 0x100000 -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 7 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i8 %value, 7 ret i8 %result @@ -675,27 +662,14 @@ } define amdgpu_ps i16 @s_shl_i16_15(i16 inreg %value) { -; GFX6-LABEL: s_shl_i16_15: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 15 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_shl_i16_15: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s1, 15, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_shl_i16_15: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_bfe_u32 s1, 15, 0x100000 -; 
GFX9-NEXT: s_lshl_b32 s0, s0, s1 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_shl_i16_15: +; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s0, s0, 15 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_shl_i16_15: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_bfe_u32 s1, 15, 0x100000 -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 15 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i16 %value, 15 ret i16 %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -75,31 +75,29 @@ ; ; GFX8-LABEL: s_ssubsat_i7: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s2, 9, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_sext_i32_i16 s3, s0 -; GFX8-NEXT: s_sext_i32_i16 s4, -1 -; GFX8-NEXT: s_max_i32 s5, s3, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, s2 -; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fff -; GFX8-NEXT: s_min_i32 s3, s3, s4 -; GFX8-NEXT: s_sext_i32_i16 s4, s5 +; GFX8-NEXT: s_lshl_b32 s0, s0, 9 +; GFX8-NEXT: s_sext_i32_i16 s2, s0 +; GFX8-NEXT: s_sext_i32_i16 s3, -1 +; GFX8-NEXT: s_max_i32 s4, s2, s3 +; GFX8-NEXT: s_lshl_b32 s1, s1, 9 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_min_i32 s2, s2, s3 +; GFX8-NEXT: s_sext_i32_i16 s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000 -; GFX8-NEXT: s_max_i32 s1, s4, s1 +; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 +; GFX8-NEXT: s_max_i32 s1, s3, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_min_i32 s1, s1, s3 +; GFX8-NEXT: s_sext_i32_i16 s2, s2 +; GFX8-NEXT: s_min_i32 s1, s1, s2 ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_ashr_i32 s0, s0, s2 +; GFX8-NEXT: s_ashr_i32 s0, s0, 9 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_ssubsat_i7: ; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_bfe_u32 s2, 9, 0x100000 -; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b32 s1, s1, 9 +; GFX9-NEXT: s_lshl_b32 s0, s0, 9 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 9, v0 @@ -108,9 +106,8 @@ ; ; GFX10PLUS-LABEL: s_ssubsat_i7: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_bfe_u32 s2, 9, 0x100000 -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s2 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9 ; GFX10PLUS-NEXT: v_sub_nc_i16 v0, s0, s1 clamp ; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 9, v0 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 @@ -189,31 +186,29 @@ ; ; GFX8-LABEL: s_ssubsat_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_sext_i32_i16 s3, s0 -; GFX8-NEXT: s_sext_i32_i16 s4, -1 -; GFX8-NEXT: s_max_i32 s5, s3, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, s2 -; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fff -; GFX8-NEXT: s_min_i32 s3, s3, s4 -; GFX8-NEXT: s_sext_i32_i16 s4, s5 +; GFX8-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NEXT: s_sext_i32_i16 s2, s0 +; GFX8-NEXT: s_sext_i32_i16 s3, -1 +; GFX8-NEXT: s_max_i32 s4, s2, s3 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_min_i32 s2, s2, s3 +; GFX8-NEXT: s_sext_i32_i16 s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000 -; GFX8-NEXT: s_max_i32 s1, s4, s1 +; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 +; GFX8-NEXT: s_max_i32 s1, s3, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_min_i32 s1, s1, s3 +; GFX8-NEXT: s_sext_i32_i16 s2, s2 +; GFX8-NEXT: s_min_i32 s1, s1, s2 ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_ashr_i32 s0, s0, s2 +; GFX8-NEXT: s_ashr_i32 s0, s0, 8 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_ssubsat_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_bfe_u32 s2, 8, 0x100000 -; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b32 s1, s1, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0 @@ -222,9 +217,8 @@ ; ; GFX10PLUS-LABEL: s_ssubsat_i8: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s2 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8 ; GFX10PLUS-NEXT: v_sub_nc_i16 v0, s0, s1 clamp ; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 8, v0 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 @@ -300,9 +294,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp @@ -318,9 +313,11 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 clamp @@ -335,8 +332,10 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: 
v_lshrrev_b32_e32 v2, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_sub_i16 v0, v0, v1 clamp @@ -387,45 +386,44 @@ ; ; GFX8-LABEL: s_ssubsat_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s4, 8, 0x100000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 -; GFX8-NEXT: s_lshl_b32 s0, s0, s4 -; GFX8-NEXT: s_sext_i32_i16 s5, s0 -; GFX8-NEXT: s_sext_i32_i16 s6, -1 -; GFX8-NEXT: s_max_i32 s7, s5, s6 +; GFX8-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NEXT: s_sext_i32_i16 s4, s0 +; GFX8-NEXT: s_sext_i32_i16 s5, -1 +; GFX8-NEXT: s_max_i32 s6, s4, s5 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8 -; GFX8-NEXT: s_lshl_b32 s1, s1, s4 -; GFX8-NEXT: s_sub_i32 s7, s7, 0x7fff -; GFX8-NEXT: s_min_i32 s5, s5, s6 -; GFX8-NEXT: s_sext_i32_i16 s7, s7 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff +; GFX8-NEXT: s_min_i32 s4, s4, s5 +; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 -; GFX8-NEXT: s_max_i32 s1, s7, s1 +; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000 +; GFX8-NEXT: s_max_i32 s1, s6, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_min_i32 s1, s1, s5 +; GFX8-NEXT: s_sext_i32_i16 s4, s4 +; GFX8-NEXT: s_min_i32 s1, s1, s4 ; GFX8-NEXT: s_sub_i32 s0, s0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s2, s4 -; GFX8-NEXT: s_lshl_b32 s2, s3, s4 +; GFX8-NEXT: s_lshl_b32 s1, s2, 8 +; GFX8-NEXT: s_lshl_b32 s2, s3, 8 ; GFX8-NEXT: s_sext_i32_i16 s3, s1 -; GFX8-NEXT: s_max_i32 s5, s3, s6 -; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fff -; GFX8-NEXT: s_min_i32 s3, s3, s6 -; GFX8-NEXT: s_sext_i32_i16 s5, s5 +; GFX8-NEXT: 
s_max_i32 s4, s3, s5 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_min_i32 s3, s3, s5 +; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000 -; GFX8-NEXT: s_max_i32 s2, s5, s2 +; GFX8-NEXT: s_max_i32 s2, s4, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: s_sub_i32 s1, s1, s2 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_ashr_i32 s1, s1, s4 -; GFX8-NEXT: s_ashr_i32 s0, s0, s4 +; GFX8-NEXT: s_ashr_i32 s1, s1, 8 +; GFX8-NEXT: s_ashr_i32 s0, s0, 8 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_and_b32 s0, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s1, s1, s4 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; @@ -628,14 +626,15 @@ ; GFX9-LABEL: v_ssubsat_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-NEXT: v_perm_b32 v2, v2, v0, s4 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v0 ; GFX9-NEXT: v_alignbit_b32 v0, v3, v0, 16 -; GFX9-NEXT: v_perm_b32 v3, v4, v1, s4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX9-NEXT: v_alignbit_b32 v1, v5, v1, 16 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] @@ -662,15 +661,17 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX10-NEXT: v_perm_b32 v2, v2, v0, 0x5040100 +; 
GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX10-NEXT: v_alignbit_b32 v0, v3, v0, 16 -; GFX10-NEXT: v_perm_b32 v3, v4, v1, 0x5040100 -; GFX10-NEXT: v_alignbit_b32 v1, v5, v1, 16 +; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 24 -; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] +; GFX10-NEXT: v_lshl_or_b32 v3, v5, 16, v6 +; GFX10-NEXT: v_alignbit_b32 v1, v7, v1, 16 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_i16 v2, v2, v3 clamp @@ -692,12 +693,14 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX11-NEXT: v_perm_b32 v2, v2, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v3, v1, 0x5040100 -; GFX11-NEXT: v_alignbit_b32 v0, v4, v0, 16 -; GFX11-NEXT: v_alignbit_b32 v1, v5, v1, 16 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v4 +; GFX11-NEXT: v_lshl_or_b32 v3, v3, 16, v5 +; GFX11-NEXT: v_alignbit_b32 v0, v6, v0, 16 +; GFX11-NEXT: v_alignbit_b32 v1, v7, v1, 16 ; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] @@ -785,48 +788,47 @@ ; ; GFX8-LABEL: s_ssubsat_v4i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s8, 8, 0x100000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24 -; GFX8-NEXT: s_lshl_b32 s0, 
s0, s8 -; GFX8-NEXT: s_sext_i32_i16 s9, s0 -; GFX8-NEXT: s_sext_i32_i16 s10, -1 -; GFX8-NEXT: s_max_i32 s11, s9, s10 +; GFX8-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NEXT: s_sext_i32_i16 s8, s0 +; GFX8-NEXT: s_sext_i32_i16 s9, -1 +; GFX8-NEXT: s_max_i32 s10, s8, s9 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8 ; GFX8-NEXT: s_lshr_b32 s6, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 -; GFX8-NEXT: s_lshl_b32 s1, s1, s8 -; GFX8-NEXT: s_sub_i32 s11, s11, 0x7fff -; GFX8-NEXT: s_min_i32 s9, s9, s10 -; GFX8-NEXT: s_sext_i32_i16 s11, s11 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_sub_i32 s10, s10, 0x7fff +; GFX8-NEXT: s_min_i32 s8, s8, s9 +; GFX8-NEXT: s_sext_i32_i16 s10, s10 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s9, s9, 0xffff8000 -; GFX8-NEXT: s_max_i32 s1, s11, s1 +; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000 +; GFX8-NEXT: s_max_i32 s1, s10, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s9, s9 -; GFX8-NEXT: s_min_i32 s1, s1, s9 +; GFX8-NEXT: s_sext_i32_i16 s8, s8 +; GFX8-NEXT: s_min_i32 s1, s1, s8 ; GFX8-NEXT: s_sub_i32 s0, s0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s2, s8 -; GFX8-NEXT: s_lshl_b32 s2, s5, s8 +; GFX8-NEXT: s_lshl_b32 s1, s2, 8 +; GFX8-NEXT: s_lshl_b32 s2, s5, 8 ; GFX8-NEXT: s_sext_i32_i16 s5, s1 -; GFX8-NEXT: s_max_i32 s9, s5, s10 -; GFX8-NEXT: s_sub_i32 s9, s9, 0x7fff -; GFX8-NEXT: s_min_i32 s5, s5, s10 -; GFX8-NEXT: s_sext_i32_i16 s9, s9 +; GFX8-NEXT: s_max_i32 s8, s5, s9 +; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff +; GFX8-NEXT: s_min_i32 s5, s5, s9 +; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 -; GFX8-NEXT: s_max_i32 s2, s9, s2 +; GFX8-NEXT: s_max_i32 s2, s8, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_min_i32 s2, s2, s5 ; GFX8-NEXT: s_sub_i32 s1, s1, s2 -; GFX8-NEXT: s_lshl_b32 s2, s3, s8 +; GFX8-NEXT: s_lshl_b32 s2, s3, 8 ; GFX8-NEXT: s_sext_i32_i16 s5, s2 -; GFX8-NEXT: s_lshl_b32 s3, s6, s8 -; GFX8-NEXT: s_max_i32 s6, s5, s10 
+; GFX8-NEXT: s_lshl_b32 s3, s6, 8 +; GFX8-NEXT: s_max_i32 s6, s5, s9 ; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff -; GFX8-NEXT: s_min_i32 s5, s5, s10 +; GFX8-NEXT: s_min_i32 s5, s5, s9 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 @@ -835,33 +837,33 @@ ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_min_i32 s3, s3, s5 ; GFX8-NEXT: s_sub_i32 s2, s2, s3 -; GFX8-NEXT: s_lshl_b32 s3, s4, s8 +; GFX8-NEXT: s_lshl_b32 s3, s4, 8 ; GFX8-NEXT: s_sext_i32_i16 s5, s3 -; GFX8-NEXT: s_max_i32 s6, s5, s10 -; GFX8-NEXT: s_lshl_b32 s4, s7, s8 +; GFX8-NEXT: s_max_i32 s6, s5, s9 +; GFX8-NEXT: s_lshl_b32 s4, s7, 8 ; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff -; GFX8-NEXT: s_min_i32 s5, s5, s10 +; GFX8-NEXT: s_min_i32 s5, s5, s9 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 ; GFX8-NEXT: s_max_i32 s4, s6, s4 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_ashr_i32 s1, s1, s8 +; GFX8-NEXT: s_ashr_i32 s1, s1, 8 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_ashr_i32 s0, s0, s8 +; GFX8-NEXT: s_ashr_i32 s0, s0, 8 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_ashr_i32 s2, s2, s8 +; GFX8-NEXT: s_ashr_i32 s2, s2, 8 ; GFX8-NEXT: s_sub_i32 s3, s3, s4 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s2, 0xff -; GFX8-NEXT: s_ashr_i32 s3, s3, s8 +; GFX8-NEXT: s_ashr_i32 s3, s3, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s3, 0xff @@ -2897,8 +2899,8 @@ ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_min_i32 s1, s3, s1 ; GFX8-NEXT: s_sub_i32 s1, s2, s1 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 
0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -3286,13 +3288,13 @@ ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_min_i32 s3, s4, s3 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_sub_i32 s3, s5, s3 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s2, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog @@ -3640,20 +3642,20 @@ ; GFX8-NEXT: s_sext_i32_i16 s7, s11 ; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 ; GFX8-NEXT: s_max_i32 s6, s6, s7 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_min_i32 s5, s6, s5 ; GFX8-NEXT: s_or_b32 s0, s0, s3 -; GFX8-NEXT: s_bfe_u32 s3, s4, 0x100000 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s4 ; GFX8-NEXT: s_sub_i32 s5, s8, s5 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s3 -; GFX8-NEXT: s_bfe_u32 s3, s5, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s5 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_or_b32 s2, s2, s3 ; GFX8-NEXT: ; return to shader part epilog @@ -4081,30 +4083,30 @@ ; GFX8-NEXT: s_sext_i32_i16 s7, s11 ; GFX8-NEXT: s_max_i32 s8, s7, s17 ; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_min_i32 s7, s7, s17 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s9, s15 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_sub_i32 s7, s7, 0xffff8000 ; GFX8-NEXT: s_max_i32 s8, s8, s9 ; GFX8-NEXT: s_or_b32 s0, s0, s4 -; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s5 ; GFX8-NEXT: s_sub_i32 s6, s10, s6 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s7, s7 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_min_i32 s7, s8, s7 ; GFX8-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NEXT: s_bfe_u32 s4, s6, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_sub_i32 s7, s11, s7 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: s_bfe_u32 s4, s7, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -69,56 +69,55 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_bfe_u32 s0, 8, 0x100000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s3, s4, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: s_lshr_b32 s3, s3, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_lshr_b32 s1, s4, 16 +; GFX9-NEXT: s_lshr_b32 s1, s1, 8 +; GFX9-NEXT: s_lshr_b32 
s0, s4, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 -; GFX9-NEXT: s_lshr_b32 s2, s1, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 +; GFX9-NEXT: s_lshr_b32 s1, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_bfe_u32 s2, s5, 0x100000 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s5 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:3 -; GFX9-NEXT: s_lshr_b32 s2, s2, s0 +; GFX9-NEXT: s_lshr_b32 s1, s1, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: s_lshr_b32 s1, s5, 16 +; GFX9-NEXT: s_lshr_b32 s0, s5, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:4 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:5 -; GFX9-NEXT: s_lshr_b32 s2, s1, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:5 +; GFX9-NEXT: s_lshr_b32 s1, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_bfe_u32 s2, s6, 0x100000 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s6 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:7 -; GFX9-NEXT: s_lshr_b32 s2, s2, s0 +; GFX9-NEXT: s_lshr_b32 s1, s1, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshr_b32 s1, s6, 16 +; GFX9-NEXT: s_lshr_b32 s0, s6, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:8 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:9 -; GFX9-NEXT: s_lshr_b32 s2, s1, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:9 +; GFX9-NEXT: s_lshr_b32 s1, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:10 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_bfe_u32 s2, s7, 0x100000 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s7 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:11 -; GFX9-NEXT: s_lshr_b32 s2, s2, s0 +; 
GFX9-NEXT: s_lshr_b32 s1, s1, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: s_lshr_b32 s1, s7, 16 +; GFX9-NEXT: s_lshr_b32 s0, s7, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:12 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:13 -; GFX9-NEXT: s_lshr_b32 s0, s1, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:14 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:13 +; GFX9-NEXT: s_lshr_b32 s1, s0, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:14 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:15 ; GFX9-NEXT: s_endpgm ; @@ -180,51 +179,50 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-NEXT: s_bfe_u32 s0, 8, 0x100000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s3, s4, 0x100000 -; GFX10-NEXT: s_lshr_b32 s1, s4, 16 +; GFX10-NEXT: s_lshr_b32 s0, s4, 16 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: s_lshr_b32 s2, s5, 16 -; GFX10-NEXT: s_bfe_u32 s4, s5, 0x100000 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: s_lshr_b32 s3, s3, s0 -; GFX10-NEXT: s_lshr_b32 s5, s6, 16 -; GFX10-NEXT: s_bfe_u32 s8, s6, 0x100000 +; GFX10-NEXT: s_lshr_b32 s1, s1, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: s_lshr_b32 s4, s6, 16 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: s_lshr_b32 s6, s1, s0 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 -; GFX10-NEXT: s_lshr_b32 s1, s4, s0 -; GFX10-NEXT: s_lshr_b32 s4, s2, s0 -; GFX10-NEXT: v_mov_b32_e32 v6, s3 -; GFX10-NEXT: v_mov_b32_e32 v7, s6 +; GFX10-NEXT: s_lshr_b32 s6, s0, 8 +; GFX10-NEXT: s_lshr_b32 s0, s3, 8 +; GFX10-NEXT: s_lshr_b32 s3, s2, 8 ; GFX10-NEXT: v_mov_b32_e32 v5, s2 -; GFX10-NEXT: s_lshr_b32 s2, s8, s0 -; GFX10-NEXT: v_mov_b32_e32 v8, s1 -; GFX10-NEXT: v_mov_b32_e32 v9, s4 +; 
GFX10-NEXT: v_mov_b32_e32 v6, s1 +; GFX10-NEXT: s_lshr_b32 s2, s5, 8 +; GFX10-NEXT: v_mov_b32_e32 v7, s6 +; GFX10-NEXT: v_mov_b32_e32 v8, s0 +; GFX10-NEXT: v_mov_b32_e32 v9, s3 ; GFX10-NEXT: ds_write_b8 v1, v0 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:4 ; GFX10-NEXT: ds_write_b8 v1, v4 offset:2 +; GFX10-NEXT: ds_write_b8 v1, v5 offset:6 ; GFX10-NEXT: ds_write_b8 v1, v6 offset:1 ; GFX10-NEXT: ds_write_b8 v1, v7 offset:3 ; GFX10-NEXT: ds_write_b8 v1, v8 offset:5 -; GFX10-NEXT: ds_write_b8 v1, v5 offset:6 -; GFX10-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v10, s2 -; GFX10-NEXT: s_lshr_b32 s1, s5, s0 +; GFX10-NEXT: s_lshr_b32 s0, s4, 8 ; GFX10-NEXT: ds_write_b8 v1, v9 offset:7 ; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 ; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 ; GFX10-NEXT: ds_write_b8 v1, v0 offset:10 -; GFX10-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-NEXT: s_bfe_u32 s1, s7, 0x100000 -; GFX10-NEXT: s_lshr_b32 s2, s7, 16 -; GFX10-NEXT: s_lshr_b32 s1, s1, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s7 +; GFX10-NEXT: s_lshr_b32 s1, s7, 16 +; GFX10-NEXT: s_lshr_b32 s0, s0, 8 ; GFX10-NEXT: v_mov_b32_e32 v2, s7 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: s_lshr_b32 s0, s2, s0 -; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: s_lshr_b32 s0, s1, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, s1 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: ds_write_b8 v1, v0 offset:11 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:12 @@ -238,48 +236,46 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-NEXT: s_bfe_u32 s1, 8, 0x100000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bfe_u32 s3, s4, 0x100000 -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX11-NEXT: s_lshr_b32 s1, s4, 16 +; GFX11-NEXT: s_lshr_b32 s2, s2, 8 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 ; 
GFX11-NEXT: s_lshr_b32 s0, s5, 16 -; GFX11-NEXT: s_lshr_b32 s3, s3, s1 -; GFX11-NEXT: s_bfe_u32 s4, s5, 0x100000 -; GFX11-NEXT: s_bfe_u32 s8, s6, 0x100000 -; GFX11-NEXT: s_lshr_b32 s9, s2, s1 -; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s3 -; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s2 -; GFX11-NEXT: s_lshr_b32 s5, s6, 16 -; GFX11-NEXT: s_lshr_b32 s2, s4, s1 -; GFX11-NEXT: s_lshr_b32 s4, s0, s1 -; GFX11-NEXT: s_lshr_b32 s0, s8, s1 -; GFX11-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s2 -; GFX11-NEXT: v_mov_b32_e32 v8, s4 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6 +; GFX11-NEXT: s_lshr_b32 s4, s6, 16 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s6 +; GFX11-NEXT: s_lshr_b32 s6, s1, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s6 +; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: s_lshr_b32 s1, s3, 8 +; GFX11-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-NEXT: s_lshr_b32 s0, s5, 8 +; GFX11-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v9, s3 ; GFX11-NEXT: ds_store_b8 v1, v0 -; GFX11-NEXT: ds_store_b8 v1, v5 offset:1 -; GFX11-NEXT: ds_store_b8 v1, v3 offset:2 -; GFX11-NEXT: ds_store_b8 v1, v6 offset:3 +; GFX11-NEXT: ds_store_b8 v1, v6 offset:1 +; GFX11-NEXT: ds_store_b8 v1, v4 offset:2 +; GFX11-NEXT: ds_store_b8 v1, v7 offset:3 ; GFX11-NEXT: ds_store_b8 v1, v2 offset:4 -; GFX11-NEXT: ds_store_b8 v1, v7 offset:5 -; GFX11-NEXT: ds_store_b8 v1, v4 offset:6 -; GFX11-NEXT: ds_store_b8 v1, v8 offset:7 -; GFX11-NEXT: v_mov_b32_e32 v3, s5 -; GFX11-NEXT: s_lshr_b32 s2, s7, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v7, s2 -; GFX11-NEXT: s_lshr_b32 s0, s5, s1 -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v5, s7 +; GFX11-NEXT: ds_store_b8 v1, v8 offset:5 +; GFX11-NEXT: ds_store_b8 v1, v5 offset:6 +; 
GFX11-NEXT: ds_store_b8 v1, v9 offset:7 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s7 +; GFX11-NEXT: s_lshr_b32 s0, s4, 8 +; GFX11-NEXT: s_lshr_b32 s1, s7, 16 ; GFX11-NEXT: v_mov_b32_e32 v4, s0 -; GFX11-NEXT: s_bfe_u32 s0, s7, 0x100000 -; GFX11-NEXT: s_lshr_b32 s0, s0, s1 +; GFX11-NEXT: s_and_b32 s0, 0xffff, s7 +; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v7, s1 +; GFX11-NEXT: s_lshr_b32 s0, s0, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v6, s0 -; GFX11-NEXT: s_lshr_b32 s0, s2, s1 +; GFX11-NEXT: s_lshr_b32 s0, s1, 8 ; GFX11-NEXT: v_mov_b32_e32 v8, s0 -; GFX11-NEXT: ds_store_b8 v1, v0 offset:8 -; GFX11-NEXT: ds_store_b8 v1, v2 offset:9 -; GFX11-NEXT: ds_store_b8 v1, v3 offset:10 +; GFX11-NEXT: ds_store_b8 v1, v3 offset:8 +; GFX11-NEXT: ds_store_b8 v1, v0 offset:9 +; GFX11-NEXT: ds_store_b8 v1, v2 offset:10 ; GFX11-NEXT: ds_store_b8 v1, v4 offset:11 ; GFX11-NEXT: ds_store_b8 v1, v5 offset:12 ; GFX11-NEXT: ds_store_b8 v1, v6 offset:13 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll @@ -65,44 +65,43 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_bfe_u32 s0, 8, 0x100000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s3, s4, 0x100000 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: s_lshr_b32 s3, s3, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_lshr_b32 s1, s4, 16 +; GFX9-NEXT: s_lshr_b32 s1, s1, 8 +; GFX9-NEXT: s_lshr_b32 s0, s4, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 -; GFX9-NEXT: s_lshr_b32 s2, s1, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_write_b8 v1, v0 
offset:1 +; GFX9-NEXT: s_lshr_b32 s1, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_bfe_u32 s2, s5, 0x100000 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s5 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:3 -; GFX9-NEXT: s_lshr_b32 s2, s2, s0 +; GFX9-NEXT: s_lshr_b32 s1, s1, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: s_lshr_b32 s1, s5, 16 +; GFX9-NEXT: s_lshr_b32 s0, s5, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:4 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:5 -; GFX9-NEXT: s_lshr_b32 s2, s1, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:5 +; GFX9-NEXT: s_lshr_b32 s1, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_bfe_u32 s2, s6, 0x100000 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s6 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:7 -; GFX9-NEXT: s_lshr_b32 s2, s2, s0 +; GFX9-NEXT: s_lshr_b32 s1, s1, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshr_b32 s1, s6, 16 +; GFX9-NEXT: s_lshr_b32 s0, s6, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:8 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:9 -; GFX9-NEXT: s_lshr_b32 s0, s1, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:10 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:9 +; GFX9-NEXT: s_lshr_b32 s1, s0, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:10 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:11 ; GFX9-NEXT: s_endpgm ; @@ -153,29 +152,30 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-NEXT: s_bfe_u32 s0, 8, 0x100000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s1, s4, 16 -; GFX10-NEXT: s_bfe_u32 s3, s4, 0x100000 -; GFX10-NEXT: 
v_mov_b32_e32 v0, s4 +; GFX10-NEXT: s_lshr_b32 s0, s4, 16 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: s_lshr_b32 s2, s5, 16 -; GFX10-NEXT: s_bfe_u32 s4, s5, 0x100000 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s5 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: s_lshr_b32 s5, s6, 16 -; GFX10-NEXT: s_bfe_u32 s7, s6, 0x100000 +; GFX10-NEXT: s_lshr_b32 s4, s6, 16 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: s_lshr_b32 s6, s1, s0 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 -; GFX10-NEXT: s_lshr_b32 s1, s4, s0 -; GFX10-NEXT: s_lshr_b32 s4, s2, s0 -; GFX10-NEXT: s_lshr_b32 s3, s3, s0 +; GFX10-NEXT: s_lshr_b32 s6, s0, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: s_lshr_b32 s0, s3, 8 +; GFX10-NEXT: s_lshr_b32 s3, s2, 8 +; GFX10-NEXT: s_lshr_b32 s1, s1, 8 ; GFX10-NEXT: v_mov_b32_e32 v5, s2 -; GFX10-NEXT: s_lshr_b32 s2, s7, s0 -; GFX10-NEXT: v_mov_b32_e32 v9, s4 -; GFX10-NEXT: v_mov_b32_e32 v6, s3 +; GFX10-NEXT: s_lshr_b32 s2, s5, 8 +; GFX10-NEXT: v_mov_b32_e32 v9, s3 +; GFX10-NEXT: v_mov_b32_e32 v6, s1 +; GFX10-NEXT: v_mov_b32_e32 v8, s0 +; GFX10-NEXT: v_mov_b32_e32 v10, s2 +; GFX10-NEXT: s_lshr_b32 s0, s4, 8 ; GFX10-NEXT: v_mov_b32_e32 v7, s6 -; GFX10-NEXT: v_mov_b32_e32 v8, s1 ; GFX10-NEXT: ds_write_b8 v1, v0 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:4 ; GFX10-NEXT: ds_write_b8 v1, v4 offset:2 @@ -183,15 +183,13 @@ ; GFX10-NEXT: ds_write_b8 v1, v6 offset:1 ; GFX10-NEXT: ds_write_b8 v1, v7 offset:3 ; GFX10-NEXT: ds_write_b8 v1, v8 offset:5 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: s_lshr_b32 s0, s5, s0 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: ds_write_b8 v1, v9 offset:7 ; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 -; GFX10-NEXT: ds_write_b8 v1, v0 offset:9 -; GFX10-NEXT: ds_write_b8 v1, v2 offset:10 -; GFX10-NEXT: ds_write_b8 v1, v4 
offset:11 +; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 +; GFX10-NEXT: ds_write_b8 v1, v0 offset:10 +; GFX10-NEXT: ds_write_b8 v1, v2 offset:11 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32_align1: @@ -199,27 +197,26 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-NEXT: s_bfe_u32 s1, 8, 0x100000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bfe_u32 s3, s4, 0x100000 -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX11-NEXT: s_lshr_b32 s1, s4, 16 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: s_lshr_b32 s4, s6, 16 +; GFX11-NEXT: s_lshr_b32 s2, s2, 8 ; GFX11-NEXT: s_lshr_b32 s0, s5, 16 -; GFX11-NEXT: s_bfe_u32 s4, s5, 0x100000 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s5 ; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6 -; GFX11-NEXT: s_lshr_b32 s5, s6, 16 -; GFX11-NEXT: s_lshr_b32 s3, s3, s1 -; GFX11-NEXT: s_bfe_u32 s7, s6, 0x100000 -; GFX11-NEXT: s_lshr_b32 s6, s2, s1 -; GFX11-NEXT: v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v7, s3 -; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s0 -; GFX11-NEXT: s_lshr_b32 s2, s4, s1 -; GFX11-NEXT: s_lshr_b32 s4, s0, s1 -; GFX11-NEXT: s_lshr_b32 s0, s7, s1 -; GFX11-NEXT: s_lshr_b32 s1, s5, s1 -; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s2 -; GFX11-NEXT: v_dual_mov_b32 v10, s4 :: v_dual_mov_b32 v11, s0 -; GFX11-NEXT: v_mov_b32_e32 v12, s1 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s6 +; GFX11-NEXT: s_lshr_b32 s6, s1, 8 +; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s2 +; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: s_lshr_b32 s1, s3, 8 +; GFX11-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-NEXT: s_lshr_b32 s0, s5, 8 +; GFX11-NEXT: s_lshr_b32 s5, s4, 8 +; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s1 +; GFX11-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: v_mov_b32_e32 v12, s5 ; GFX11-NEXT: 
ds_store_b8 v1, v0 ; GFX11-NEXT: ds_store_b8 v1, v7 offset:1 ; GFX11-NEXT: ds_store_b8 v1, v4 offset:2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f16.ll @@ -27,9 +27,8 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX8-NEXT: v_fma_f16 v1, v3, v4, v5 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fma.v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x half> %val @@ -51,11 +50,9 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; GFX8-NEXT: v_fma_f16 v0, v0, v2, v4 ; GFX8-NEXT: v_fma_f16 v2, v6, v7, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_fma_f16 v1, v1, v3, v5 -; GFX8-NEXT: v_mov_b32_e32 v3, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] %val = call <3 x half> @llvm.experimental.constrained.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x half> %val @@ -80,13 +77,12 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v5 ; GFX8-NEXT: v_fma_f16 v0, v0, v2, v4 ; GFX8-NEXT: v_fma_f16 v2, v6, v8, v10 -; GFX8-NEXT: v_mov_b32_e32 v4, 16 ; GFX8-NEXT: 
v_fma_f16 v1, v1, v3, v5 ; GFX8-NEXT: v_fma_f16 v3, v7, v9, v11 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] %val = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x half> %val @@ -144,9 +140,8 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX8-NEXT: v_fma_f16 v1, v3, v4, v5 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg <2 x half> %x %neg.y = fneg <2 x half> %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -11,7 +11,7 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 25, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 25, v1 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v0 +; GFX6-NEXT: v_not_b32_e32 v2, v0 ; GFX6-NEXT: v_min_u32_e32 v1, v2, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 
v0, 25, v0 @@ -61,9 +61,8 @@ ; ; GFX8-LABEL: s_uaddsat_i7: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s2, 9, 0x100000 -; GFX8-NEXT: s_lshl_b32 s1, s1, s2 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_lshl_b32 s1, s1, 9 +; GFX8-NEXT: s_lshl_b32 s0, s0, 9 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0 @@ -72,9 +71,8 @@ ; ; GFX9-LABEL: s_uaddsat_i7: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_bfe_u32 s2, 9, 0x100000 -; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b32 s1, s1, 9 +; GFX9-NEXT: s_lshl_b32 s0, s0, 9 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0 @@ -83,9 +81,8 @@ ; ; GFX10PLUS-LABEL: s_uaddsat_i7: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_bfe_u32 s2, 9, 0x100000 -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s2 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9 ; GFX10PLUS-NEXT: v_add_nc_u16 v0, s0, s1 clamp ; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 9, v0 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 @@ -100,7 +97,7 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v0 +; GFX6-NEXT: v_not_b32_e32 v2, v0 ; GFX6-NEXT: v_min_u32_e32 v1, v2, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 @@ -150,9 +147,8 @@ ; ; GFX8-LABEL: s_uaddsat_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX8-NEXT: s_lshl_b32 s1, s1, s2 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, 8 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 @@ -161,9 +157,8 @@ ; ; GFX9-LABEL: s_uaddsat_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000 -; 
GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b32 s1, s1, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 @@ -172,9 +167,8 @@ ; ; GFX10PLUS-LABEL: s_uaddsat_i8: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s2 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8 ; GFX10PLUS-NEXT: v_add_nc_u16 v0, s0, s1 clamp ; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 8, v0 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 @@ -191,12 +185,12 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v0 +; GFX6-NEXT: v_not_b32_e32 v4, v0 ; GFX6-NEXT: v_min_u32_e32 v1, v4, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX6-NEXT: v_xor_b32_e32 v3, -1, v1 +; GFX6-NEXT: v_not_b32_e32 v3, v1 ; GFX6-NEXT: v_min_u32_e32 v2, v3, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1 @@ -225,9 +219,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp @@ -243,9 +238,11 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 
8, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp @@ -260,8 +257,10 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_u16 v0, v0, v1 clamp @@ -302,15 +301,14 @@ ; ; GFX8-LABEL: s_uaddsat_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s4, 8, 0x100000 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8 -; GFX8-NEXT: s_lshl_b32 s1, s1, s4 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 -; GFX8-NEXT: s_lshl_b32 s0, s0, s4 +; GFX8-NEXT: s_lshl_b32 s0, s0, 8 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s3, s4 +; GFX8-NEXT: s_lshl_b32 s1, s3, 8 ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp -; GFX8-NEXT: s_lshl_b32 s0, s2, s4 +; GFX8-NEXT: s_lshl_b32 s0, s2, 8 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u16_e64 v1, s0, v1 clamp ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1 @@ -406,22 +404,22 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_xor_b32_e32 v8, -1, v0 +; GFX6-NEXT: v_not_b32_e32 v8, v0 ; GFX6-NEXT: v_min_u32_e32 v1, v8, v1 ; GFX6-NEXT: v_add_i32_e32 v0, 
vcc, v0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v5 -; GFX6-NEXT: v_xor_b32_e32 v5, -1, v1 +; GFX6-NEXT: v_not_b32_e32 v5, v1 ; GFX6-NEXT: v_min_u32_e32 v2, v5, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v6 -; GFX6-NEXT: v_xor_b32_e32 v5, -1, v2 +; GFX6-NEXT: v_not_b32_e32 v5, v2 ; GFX6-NEXT: v_min_u32_e32 v3, v5, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7 -; GFX6-NEXT: v_xor_b32_e32 v5, -1, v3 +; GFX6-NEXT: v_not_b32_e32 v5, v3 ; GFX6-NEXT: v_min_u32_e32 v4, v5, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 24, v2 @@ -468,14 +466,15 @@ ; GFX9-LABEL: v_uaddsat_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-NEXT: v_perm_b32 v2, v2, v0, s4 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v0 ; GFX9-NEXT: v_alignbit_b32 v0, v3, v0, 16 -; GFX9-NEXT: v_perm_b32 v3, v4, v1, s4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX9-NEXT: v_alignbit_b32 v1, v5, v1, 16 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] @@ -502,15 +501,17 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX10-NEXT: v_perm_b32 v2, v2, v0, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, 
v1 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX10-NEXT: v_alignbit_b32 v0, v3, v0, 16 -; GFX10-NEXT: v_perm_b32 v3, v4, v1, 0x5040100 -; GFX10-NEXT: v_alignbit_b32 v1, v5, v1, 16 +; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 24 -; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] +; GFX10-NEXT: v_lshl_or_b32 v3, v5, 16, v6 +; GFX10-NEXT: v_alignbit_b32 v1, v7, v1, 16 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_u16 v2, v2, v3 clamp @@ -532,12 +533,14 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX11-NEXT: v_perm_b32 v2, v2, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v3, v1, 0x5040100 -; GFX11-NEXT: v_alignbit_b32 v0, v4, v0, 16 -; GFX11-NEXT: v_alignbit_b32 v1, v5, v1, 16 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v4 +; GFX11-NEXT: v_lshl_or_b32 v3, v3, 16, v5 +; GFX11-NEXT: v_alignbit_b32 v0, v6, v0, 16 +; GFX11-NEXT: v_alignbit_b32 v1, v7, v1, 16 ; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] @@ -605,29 +608,28 @@ ; ; GFX8-LABEL: s_uaddsat_v4i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s8, 8, 0x100000 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8 ; GFX8-NEXT: s_lshr_b32 s6, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 -; GFX8-NEXT: s_lshl_b32 s1, s1, s8 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 ; 
GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24 -; GFX8-NEXT: s_lshl_b32 s0, s0, s8 +; GFX8-NEXT: s_lshl_b32 s0, s0, 8 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s5, s8 +; GFX8-NEXT: s_lshl_b32 s1, s5, 8 ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp -; GFX8-NEXT: s_lshl_b32 s0, s2, s8 +; GFX8-NEXT: s_lshl_b32 s0, s2, 8 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u16_e64 v1, s0, v1 clamp -; GFX8-NEXT: s_lshl_b32 s1, s6, s8 +; GFX8-NEXT: s_lshl_b32 s1, s6, 8 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s3, s8 +; GFX8-NEXT: s_lshl_b32 s0, s3, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: s_lshl_b32 s1, s7, s8 +; GFX8-NEXT: s_lshl_b32 s1, s7, 8 ; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_add_u16_e64 v2, s0, v2 clamp -; GFX8-NEXT: s_lshl_b32 s0, s4, s8 +; GFX8-NEXT: s_lshl_b32 s0, s4, 8 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 @@ -782,7 +784,7 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v0 +; GFX6-NEXT: v_not_b32_e32 v2, v0 ; GFX6-NEXT: v_min_u32_e32 v1, v2, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 8, v0 @@ -866,7 +868,7 @@ ; GFX6-LABEL: v_uaddsat_i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v0 +; GFX6-NEXT: v_not_b32_e32 v2, v0 ; GFX6-NEXT: v_min_u32_e32 v1, v2, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -954,7 +956,7 @@ define amdgpu_ps float @uaddsat_i32_vs(i32 %lhs, i32 inreg %rhs) { ; GFX6-LABEL: uaddsat_i32_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX6-NEXT: v_not_b32_e32 v1, v0 ; 
GFX6-NEXT: v_min_u32_e32 v1, s0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -982,10 +984,10 @@ ; GFX6-LABEL: v_uaddsat_v2i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v0 +; GFX6-NEXT: v_not_b32_e32 v4, v0 ; GFX6-NEXT: v_min_u32_e32 v2, v4, v2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v1 +; GFX6-NEXT: v_not_b32_e32 v2, v1 ; GFX6-NEXT: v_min_u32_e32 v2, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1061,13 +1063,13 @@ ; GFX6-LABEL: v_uaddsat_v3i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_xor_b32_e32 v6, -1, v0 +; GFX6-NEXT: v_not_b32_e32 v6, v0 ; GFX6-NEXT: v_min_u32_e32 v3, v6, v3 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GFX6-NEXT: v_xor_b32_e32 v3, -1, v1 +; GFX6-NEXT: v_not_b32_e32 v3, v1 ; GFX6-NEXT: v_min_u32_e32 v3, v3, v4 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX6-NEXT: v_not_b32_e32 v3, v2 ; GFX6-NEXT: v_min_u32_e32 v3, v3, v5 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1157,16 +1159,16 @@ ; GFX6-LABEL: v_uaddsat_v4i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_xor_b32_e32 v8, -1, v0 +; GFX6-NEXT: v_not_b32_e32 v8, v0 ; GFX6-NEXT: v_min_u32_e32 v4, v8, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v1 +; GFX6-NEXT: v_not_b32_e32 v4, v1 ; GFX6-NEXT: v_min_u32_e32 v4, v4, v5 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v2 +; GFX6-NEXT: v_not_b32_e32 v4, v2 ; GFX6-NEXT: v_min_u32_e32 v4, v4, v6 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v3 +; GFX6-NEXT: v_not_b32_e32 v4, v3 ; GFX6-NEXT: v_min_u32_e32 v4, v4, v7 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ 
-1270,19 +1272,19 @@ ; GFX6-LABEL: v_uaddsat_v5i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_xor_b32_e32 v10, -1, v0 +; GFX6-NEXT: v_not_b32_e32 v10, v0 ; GFX6-NEXT: v_min_u32_e32 v5, v10, v5 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GFX6-NEXT: v_xor_b32_e32 v5, -1, v1 +; GFX6-NEXT: v_not_b32_e32 v5, v1 ; GFX6-NEXT: v_min_u32_e32 v5, v5, v6 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GFX6-NEXT: v_xor_b32_e32 v5, -1, v2 +; GFX6-NEXT: v_not_b32_e32 v5, v2 ; GFX6-NEXT: v_min_u32_e32 v5, v5, v7 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_xor_b32_e32 v5, -1, v3 +; GFX6-NEXT: v_not_b32_e32 v5, v3 ; GFX6-NEXT: v_min_u32_e32 v5, v5, v8 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GFX6-NEXT: v_xor_b32_e32 v5, -1, v4 +; GFX6-NEXT: v_not_b32_e32 v5, v4 ; GFX6-NEXT: v_min_u32_e32 v5, v5, v9 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1400,53 +1402,53 @@ ; GFX6-LABEL: v_uaddsat_v16i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_xor_b32_e32 v31, -1, v0 +; GFX6-NEXT: v_not_b32_e32 v31, v0 ; GFX6-NEXT: v_min_u32_e32 v16, v31, v16 ; GFX6-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v16 -; GFX6-NEXT: v_xor_b32_e32 v16, -1, v1 +; GFX6-NEXT: v_not_b32_e32 v16, v1 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v17 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v16 -; GFX6-NEXT: v_xor_b32_e32 v16, -1, v2 +; GFX6-NEXT: v_not_b32_e32 v16, v2 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v18 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v16 -; GFX6-NEXT: v_xor_b32_e32 v16, -1, v3 +; GFX6-NEXT: v_not_b32_e32 v16, v3 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v19 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v16 -; GFX6-NEXT: v_xor_b32_e32 v16, -1, v4 +; GFX6-NEXT: v_not_b32_e32 v16, v4 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v20 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v16 -; GFX6-NEXT: v_xor_b32_e32 v16, -1, v5 +; GFX6-NEXT: v_not_b32_e32 v16, v5 ; 
GFX6-NEXT: v_min_u32_e32 v16, v16, v21 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v16 -; GFX6-NEXT: v_xor_b32_e32 v16, -1, v6 +; GFX6-NEXT: v_not_b32_e32 v16, v6 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v22 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v16 -; GFX6-NEXT: v_xor_b32_e32 v16, -1, v7 +; GFX6-NEXT: v_not_b32_e32 v16, v7 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v23 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v16 -; GFX6-NEXT: v_xor_b32_e32 v16, -1, v8 +; GFX6-NEXT: v_not_b32_e32 v16, v8 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v24 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v16 -; GFX6-NEXT: v_xor_b32_e32 v16, -1, v9 +; GFX6-NEXT: v_not_b32_e32 v16, v9 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v25 ; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v16 -; GFX6-NEXT: v_xor_b32_e32 v16, -1, v10 +; GFX6-NEXT: v_not_b32_e32 v16, v10 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v26 ; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v16 -; GFX6-NEXT: v_xor_b32_e32 v16, -1, v11 +; GFX6-NEXT: v_not_b32_e32 v16, v11 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v27 ; GFX6-NEXT: v_add_i32_e32 v11, vcc, v11, v16 -; GFX6-NEXT: v_xor_b32_e32 v16, -1, v12 +; GFX6-NEXT: v_not_b32_e32 v16, v12 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v28 ; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; GFX6-NEXT: v_xor_b32_e32 v16, -1, v13 +; GFX6-NEXT: v_not_b32_e32 v16, v13 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v29 ; GFX6-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GFX6-NEXT: v_xor_b32_e32 v16, -1, v14 +; GFX6-NEXT: v_not_b32_e32 v16, v14 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v30 ; GFX6-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; GFX6-NEXT: v_xor_b32_e32 v16, -1, v15 +; GFX6-NEXT: v_not_b32_e32 v16, v15 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_min_u32_e32 v16, v16, v31 ; GFX6-NEXT: v_add_i32_e32 v15, vcc, v15, v16 @@ -1751,7 +1753,7 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v0 +; GFX6-NEXT: v_not_b32_e32 v2, v0 ; GFX6-NEXT: 
v_min_u32_e32 v1, v2, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -1848,7 +1850,7 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX6-NEXT: v_not_b32_e32 v1, v0 ; GFX6-NEXT: v_min_u32_e32 v1, s0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -1879,12 +1881,12 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v0 +; GFX6-NEXT: v_not_b32_e32 v4, v0 ; GFX6-NEXT: v_min_u32_e32 v2, v4, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_xor_b32_e32 v3, -1, v1 +; GFX6-NEXT: v_not_b32_e32 v3, v1 ; GFX6-NEXT: v_min_u32_e32 v2, v3, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -1895,10 +1897,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u16_e64 v2, v0, v1 clamp -; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v2i16: @@ -1938,15 +1938,14 @@ ; ; GFX8-LABEL: s_uaddsat_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; 
GFX8-NEXT: v_add_u16_e64 v1, s2, v1 clamp -; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -1989,10 +1988,8 @@ ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_add_u16_e64 v1, s0, v0 clamp -; GFX8-NEXT: v_add_u16_sdwa v0, v2, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_sdwa v0, v2, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: uaddsat_v2i16_sv: @@ -2014,12 +2011,12 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v0 +; GFX6-NEXT: v_not_b32_e32 v2, v0 ; GFX6-NEXT: v_min_u32_e32 v2, s0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v1 +; GFX6-NEXT: v_not_b32_e32 v2, v1 ; GFX6-NEXT: v_min_u32_e32 v2, s0, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -2031,10 +2028,8 @@ ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: 
v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_add_u16_e64 v1, v0, s0 clamp -; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: uaddsat_v2i16_vs: @@ -2068,22 +2063,22 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_xor_b32_e32 v8, -1, v0 +; GFX6-NEXT: v_not_b32_e32 v8, v0 ; GFX6-NEXT: v_min_u32_e32 v4, v8, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_xor_b32_e32 v5, -1, v1 +; GFX6-NEXT: v_not_b32_e32 v5, v1 ; GFX6-NEXT: v_min_u32_e32 v4, v5, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_xor_b32_e32 v5, -1, v2 +; GFX6-NEXT: v_not_b32_e32 v5, v2 ; GFX6-NEXT: v_min_u32_e32 v4, v5, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX6-NEXT: v_xor_b32_e32 v5, -1, v3 +; GFX6-NEXT: v_not_b32_e32 v5, v3 ; GFX6-NEXT: v_min_u32_e32 v4, v5, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -2096,14 +2091,11 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u16_e64 v4, v0, v2 clamp -; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; 
GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_u16_e64 v2, v1, v3 clamp -; GFX8-NEXT: v_add_u16_sdwa v1, v1, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v3, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_sdwa v1, v1, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v4i16: @@ -2160,23 +2152,22 @@ ; ; GFX8-LABEL: s_uaddsat_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s6, s2, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 +; GFX8-NEXT: s_lshr_b32 s6, s2, 16 +; GFX8-NEXT: s_lshr_b32 s5, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: s_lshr_b32 s5, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_add_u16_e64 v1, s4, v1 clamp +; GFX8-NEXT: v_add_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_mov_b32_e32 v4, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp -; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: v_add_u16_e64 v3, s5, v3 clamp -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp -; GFX8-NEXT: v_or_b32_sdwa v0, v0, 
v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: ; return to shader part epilog @@ -2220,32 +2211,32 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX6-NEXT: v_xor_b32_e32 v12, -1, v0 +; GFX6-NEXT: v_not_b32_e32 v12, v0 ; GFX6-NEXT: v_min_u32_e32 v6, v12, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX6-NEXT: v_xor_b32_e32 v7, -1, v1 +; GFX6-NEXT: v_not_b32_e32 v7, v1 ; GFX6-NEXT: v_min_u32_e32 v6, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX6-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX6-NEXT: v_not_b32_e32 v7, v2 ; GFX6-NEXT: v_min_u32_e32 v6, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GFX6-NEXT: v_xor_b32_e32 v7, -1, v3 +; GFX6-NEXT: v_not_b32_e32 v7, v3 ; GFX6-NEXT: v_min_u32_e32 v6, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 -; GFX6-NEXT: v_xor_b32_e32 v7, -1, v4 +; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: v_min_u32_e32 v6, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; GFX6-NEXT: v_xor_b32_e32 v7, -1, v5 
+; GFX6-NEXT: v_not_b32_e32 v7, v5 ; GFX6-NEXT: v_min_u32_e32 v6, v7, v6 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -2260,19 +2251,14 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u16_e64 v6, v0, v3 clamp -; GFX8-NEXT: v_add_u16_sdwa v0, v0, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_u16_e64 v3, v1, v4 clamp -; GFX8-NEXT: v_add_u16_sdwa v1, v1, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_sdwa v1, v1, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_u16_e64 v4, v2, v5 clamp -; GFX8-NEXT: v_add_u16_sdwa v2, v2, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v5, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v3, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_sdwa v2, v2, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v6i16: @@ -2345,31 +2331,30 @@ ; 
; GFX8-LABEL: s_uaddsat_v6i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s9, s3, 16 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 -; GFX8-NEXT: s_lshr_b32 s10, s4, 16 -; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 +; GFX8-NEXT: s_lshr_b32 s9, s3, 16 +; GFX8-NEXT: s_lshr_b32 s10, s4, 16 +; GFX8-NEXT: s_lshr_b32 s8, s2, 16 ; GFX8-NEXT: s_lshr_b32 s11, s5, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_add_u16_e64 v1, s6, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s10 -; GFX8-NEXT: v_mov_b32_e32 v6, 16 -; GFX8-NEXT: s_lshr_b32 s8, s2, 16 -; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_add_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_add_u16_e64 v3, s7, v3 clamp +; GFX8-NEXT: v_add_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 +; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp ; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_add_u16_e64 v5, s8, v5 clamp -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_add_u16_e64 v4, s2, v4 clamp -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_sdwa v5, v6, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v5 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2 @@ -2408,42 +2393,42 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX6-NEXT: v_xor_b32_e32 v16, -1, v0 +; GFX6-NEXT: v_not_b32_e32 v16, v0 ; GFX6-NEXT: v_min_u32_e32 v8, v16, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX6-NEXT: v_xor_b32_e32 v9, -1, v1 +; GFX6-NEXT: v_not_b32_e32 v9, v1 ; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX6-NEXT: v_xor_b32_e32 v9, -1, v2 +; GFX6-NEXT: v_not_b32_e32 v9, v2 ; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX6-NEXT: v_xor_b32_e32 v9, -1, v3 +; GFX6-NEXT: v_not_b32_e32 v9, v3 ; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX6-NEXT: v_xor_b32_e32 v9, -1, v4 +; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13 -; GFX6-NEXT: v_xor_b32_e32 v9, -1, v5 +; GFX6-NEXT: v_not_b32_e32 v9, v5 ; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; GFX6-NEXT: 
v_xor_b32_e32 v9, -1, v6 +; GFX6-NEXT: v_not_b32_e32 v9, v6 ; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 -; GFX6-NEXT: v_xor_b32_e32 v9, -1, v7 +; GFX6-NEXT: v_not_b32_e32 v9, v7 ; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -2460,23 +2445,17 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u16_e64 v8, v0, v4 clamp -; GFX8-NEXT: v_add_u16_sdwa v0, v0, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_u16_e64 v4, v1, v5 clamp -; GFX8-NEXT: v_add_u16_sdwa v1, v1, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_sdwa v1, v1, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_u16_e64 v5, v2, v6 clamp -; GFX8-NEXT: v_add_u16_sdwa v2, v2, v6 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_sdwa v2, v2, v6 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_u16_e64 v6, v3, v7 clamp -; GFX8-NEXT: v_add_u16_sdwa v3, v3, v7 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v7, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_mov_b32_e32 v7, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_sdwa v3, v3, v7 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v8i16: @@ -2565,39 +2544,38 @@ ; ; GFX8-LABEL: s_uaddsat_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s12, s4, 16 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 -; GFX8-NEXT: s_lshr_b32 s13, s5, 16 -; GFX8-NEXT: v_mov_b32_e32 v1, s12 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16 +; GFX8-NEXT: s_lshr_b32 s10, s2, 16 +; GFX8-NEXT: s_lshr_b32 s12, s4, 16 +; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_lshr_b32 s14, s6, 16 +; GFX8-NEXT: s_lshr_b32 s11, s3, 16 ; GFX8-NEXT: s_lshr_b32 s15, s7, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_add_u16_e64 v1, s8, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v1, s12 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NEXT: v_mov_b32_e32 v8, 16 -; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp -; GFX8-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NEXT: v_add_u16_e64 v3, s9, v3 clamp +; GFX8-NEXT: v_mov_b32_e32 v4, s9 ; GFX8-NEXT: v_mov_b32_e32 v5, s14 -; GFX8-NEXT: v_mov_b32_e32 v7, s15 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp +; GFX8-NEXT: v_mov_b32_e32 v6, s10 +; GFX8-NEXT: v_mov_b32_e32 v0, 
s4 +; GFX8-NEXT: v_add_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_add_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: v_add_u16_e64 v5, s10, v5 clamp +; GFX8-NEXT: v_add_u16_sdwa v5, v6, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_add_u16_e64 v7, s11, v7 clamp -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_mov_b32_e32 v7, s15 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp ; GFX8-NEXT: v_add_u16_e64 v4, s2, v4 clamp ; GFX8-NEXT: v_add_u16_e64 v6, s3, v6 clamp -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_sdwa v7, v8, v7 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v6, v7 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -2320,11 +2320,12 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s0 -; GFX9-NEXT: v_perm_b32 v1, v3, v2, s0 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[4:5] ; GFX9-NEXT: global_store_dword v2, v1, s[6:7] @@ -2339,7 +2340,7 @@ ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1 ; GFX10-NEXT: s_sub_i32 s3, 0, s2 -; GFX10-NEXT: s_sub_i32 s6, 0, s1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -2347,10 +2348,10 @@ ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX10-NEXT: s_sub_i32 s3, 0, s1 +; GFX10-NEXT: v_mul_lo_u32 v3, s3, v1 ; GFX10-NEXT: s_and_b32 s3, s0, 0xffff ; GFX10-NEXT: s_lshr_b32 s0, s0, 16 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 @@ -2358,32 +2359,34 @@ ; GFX10-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2 -; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 +; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 ; 
GFX10-NEXT: v_sub_nc_u32_e32 v2, s3, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 +; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s2, v2 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; 
GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: global_store_dword v1, v2, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -59,9 +59,8 @@ ; ; GFX8-LABEL: s_usubsat_i7: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s2, 9, 0x100000 -; GFX8-NEXT: s_lshl_b32 s1, s1, s2 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_lshl_b32 s1, s1, 9 +; GFX8-NEXT: s_lshl_b32 s0, s0, 9 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0 @@ -70,9 +69,8 @@ ; ; GFX9-LABEL: s_usubsat_i7: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_bfe_u32 s2, 9, 0x100000 -; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b32 s1, s1, 9 +; GFX9-NEXT: s_lshl_b32 s0, s0, 9 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0 @@ -81,9 +79,8 @@ ; ; GFX10PLUS-LABEL: s_usubsat_i7: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_bfe_u32 s2, 9, 0x100000 -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s2 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9 ; GFX10PLUS-NEXT: v_sub_nc_u16 v0, s0, s1 clamp ; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 9, v0 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 @@ -146,9 +143,8 @@ ; ; GFX8-LABEL: s_usubsat_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX8-NEXT: s_lshl_b32 s1, s1, s2 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, 8 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 @@ -157,9 +153,8 @@ ; ; GFX9-LABEL: s_usubsat_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_bfe_u32 s2, 
8, 0x100000 -; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b32 s1, s1, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 @@ -168,9 +163,8 @@ ; ; GFX10PLUS-LABEL: s_usubsat_i8: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s2 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8 ; GFX10PLUS-NEXT: v_sub_nc_u16 v0, s0, s1 clamp ; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 8, v0 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 @@ -219,9 +213,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp @@ -237,9 +232,11 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp @@ -254,8 +251,10 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 
8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_sub_u16 v0, v0, v1 clamp @@ -294,15 +293,14 @@ ; ; GFX8-LABEL: s_usubsat_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s4, 8, 0x100000 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8 -; GFX8-NEXT: s_lshl_b32 s1, s1, s4 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 -; GFX8-NEXT: s_lshl_b32 s0, s0, s4 +; GFX8-NEXT: s_lshl_b32 s0, s0, 8 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s3, s4 +; GFX8-NEXT: s_lshl_b32 s1, s3, 8 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp -; GFX8-NEXT: s_lshl_b32 s0, s2, s4 +; GFX8-NEXT: s_lshl_b32 s0, s2, 8 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_sub_u16_e64 v1, s0, v1 clamp ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1 @@ -456,14 +454,15 @@ ; GFX9-LABEL: v_usubsat_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-NEXT: v_perm_b32 v2, v2, v0, s4 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v0 ; GFX9-NEXT: v_alignbit_b32 v0, v3, v0, 16 -; GFX9-NEXT: v_perm_b32 v3, v4, v1, s4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX9-NEXT: v_alignbit_b32 v1, v5, v1, 16 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 
op_sel_hi:[0,1] @@ -490,15 +489,17 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX10-NEXT: v_perm_b32 v2, v2, v0, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX10-NEXT: v_alignbit_b32 v0, v3, v0, 16 -; GFX10-NEXT: v_perm_b32 v3, v4, v1, 0x5040100 -; GFX10-NEXT: v_alignbit_b32 v1, v5, v1, 16 +; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 24 -; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] +; GFX10-NEXT: v_lshl_or_b32 v3, v5, 16, v6 +; GFX10-NEXT: v_alignbit_b32 v1, v7, v1, 16 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_u16 v2, v2, v3 clamp @@ -520,12 +521,14 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX11-NEXT: v_perm_b32 v2, v2, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v3, v1, 0x5040100 -; GFX11-NEXT: v_alignbit_b32 v0, v4, v0, 16 -; GFX11-NEXT: v_alignbit_b32 v1, v5, v1, 16 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v4 +; GFX11-NEXT: v_lshl_or_b32 v3, v3, 16, v5 +; GFX11-NEXT: v_alignbit_b32 v0, v6, v0, 16 +; GFX11-NEXT: v_alignbit_b32 v1, v7, v1, 16 ; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; 
GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] @@ -589,29 +592,28 @@ ; ; GFX8-LABEL: s_usubsat_v4i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s8, 8, 0x100000 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8 ; GFX8-NEXT: s_lshr_b32 s6, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 -; GFX8-NEXT: s_lshl_b32 s1, s1, s8 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24 -; GFX8-NEXT: s_lshl_b32 s0, s0, s8 +; GFX8-NEXT: s_lshl_b32 s0, s0, 8 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s5, s8 +; GFX8-NEXT: s_lshl_b32 s1, s5, 8 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp -; GFX8-NEXT: s_lshl_b32 s0, s2, s8 +; GFX8-NEXT: s_lshl_b32 s0, s2, 8 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_sub_u16_e64 v1, s0, v1 clamp -; GFX8-NEXT: s_lshl_b32 s1, s6, s8 +; GFX8-NEXT: s_lshl_b32 s1, s6, 8 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s3, s8 +; GFX8-NEXT: s_lshl_b32 s0, s3, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: s_lshl_b32 s1, s7, s8 +; GFX8-NEXT: s_lshl_b32 s1, s7, 8 ; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_sub_u16_e64 v2, s0, v2 clamp -; GFX8-NEXT: s_lshl_b32 s0, s4, s8 +; GFX8-NEXT: s_lshl_b32 s0, s4, 8 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 @@ -1807,10 +1809,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u16_e64 v2, v0, v1 clamp -; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v2i16: @@ -1848,15 +1848,14 @@ ; ; GFX8-LABEL: s_usubsat_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: v_sub_u16_e64 v1, s2, v1 clamp -; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -1897,10 +1896,8 @@ ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_sub_u16_e64 v1, s0, v0 clamp -; GFX8-NEXT: v_sub_u16_sdwa v0, v2, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_sdwa v0, v2, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: usubsat_v2i16_sv: @@ -1937,10 +1934,8 @@ ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_sub_u16_e64 v1, v0, s0 clamp 
-; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: usubsat_v2i16_vs: @@ -1998,14 +1993,11 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u16_e64 v4, v0, v2 clamp -; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_e64 v2, v1, v3 clamp -; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v3, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v4i16: @@ -2058,23 +2050,22 @@ ; ; GFX8-LABEL: s_usubsat_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s6, s2, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 +; 
GFX8-NEXT: s_lshr_b32 s6, s2, 16 +; GFX8-NEXT: s_lshr_b32 s5, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: s_lshr_b32 s5, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_sub_u16_e64 v1, s4, v1 clamp +; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_mov_b32_e32 v4, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp -; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: v_sub_u16_e64 v3, s5, v3 clamp -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: ; return to shader part epilog @@ -2152,19 +2143,14 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u16_e64 v6, v0, v3 clamp -; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_e64 v3, v1, v4 clamp -; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: 
v_sub_u16_sdwa v1, v1, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_e64 v4, v2, v5 clamp -; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v5, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v3, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v6i16: @@ -2231,31 +2217,30 @@ ; ; GFX8-LABEL: s_usubsat_v6i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s9, s3, 16 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 -; GFX8-NEXT: s_lshr_b32 s10, s4, 16 -; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 +; GFX8-NEXT: s_lshr_b32 s9, s3, 16 +; GFX8-NEXT: s_lshr_b32 s10, s4, 16 +; GFX8-NEXT: s_lshr_b32 s8, s2, 16 ; GFX8-NEXT: s_lshr_b32 s11, s5, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_sub_u16_e64 v1, s6, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s10 -; GFX8-NEXT: v_mov_b32_e32 v6, 16 -; GFX8-NEXT: s_lshr_b32 s8, s2, 16 -; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: 
v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_sub_u16_e64 v3, s7, v3 clamp +; GFX8-NEXT: v_sub_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp ; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_sub_u16_e64 v5, s8, v5 clamp -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_sub_u16_e64 v4, s2, v4 clamp -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_sdwa v5, v6, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v5 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2 @@ -2338,23 +2323,17 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u16_e64 v8, v0, v4 clamp -; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v4 clamp dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_e64 v4, v1, v5 clamp -; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_e64 v5, v2, v6 clamp -; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v6 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v6 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_e64 v6, v3, v7 clamp -; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v7 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v7, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_mov_b32_e32 v7, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v7 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; 
; GFX9-LABEL: v_usubsat_v8i16: @@ -2435,39 +2414,38 @@ ; ; GFX8-LABEL: s_usubsat_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s12, s4, 16 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 -; GFX8-NEXT: s_lshr_b32 s13, s5, 16 -; GFX8-NEXT: v_mov_b32_e32 v1, s12 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16 +; GFX8-NEXT: s_lshr_b32 s10, s2, 16 +; GFX8-NEXT: s_lshr_b32 s12, s4, 16 +; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_lshr_b32 s14, s6, 16 +; GFX8-NEXT: s_lshr_b32 s11, s3, 16 ; GFX8-NEXT: s_lshr_b32 s15, s7, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_sub_u16_e64 v1, s8, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v1, s12 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NEXT: v_mov_b32_e32 v8, 16 -; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp -; GFX8-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NEXT: v_sub_u16_e64 v3, s9, v3 clamp +; GFX8-NEXT: v_mov_b32_e32 v4, s9 ; GFX8-NEXT: v_mov_b32_e32 v5, s14 -; GFX8-NEXT: v_mov_b32_e32 v7, s15 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp +; GFX8-NEXT: v_mov_b32_e32 v6, s10 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_sub_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: v_sub_u16_e64 v5, s10, v5 clamp +; GFX8-NEXT: v_sub_u16_sdwa v5, v6, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_sub_u16_e64 v7, s11, v7 clamp -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_0 +; GFX8-NEXT: v_mov_b32_e32 v7, s15 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp ; GFX8-NEXT: v_sub_u16_e64 v4, s2, v4 clamp ; GFX8-NEXT: v_sub_u16_e64 v6, s3, v6 clamp -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_sdwa v7, v8, v7 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v6, v7 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll @@ -212,38 +212,19 @@ } define i32 @vector_xnor_i32_one_use(i32 %a, i32 %b) { -; GFX7-LABEL: vector_xnor_i32_one_use: -; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: vector_xnor_i32_one_use: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-LABEL: vector_xnor_i32_one_use: -; GFX900: 
; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX900-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX906-LABEL: vector_xnor_i32_one_use: -; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xnor_b32_e32 v0, v0, v1 -; GFX906-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: vector_xnor_i32_one_use: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 +; GCN-NEXT: v_not_b32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: vector_xnor_i32_one_use: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor3_b32 v0, v0, v1, -1 +; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_not_b32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %xor = xor i32 %a, %b @@ -252,46 +233,23 @@ } define i64 @vector_xnor_i64_one_use(i64 %a, i64 %b) { -; GFX7-LABEL: vector_xnor_i64_one_use: -; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_xor_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_xor_b32_e32 v1, v1, v3 -; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: vector_xnor_i64_one_use: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_xor_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-LABEL: vector_xnor_i64_one_use: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_xor_b32_e32 v0, v0, v2 -; GFX900-NEXT: v_xor_b32_e32 v1, v1, v3 -; GFX900-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX900-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX900-NEXT: s_setpc_b64 
s[30:31] -; -; GFX906-LABEL: vector_xnor_i64_one_use: -; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xnor_b32_e32 v0, v0, v2 -; GFX906-NEXT: v_xnor_b32_e32 v1, v1, v3 -; GFX906-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: vector_xnor_i64_one_use: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_xor_b32_e32 v0, v0, v2 +; GCN-NEXT: v_xor_b32_e32 v1, v1, v3 +; GCN-NEXT: v_not_b32_e32 v0, v0 +; GCN-NEXT: v_not_b32_e32 v1, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: vector_xnor_i64_one_use: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor3_b32 v0, v0, v2, -1 -; GFX10-NEXT: v_xor3_b32 v1, v1, v3, -1 +; GFX10-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX10-NEXT: v_not_b32_e32 v0, v0 +; GFX10-NEXT: v_not_b32_e32 v1, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %xor = xor i64 %a, %b @@ -300,32 +258,16 @@ } define amdgpu_ps float @xnor_s_v_i32_one_use(i32 inreg %s, i32 %v) { -; GFX7-LABEL: xnor_s_v_i32_one_use: -; GFX7: ; %bb.0: -; GFX7-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX7-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: xnor_s_v_i32_one_use: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX900-LABEL: xnor_s_v_i32_one_use: -; GFX900: ; %bb.0: -; GFX900-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX900-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX900-NEXT: ; return to shader part epilog -; -; GFX906-LABEL: xnor_s_v_i32_one_use: -; GFX906: ; %bb.0: -; GFX906-NEXT: v_xnor_b32_e32 v0, s0, v0 -; GFX906-NEXT: ; return to shader part epilog +; GCN-LABEL: xnor_s_v_i32_one_use: +; GCN: ; %bb.0: +; GCN-NEXT: v_xor_b32_e32 v0, s0, v0 +; GCN-NEXT: v_not_b32_e32 v0, v0 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: 
xnor_s_v_i32_one_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_xor3_b32 v0, s0, v0, -1 +; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX10-NEXT: v_not_b32_e32 v0, v0 ; GFX10-NEXT: ; return to shader part epilog %xor = xor i32 %s, %v %d = xor i32 %xor, -1 @@ -334,32 +276,16 @@ } define amdgpu_ps float @xnor_v_s_i32_one_use(i32 inreg %s, i32 %v) { -; GFX7-LABEL: xnor_v_s_i32_one_use: -; GFX7: ; %bb.0: -; GFX7-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX7-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: xnor_v_s_i32_one_use: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX900-LABEL: xnor_v_s_i32_one_use: -; GFX900: ; %bb.0: -; GFX900-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX900-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX900-NEXT: ; return to shader part epilog -; -; GFX906-LABEL: xnor_v_s_i32_one_use: -; GFX906: ; %bb.0: -; GFX906-NEXT: v_xnor_b32_e64 v0, v0, s0 -; GFX906-NEXT: ; return to shader part epilog +; GCN-LABEL: xnor_v_s_i32_one_use: +; GCN: ; %bb.0: +; GCN-NEXT: v_xor_b32_e32 v0, s0, v0 +; GCN-NEXT: v_not_b32_e32 v0, v0 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: xnor_v_s_i32_one_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_xor3_b32 v0, v0, s0, -1 +; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX10-NEXT: v_not_b32_e32 v0, v0 ; GFX10-NEXT: ; return to shader part epilog %xor = xor i32 %v, %s %d = xor i32 %xor, -1 @@ -373,8 +299,8 @@ ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 29 ; GFX7-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_xor_b32_e32 v1, s1, v1 -; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_not_b32_e32 v0, v0 +; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: xnor_i64_s_v_one_use: @@ -382,8 +308,8 @@ ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1] ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_xor_b32_e32 v1, s1, v1 -; GFX8-NEXT: 
v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_not_b32_e32 v0, v0 +; GFX8-NEXT: v_not_b32_e32 v1, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX900-LABEL: xnor_i64_s_v_one_use: @@ -391,22 +317,26 @@ ; GFX900-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1] ; GFX900-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX900-NEXT: v_xor_b32_e32 v1, s1, v1 -; GFX900-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX900-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX900-NEXT: v_not_b32_e32 v0, v0 +; GFX900-NEXT: v_not_b32_e32 v1, v1 ; GFX900-NEXT: ; return to shader part epilog ; ; GFX906-LABEL: xnor_i64_s_v_one_use: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1] -; GFX906-NEXT: v_xnor_b32_e32 v0, s0, v0 -; GFX906-NEXT: v_xnor_b32_e32 v1, s1, v1 +; GFX906-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX906-NEXT: v_xor_b32_e32 v1, s1, v1 +; GFX906-NEXT: v_not_b32_e32 v0, v0 +; GFX906-NEXT: v_not_b32_e32 v1, v1 ; GFX906-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: xnor_i64_s_v_one_use: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1] -; GFX10-NEXT: v_xor3_b32 v0, s0, v0, -1 -; GFX10-NEXT: v_xor3_b32 v1, s1, v1, -1 +; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX10-NEXT: v_xor_b32_e32 v1, s1, v1 +; GFX10-NEXT: v_not_b32_e32 v0, v0 +; GFX10-NEXT: v_not_b32_e32 v1, v1 ; GFX10-NEXT: ; return to shader part epilog entry: %b = shl i64 %b64, 29 @@ -422,8 +352,8 @@ ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 29 ; GFX7-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_xor_b32_e32 v1, s1, v1 -; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_not_b32_e32 v0, v0 +; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: xnor_i64_v_s_one_use: @@ -431,8 +361,8 @@ ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1] ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_xor_b32_e32 v1, s1, v1 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: 
v_not_b32_e32 v0, v0 +; GFX8-NEXT: v_not_b32_e32 v1, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX900-LABEL: xnor_i64_v_s_one_use: @@ -440,22 +370,26 @@ ; GFX900-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1] ; GFX900-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX900-NEXT: v_xor_b32_e32 v1, s1, v1 -; GFX900-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX900-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX900-NEXT: v_not_b32_e32 v0, v0 +; GFX900-NEXT: v_not_b32_e32 v1, v1 ; GFX900-NEXT: ; return to shader part epilog ; ; GFX906-LABEL: xnor_i64_v_s_one_use: ; GFX906: ; %bb.0: ; GFX906-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1] -; GFX906-NEXT: v_xnor_b32_e64 v0, v0, s0 -; GFX906-NEXT: v_xnor_b32_e64 v1, v1, s1 +; GFX906-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX906-NEXT: v_xor_b32_e32 v1, s1, v1 +; GFX906-NEXT: v_not_b32_e32 v0, v0 +; GFX906-NEXT: v_not_b32_e32 v1, v1 ; GFX906-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: xnor_i64_v_s_one_use: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1] -; GFX10-NEXT: v_xor3_b32 v0, v0, s0, -1 -; GFX10-NEXT: v_xor3_b32 v1, v1, s1, -1 +; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX10-NEXT: v_xor_b32_e32 v1, s1, v1 +; GFX10-NEXT: v_not_b32_e32 v0, v0 +; GFX10-NEXT: v_not_b32_e32 v1, v1 ; GFX10-NEXT: ; return to shader part epilog %b = shl i64 %b64, 29 %xor = xor i64 %b, %a @@ -468,21 +402,21 @@ ; GFX7-LABEL: vector_xor_na_b_i32_one_use: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX7-NEXT: v_not_b32_e32 v0, v0 ; GFX7-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: vector_xor_na_b_i32_one_use: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: vector_xor_na_b_i32_one_use: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX900-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX900-NEXT: v_not_b32_e32 v0, v0 ; GFX900-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -508,21 +442,21 @@ ; GFX7-LABEL: vector_xor_a_nb_i32_one_use: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: vector_xor_a_nb_i32_one_use: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_not_b32_e32 v1, v1 ; GFX8-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: vector_xor_a_nb_i32_one_use: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX900-NEXT: v_not_b32_e32 v1, v1 ; GFX900-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll --- a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll +++ b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll @@ -139,6 +139,10 @@ ; GISEL-LABEL: csh_v4i32: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_and_b32_e32 v4, 31, v4 +; GISEL-NEXT: v_and_b32_e32 v5, 31, v5 +; GISEL-NEXT: v_and_b32_e32 v6, 31, v6 +; GISEL-NEXT: v_and_b32_e32 v7, 31, v7 ; GISEL-NEXT: v_lshlrev_b32_e32 v8, v4, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v9, v5, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v10, v6, v2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll @@ -31,7 +31,8 @@ ; ; GFX10GISEL-LABEL: sample_d_2d: ; GFX10GISEL: ; %bb.0: ; 
%main_body -; GFX10GISEL-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10GISEL-NEXT: image_sample_d v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -58,8 +59,9 @@ ; GFX10GISEL-LABEL: sample_d_3d: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v7 +; GFX10GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v8 -; GFX10GISEL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100 +; GFX10GISEL-NEXT: v_lshl_or_b32 v6, v9, 16, v6 ; GFX10GISEL-NEXT: image_sample_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -95,7 +97,8 @@ ; ; GFX10GISEL-LABEL: sample_c_d_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10GISEL-NEXT: image_sample_c_d v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -114,7 +117,8 @@ ; ; GFX10GISEL-LABEL: sample_d_cl_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10GISEL-NEXT: image_sample_d_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -139,8 +143,9 @@ ; GFX10GISEL-LABEL: sample_d_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v5 +; GFX10GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX10GISEL-NEXT: v_mov_b32_e32 v5, v6 -; GFX10GISEL-NEXT: v_perm_b32 v4, v7, v4, 0x5040100 +; 
GFX10GISEL-NEXT: v_lshl_or_b32 v4, v7, 16, v4 ; GFX10GISEL-NEXT: image_sample_d_cl v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -159,7 +164,8 @@ ; ; GFX10GISEL-LABEL: sample_c_d_cl_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10GISEL-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10GISEL-NEXT: image_sample_c_d_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -185,8 +191,9 @@ ; GFX10GISEL-LABEL: sample_c_d_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v6 +; GFX10GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX10GISEL-NEXT: v_mov_b32_e32 v6, v7 -; GFX10GISEL-NEXT: v_perm_b32 v5, v8, v5, 0x5040100 +; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v8, 16, v5 ; GFX10GISEL-NEXT: image_sample_c_d_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -222,7 +229,8 @@ ; ; GFX10GISEL-LABEL: sample_cd_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10GISEL-NEXT: image_sample_cd v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -258,7 +266,8 @@ ; ; GFX10GISEL-LABEL: sample_c_cd_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10GISEL-NEXT: image_sample_c_cd v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt 
vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -277,7 +286,8 @@ ; ; GFX10GISEL-LABEL: sample_cd_cl_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10GISEL-NEXT: image_sample_cd_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -302,8 +312,9 @@ ; GFX10GISEL-LABEL: sample_cd_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v5 +; GFX10GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX10GISEL-NEXT: v_mov_b32_e32 v5, v6 -; GFX10GISEL-NEXT: v_perm_b32 v4, v7, v4, 0x5040100 +; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v7, 16, v4 ; GFX10GISEL-NEXT: image_sample_cd_cl v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -322,7 +333,8 @@ ; ; GFX10GISEL-LABEL: sample_c_cd_cl_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10GISEL-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10GISEL-NEXT: image_sample_c_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -348,8 +360,9 @@ ; GFX10GISEL-LABEL: sample_c_cd_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v6 +; GFX10GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX10GISEL-NEXT: v_mov_b32_e32 v6, v7 -; GFX10GISEL-NEXT: v_perm_b32 v5, v8, v5, 0x5040100 +; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v8, 16, v5 ; GFX10GISEL-NEXT: image_sample_c_cd_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -376,8 +389,9 @@ ; 
GFX10GISEL-LABEL: sample_c_d_o_2darray_V1: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v7 +; GFX10GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v8 -; GFX10GISEL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100 +; GFX10GISEL-NEXT: v_lshl_or_b32 v6, v9, 16, v6 ; GFX10GISEL-NEXT: image_sample_c_d_o v0, v[0:7], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -404,8 +418,9 @@ ; GFX10GISEL-LABEL: sample_c_d_o_2darray_V2: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v7 +; GFX10GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v8 -; GFX10GISEL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100 +; GFX10GISEL-NEXT: v_lshl_or_b32 v6, v9, 16, v6 ; GFX10GISEL-NEXT: image_sample_c_d_o v[0:1], v[0:7], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -464,8 +479,10 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_d_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX10GISEL-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -488,9 +505,11 @@ ; GFX10GISEL-LABEL: sample_g16_noa16_d_3d: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3 +; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v2 -; GFX10GISEL-NEXT: v_perm_b32 v2, v1, v0, 0x5040100 -; GFX10GISEL-NEXT: v_perm_b32 v4, v4, v9, 0x5040100 +; GFX10GISEL-NEXT: 
v_and_b32_e32 v9, 0xffff, v9 +; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v1, 16, v0 +; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v4, 16, v9 ; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -527,8 +546,10 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 -; GFX10GISEL-NEXT: v_perm_b32 v2, v4, v3, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3 ; GFX10GISEL-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -565,8 +586,10 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_d_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX10GISEL-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; GFX10GISEL-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -606,10 +629,11 @@ ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v2 -; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0 -; GFX10GISEL-NEXT: v_perm_b32 v3, v8, v1, 0x5040100 -; GFX10GISEL-NEXT: v_perm_b32 v4, v4, v9, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX10GISEL-NEXT: v_lshl_or_b32 
v3, v8, 16, v0 +; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v4, 16, v1 ; GFX10GISEL-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -646,8 +670,10 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_cd_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX10GISEL-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; GFX10GISEL-NEXT: image_sample_cd_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -684,8 +710,10 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 -; GFX10GISEL-NEXT: v_perm_b32 v2, v4, v3, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3 ; GFX10GISEL-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -722,8 +750,10 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_cd_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX10GISEL-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; GFX10GISEL-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; 
GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -763,10 +793,11 @@ ; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v2 -; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0 -; GFX10GISEL-NEXT: v_perm_b32 v3, v8, v1, 0x5040100 -; GFX10GISEL-NEXT: v_perm_b32 v4, v4, v9, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX10GISEL-NEXT: v_lshl_or_b32 v3, v8, 16, v0 +; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v4, 16, v1 ; GFX10GISEL-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -792,11 +823,12 @@ ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v2 ; GFX10GISEL-NEXT: v_mov_b32_e32 v10, v3 -; GFX10GISEL-NEXT: v_mov_b32_e32 v11, v4 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0 ; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v1 -; GFX10GISEL-NEXT: v_perm_b32 v4, v10, v9, 0x5040100 -; GFX10GISEL-NEXT: v_perm_b32 v5, v5, v11, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v5, 16, v1 +; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v10, 16, v0 ; GFX10GISEL-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -822,11 +854,12 @@ ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v2 ; GFX10GISEL-NEXT: v_mov_b32_e32 v10, v3 -; GFX10GISEL-NEXT: v_mov_b32_e32 v11, v4 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0 ; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v1 -; GFX10GISEL-NEXT: v_perm_b32 v4, v10, v9, 0x5040100 -; GFX10GISEL-NEXT: v_perm_b32 v5, v5, v11, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; 
GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v5, 16, v1 +; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v10, 16, v0 ; GFX10GISEL-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -886,9 +919,12 @@ ; ; GFX10GISEL-LABEL: sample_d_2d_g16_a16: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX10GISEL-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 -; GFX10GISEL-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v5, 16, v4 ; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -914,11 +950,14 @@ ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3 ; GFX10GISEL-NEXT: v_mov_b32_e32 v10, v7 -; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v2 ; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v8 -; GFX10GISEL-NEXT: v_perm_b32 v2, v1, v0, 0x5040100 -; GFX10GISEL-NEXT: v_perm_b32 v4, v4, v9, 0x5040100 -; GFX10GISEL-NEXT: v_perm_b32 v6, v10, v6, 0x5040100 +; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX10GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v9 +; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v2 +; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v1, 16, v0 +; GFX10GISEL-NEXT: v_lshl_or_b32 v6, v10, 16, v6 +; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v4, 16, v8 ; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog diff 
--git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll @@ -32,7 +32,7 @@ ; GFX7GLISEL-NEXT: s_mov_b32 s2, -1 ; GFX7GLISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX7GLISEL-NEXT: s_and_b32 s3, s3, 0x7fff -; GFX7GLISEL-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX7GLISEL-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX7GLISEL-NEXT: s_cmpk_gt_u32 s3, 0x7c00 ; GFX7GLISEL-NEXT: s_cselect_b32 s3, 1, 0 ; GFX7GLISEL-NEXT: s_bfe_i32 s3, s3, 0x10000 @@ -217,7 +217,7 @@ ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7e00 @@ -274,7 +274,7 @@ ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7e00 ; GFX7GLISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -440,7 +440,7 @@ ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v2, v1, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v2 ; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -506,7 +506,7 @@ ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v2, v1, 0, 16 +; 
GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GFX7GLISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, v2 ; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -626,7 +626,7 @@ ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v2, v1, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GFX7GLISEL-NEXT: v_cmp_ne_u32_e32 vcc, v0, v2 ; GFX7GLISEL-NEXT: v_subrev_i32_e64 v0, s[4:5], 1, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -844,7 +844,7 @@ ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v0, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], v1, v0 @@ -900,7 +900,7 @@ ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -954,7 +954,7 @@ ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, v0 @@ -1016,10 +1016,10 @@ ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 +; 
GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX7GLISEL-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s4, v1 @@ -1096,13 +1096,13 @@ ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX7GLISEL-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s4, v1 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1248,17 +1248,17 @@ ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX7GLISEL-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7GLISEL-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s4, v1 -; GFX7GLISEL-NEXT: v_bfe_u32 v3, v3, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v3, 0xffff, 
v3 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s4, v2 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc @@ -1397,7 +1397,7 @@ ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -1451,7 +1451,7 @@ ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -1507,7 +1507,7 @@ ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -1566,7 +1566,7 @@ ; GFX7GLISEL: ; %bb.0: ; %entry ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v1, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; GFX7GLISEL-NEXT: v_subrev_i32_e64 v0, s[4:5], 1, v0 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -1632,7 +1632,7 @@ ; GFX7GLISEL: ; %bb.0: ; %entry ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v1, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; GFX7GLISEL-NEXT: 
s_movk_i32 s4, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], s4, v1 @@ -1757,7 +1757,7 @@ ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v1, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; GFX7GLISEL-NEXT: v_subrev_i32_e64 v0, s[4:5], 1, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff @@ -1828,7 +1828,7 @@ ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v2, v1, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GFX7GLISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, v2 ; GFX7GLISEL-NEXT: v_subrev_i32_e64 v0, s[6:7], 1, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -1906,7 +1906,7 @@ ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v2, v1, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v2 ; GFX7GLISEL-NEXT: v_subrev_i32_e64 v0, s[6:7], 1, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -2037,7 +2037,7 @@ ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v1, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; GFX7GLISEL-NEXT: s_movk_i32 s6, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v1 @@ -2101,7 +2101,7 @@ ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; 
GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31] @@ -2159,7 +2159,7 @@ ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 -; GFX7GLISEL-NEXT: v_bfe_u32 v1, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; GFX7GLISEL-NEXT: s_movk_i32 s6, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v1 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -2288,7 +2288,7 @@ ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: s_movk_i32 s6, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX7GLISEL-NEXT: v_cmp_gt_u32_e64 s[4:5], s6, v1 @@ -2356,7 +2356,7 @@ ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], v1, v2 @@ -2422,7 +2422,7 @@ ; GFX7GLISEL-NEXT: s_movk_i32 s6, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v0, v1, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc @@ -2481,7 +2481,7 @@ ; GFX7GLISEL: ; %bb.0: ; %entry ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 
0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7GLISEL-NEXT: v_cmp_gt_u32_e64 s[4:5], v0, v1 @@ -2542,7 +2542,7 @@ ; GFX7GLISEL: ; %bb.0: ; %entry ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7GLISEL-NEXT: v_cmp_gt_u32_e64 s[4:5], v0, v1 @@ -2603,7 +2603,7 @@ ; GFX7GLISEL: ; %bb.0: ; %entry ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7GLISEL-NEXT: v_cmp_gt_u32_e64 s[4:5], v0, v1 @@ -2675,7 +2675,7 @@ ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 -; GFX7GLISEL-NEXT: v_bfe_u32 v1, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -2751,7 +2751,7 @@ ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 -; GFX7GLISEL-NEXT: v_bfe_u32 v1, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -2827,7 +2827,7 @@ ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 -; GFX7GLISEL-NEXT: v_bfe_u32 v1, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; 
GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -2892,7 +2892,7 @@ ; GFX7GLISEL: ; %bb.0: ; %entry ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7e00 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7GLISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1 @@ -2956,7 +2956,7 @@ ; GFX7GLISEL: ; %bb.0: ; %entry ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_gt_u32_e64 s[4:5], v0, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7e00 @@ -3036,7 +3036,7 @@ ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 -; GFX7GLISEL-NEXT: v_bfe_u32 v1, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; GFX7GLISEL-NEXT: s_movk_i32 s8, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s8, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7e00 @@ -3120,7 +3120,7 @@ ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 -; GFX7GLISEL-NEXT: v_bfe_u32 v1, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v2 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7e00 @@ -3186,7 +3186,7 @@ ; GFX7GLISEL: ; %bb.0: ; %entry ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: s_movk_i32 
s4, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], s4, v0 @@ -3245,7 +3245,7 @@ ; GFX7GLISEL: ; %bb.0: ; %entry ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -3302,7 +3302,7 @@ ; GFX7GLISEL: ; %bb.0: ; %entry ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], s4, v0 @@ -3361,7 +3361,7 @@ ; GFX7GLISEL: ; %bb.0: ; %entry ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll --- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll @@ -14,15 +14,15 @@ ; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo: -; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD -; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo: ; SDAG-CI: ; %bb.0: @@ -32,18 +32,6 @@ ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo: -; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; GISEL-VI-NEXT: v_mov_b32_e32 v1, 16 -; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GISEL-VI-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -71,16 +59,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: -; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; SDAG-VI-NEXT: v_or_b32_e32 v0, 0x3c00, v0 -; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, 0x3c00, v0 +; VI-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: ; SDAG-CI: ; %bb.0: @@ -91,21 +79,6 @@ ; SDAG-CI-NEXT: v_mov_b32_e32 v0, 1.0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: -; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GISEL-VI-NEXT: s_movk_i32 s4, 0x3c00 -; GISEL-VI-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; GISEL-VI-NEXT: v_mov_b32_e32 v1, 16 -; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GISEL-VI-NEXT: v_or_b32_e32 v0, s4, v0 -; GISEL-VI-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -133,16 +106,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: -; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; SDAG-VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; VI-LABEL: 
v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: ; SDAG-CI: ; %bb.0: @@ -153,19 +126,6 @@ ; SDAG-CI-NEXT: v_mov_b32_e32 v0, v3 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: -; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; GISEL-VI-NEXT: v_mov_b32_e32 v1, 16 -; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GISEL-VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GISEL-VI-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -194,15 +154,15 @@ ; SDAG-GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: -; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; SDAG-VI-NEXT: s_setpc_b64 
s[30:31] +; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: ; SDAG-CI: ; %bb.0: @@ -216,22 +176,10 @@ ; GISEL-GFX9: ; %bb.0: ; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] -; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 16 -; GISEL-GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: -; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; GISEL-VI-NEXT: v_mov_b32_e32 v1, 16 -; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GISEL-VI-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -240,7 +188,6 @@ ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; GISEL-CI-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float @@ -330,15 +277,15 @@ ; GFX9-NEXT: 
v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: -; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp -; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp +; VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: ; SDAG-CI: ; %bb.0: @@ -348,18 +295,6 @@ ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: -; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GISEL-VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp -; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-VI-NEXT: v_mov_b32_e32 v1, 16 -; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GISEL-VI-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -387,15 +322,15 @@ ; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp ; 
GFX9-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: -; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: ; SDAG-CI: ; %bb.0: @@ -405,18 +340,6 @@ ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v0 clamp ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: -; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v0, v2 clamp -; GISEL-VI-NEXT: v_mov_b32_e32 v1, 16 -; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GISEL-VI-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -455,18 +378,18 @@ ; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-VI-LABEL: 
v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: -; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SDAG-VI-NEXT: flat_store_short v[0:1], v0 -; SDAG-VI-NEXT: s_waitcnt vmcnt(0) -; SDAG-VI-NEXT: v_max_f16_sdwa v0, v0, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; VI-NEXT: flat_store_short v[0:1], v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_max_f16_sdwa v0, v0, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: ; SDAG-CI: ; %bb.0: @@ -480,21 +403,6 @@ ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: -; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; GISEL-VI-NEXT: flat_store_short v[0:1], v0 -; GISEL-VI-NEXT: s_waitcnt vmcnt(0) -; GISEL-VI-NEXT: v_max_f16_e64 v0, v0, v0 clamp -; GISEL-VI-NEXT: v_mov_b32_e32 v1, 16 -; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_0 -; GISEL-VI-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -539,4 +447,3 @@ attributes #1 = { nounwind readnone speculatable } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CI: {{.*}} -; VI: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -315,11 +315,9 @@ ; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GISEL-VI-NEXT: v_mac_f32_e32 v5, v3, v4 ; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; GISEL-VI-NEXT: v_mov_b32_e32 v2, 16 -; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GISEL-VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-VI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_v2f32: @@ -446,16 +444,13 @@ ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2 ; GISEL-VI-NEXT: v_mac_f32_e32 v8, v6, v7 -; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GISEL-VI-NEXT: v_mac_f32_e32 v5, v1, v3 -; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; 
GISEL-VI-NEXT: v_mov_b32_e32 v3, 16 -; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GISEL-VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GISEL-VI-NEXT: v_bfe_u32 v1, v2, 0, 16 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-VI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_v3f32: @@ -603,29 +598,26 @@ ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v10, v4 -; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v7, v1 ; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v8, v2 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v9, v3 ; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v10, v4 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v11, v5 ; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2 ; GISEL-VI-NEXT: v_mac_f32_e32 v10, v6, v8 +; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2 ; GISEL-VI-NEXT: v_mac_f32_e32 v11, v7, v9 ; GISEL-VI-NEXT: v_mac_f32_e32 v5, v1, v3 -; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v4 ; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v3, v5 +; 
GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v2, v11 -; GISEL-VI-NEXT: v_mov_b32_e32 v4, 16 -; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GISEL-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GISEL-VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-VI-NEXT: v_or_b32_e32 v1, v2, v3 ; GISEL-VI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_v4f32: @@ -729,11 +721,9 @@ ; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GISEL-VI-NEXT: v_mac_f32_e32 v5, v3, v4 ; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v0, v2 clamp -; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v1, v5 clamp -; GISEL-VI-NEXT: v_mov_b32_e32 v2, 16 -; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GISEL-VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v0, v5 clamp +; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v1, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-VI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt: @@ -885,16 +875,13 @@ ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2 ; GISEL-VI-NEXT: v_mac_f32_e32 v8, v6, v7 -; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v0, v4 
clamp +; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2 +; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v0, v8 clamp +; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v2, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GISEL-VI-NEXT: v_mac_f32_e32 v5, v1, v3 -; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v1, v8 clamp -; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v2, v5 clamp -; GISEL-VI-NEXT: v_mov_b32_e32 v3, 16 -; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GISEL-VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GISEL-VI-NEXT: v_bfe_u32 v1, v2, 0, 16 +; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v1, v5 clamp +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-VI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_v3f32_clamp_postcvt: @@ -1042,29 +1029,26 @@ ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v10, v4 -; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v7, v1 ; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v8, v2 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v9, v3 ; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v10, v4 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v11, v5 ; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 -; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2 ; GISEL-VI-NEXT: v_mac_f32_e32 v10, v6, v8 +; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2 ; GISEL-VI-NEXT: v_mac_f32_e32 v11, v7, v9 ; GISEL-VI-NEXT: v_mac_f32_e32 v5, v1, v3 -; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v1, v4 clamp ; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v0, v10 clamp -; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v3, v5 clamp +; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v1, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v2, v11 clamp -; GISEL-VI-NEXT: v_mov_b32_e32 v4, 16 -; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GISEL-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GISEL-VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v3, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-VI-NEXT: v_or_b32_e32 v1, v2, v3 ; GISEL-VI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_v4f32_clamp_postcvt: @@ -1192,9 +1176,8 @@ ; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] ; GISEL-GFX900-NEXT: v_max_f16_e64 v4, v3, v3 clamp ; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GISEL-GFX900-NEXT: v_bfe_u32 v0, v4, 0, 16 -; GISEL-GFX900-NEXT: v_mov_b32_e32 v1, 0xffff0000 -; GISEL-GFX900-NEXT: v_and_or_b32 v0, v3, v1, v0 +; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, 0xffff0000 +; GISEL-GFX900-NEXT: v_and_or_b32 v0, v3, v0, v4 ; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX906-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: @@ -1203,9 +1186,8 @@ ; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] ; GISEL-GFX906-NEXT: v_max_f16_e64 v4, 
v3, v3 clamp ; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GISEL-GFX906-NEXT: v_bfe_u32 v0, v4, 0, 16 -; GISEL-GFX906-NEXT: v_mov_b32_e32 v1, 0xffff0000 -; GISEL-GFX906-NEXT: v_and_or_b32 v0, v3, v1, v0 +; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, 0xffff0000 +; GISEL-GFX906-NEXT: v_and_or_b32 v0, v3, v0, v4 ; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: @@ -1219,14 +1201,12 @@ ; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GISEL-VI-NEXT: v_mac_f32_e32 v5, v3, v4 ; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; GISEL-VI-NEXT: v_mov_b32_e32 v2, 16 -; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GISEL-VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-VI-NEXT: v_max_f16_e64 v1, v0, v0 clamp ; GISEL-VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GISEL-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-VI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: @@ -1239,14 +1219,12 @@ ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v5 ; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GISEL-CI-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v4 ; GISEL-CI-NEXT: 
v_cvt_f32_f16_e32 v2, 0 -; GISEL-CI-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GISEL-CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2 @@ -1255,7 +1233,6 @@ ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-CI-NEXT: v_bfe_u32 v1, v1, 0, 16 ; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] @@ -1333,8 +1310,8 @@ ; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] ; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v4, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp ; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, 16 -; GISEL-GFX900-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GISEL-GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GISEL-GFX900-NEXT: v_mov_b32_e32 v1, 0xffff ; GISEL-GFX900-NEXT: v_and_or_b32 v0, v3, v1, v0 ; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1345,8 +1322,8 @@ ; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] ; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v4, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp ; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, 16 -; GISEL-GFX906-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GISEL-GFX906-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GISEL-GFX906-NEXT: v_mov_b32_e32 v1, 0xffff ; GISEL-GFX906-NEXT: v_and_or_b32 v0, v3, v1, v0 ; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1362,14 
+1339,10 @@ ; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GISEL-VI-NEXT: v_mac_f32_e32 v5, v3, v4 ; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; GISEL-VI-NEXT: v_mov_b32_e32 v2, 16 -; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GISEL-VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GISEL-VI-NEXT: v_max_f16_sdwa v1, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GISEL-VI-NEXT: v_mov_b32_e32 v2, 16 -; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-VI-NEXT: v_max_f16_sdwa v1, v0, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GISEL-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GISEL-VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1383,14 +1356,12 @@ ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v5 ; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GISEL-CI-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v4 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0 -; GISEL-CI-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0 ; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GISEL-CI-NEXT: 
v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -1400,7 +1371,6 @@ ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-CI-NEXT: v_bfe_u32 v1, v1, 0, 16 ; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -1510,11 +1480,9 @@ ; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GISEL-VI-NEXT: v_mad_f32 v3, v3, v4, v5 clamp ; GISEL-VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp -; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GISEL-VI-NEXT: v_mov_b32_e32 v2, 16 -; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GISEL-VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GISEL-VI-NEXT: v_or_b32_e32 v0, v1, v0 ; GISEL-VI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_v2f32_clamp_precvt: @@ -1657,16 +1625,13 @@ ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GISEL-VI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp ; GISEL-VI-NEXT: v_mad_f32 v6, v6, v7, v8 clamp -; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-VI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp +; GISEL-VI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp ; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GISEL-VI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp ; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, 16 -; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GISEL-VI-NEXT: v_or_b32_sdwa v0, v2, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GISEL-VI-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GISEL-VI-NEXT: v_or_b32_e32 v0, v2, v0 ; GISEL-VI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_v3f32_clamp_precvt: @@ -1846,15 +1811,12 @@ ; GISEL-VI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp ; GISEL-VI-NEXT: v_mad_f32 v2, v7, v9, v11 clamp ; GISEL-VI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp -; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GISEL-VI-NEXT: v_mov_b32_e32 v4, 16 -; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GISEL-VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GISEL-VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GISEL-VI-NEXT: v_or_b32_e32 v0, v3, v0 +; GISEL-VI-NEXT: v_or_b32_e32 v1, v2, v1 ; GISEL-VI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_v4f32_clamp_precvt: diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll --- a/llvm/test/CodeGen/AMDGPU/roundeven.ll +++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll @@ -437,10 +437,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_rndne_f16_e32 v1, v0 -; GFX8-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: 
v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_roundeven_v2f16: @@ -568,10 +566,8 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; GFX8-NEXT: v_rndne_f16_e32 v1, v0 -; GFX8-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_roundeven_v2f16_fneg: @@ -712,14 +708,11 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_rndne_f16_e32 v2, v0 -; GFX8-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_rndne_f16_e32 v3, v1 -; GFX8-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v4, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: 
v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_roundeven_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll @@ -84,10 +84,8 @@ ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mul_f16_e32 v2, v0, v1 -; GFX8-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_constained_fmul_v2f16_fpexcept_strict: @@ -119,10 +117,8 @@ ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mul_f16_e32 v2, v0, v1 -; GFX8-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_constained_fmul_v2f16_fpexcept_ignore: @@ -154,10 +150,8 @@ ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mul_f16_e32 v2, v0, v1 -; GFX8-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_constained_fmul_v2f16_fpexcept_maytrap: @@ -198,12 +192,9 @@ ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mul_f16_e32 v4, v0, v2 -; GFX8-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_mul_f16_e32 v1, v1, v3 -; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-GISEL-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_constained_fmul_v3f16_fpexcept_strict: @@ -277,14 +268,11 @@ ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mul_f16_e32 v4, v0, v2 -; GFX8-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: 
v_mul_f16_e32 v2, v1, v3 -; GFX8-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, 16 -; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-GISEL-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-GISEL-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_constained_fmul_v4f16_fpexcept_strict: @@ -370,15 +358,14 @@ ; ; GFX8-GISEL-LABEL: s_constained_fmul_v2f16_fpexcept_strict: ; GFX8-GISEL: ; %bb.0: -; GFX8-GISEL-NEXT: s_lshr_b32 s1, s3, 16 ; GFX8-GISEL-NEXT: s_lshr_b32 s0, s2, 16 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-GISEL-NEXT: s_lshr_b32 s1, s3, 16 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-GISEL-NEXT: v_mul_f16_e32 v1, s0, v1 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-GISEL-NEXT: v_mul_f16_e32 v0, s2, v0 -; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-GISEL-NEXT: v_mul_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-GISEL-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_constained_fmul_v2f16_fpexcept_strict: diff --git 
a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll @@ -93,10 +93,8 @@ ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX8-GISEL-NEXT: v_add_f16_e32 v2, v0, v1 -; GFX8-GISEL-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-GISEL-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_strict: @@ -165,10 +163,8 @@ ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX8-GISEL-NEXT: v_add_f16_e32 v2, v0, v1 -; GFX8-GISEL-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-GISEL-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_ignore: @@ -237,10 +233,8 @@ ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX8-GISEL-NEXT: v_add_f16_e32 
v2, v0, v1 -; GFX8-GISEL-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-GISEL-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap: @@ -296,9 +290,8 @@ ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_sub_f16_e32 v4, v0, v2 ; GFX9-GISEL-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-GISEL-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX9-GISEL-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v4 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_constained_fsub_v3f16_fpexcept_strict: @@ -314,12 +307,9 @@ ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_sub_f16_e32 v4, v0, v2 -; GFX8-GISEL-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-GISEL-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-GISEL-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; 
; GFX10-SDAG-LABEL: v_constained_fsub_v3f16_fpexcept_strict: @@ -339,7 +329,8 @@ ; GFX10-GISEL-NEXT: v_sub_f16_e32 v4, v0, v2 ; GFX10-GISEL-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-GISEL-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX10-GISEL-NEXT: v_perm_b32 v0, v0, v4, 0x5040100 +; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v2 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-SDAG-LABEL: v_constained_fsub_v3f16_fpexcept_strict: @@ -363,7 +354,8 @@ ; GFX10PLUS-GISEL-NEXT: v_sub_f16_e32 v0, v0, v2 ; GFX10PLUS-GISEL-NEXT: v_sub_f16_e32 v1, v1, v3 ; GFX10PLUS-GISEL-NEXT: v_sub_f16_e32 v2, v4, v5 -; GFX10PLUS-GISEL-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX10PLUS-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10PLUS-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX10PLUS-GISEL-NEXT: s_setpc_b64 s[30:31] %val = call <3 x half> @llvm.experimental.constrained.fsub.v3f16(<3 x half> %x, <3 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x half> %val @@ -390,9 +382,8 @@ ; GFX9-GISEL-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-GISEL-NEXT: v_sub_f16_e32 v2, v1, v3 ; GFX9-GISEL-NEXT: v_sub_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-GISEL-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-GISEL-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v4 +; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v1, 16, v2 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_constained_fsub_v4f16_fpexcept_strict: @@ -410,14 +401,11 @@ ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_sub_f16_e32 v4, v0, v2 -; GFX8-GISEL-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; 
GFX8-GISEL-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_sub_f16_e32 v2, v1, v3 -; GFX8-GISEL-NEXT: v_sub_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, 16 -; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-GISEL-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-GISEL-NEXT: v_sub_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-GISEL-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_constained_fsub_v4f16_fpexcept_strict: @@ -437,11 +425,13 @@ ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-GISEL-NEXT: v_sub_f16_e32 v4, v0, v2 +; GFX10-GISEL-NEXT: v_sub_f16_e32 v5, v1, v3 ; GFX10-GISEL-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-GISEL-NEXT: v_sub_f16_e32 v2, v1, v3 ; GFX10-GISEL-NEXT: v_sub_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-GISEL-NEXT: v_perm_b32 v0, v0, v4, 0x5040100 -; GFX10-GISEL-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 +; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX10-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX10-GISEL-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-SDAG-LABEL: v_constained_fsub_v4f16_fpexcept_strict: @@ -472,8 
+462,10 @@ ; GFX10PLUS-GISEL-NEXT: v_sub_f16_e32 v1, v1, v3 ; GFX10PLUS-GISEL-NEXT: v_sub_f16_e32 v2, v4, v6 ; GFX10PLUS-GISEL-NEXT: v_sub_f16_e32 v3, v5, v7 -; GFX10PLUS-GISEL-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX10PLUS-GISEL-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX10PLUS-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10PLUS-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10PLUS-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX10PLUS-GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX10PLUS-GISEL-NEXT: s_setpc_b64 s[30:31] %val = call <4 x half> @llvm.experimental.constrained.fsub.v4f16(<4 x half> %x, <4 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x half> %val @@ -528,15 +520,14 @@ ; GFX8-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_xor_b32 s0, s3, 0x80008000 -; GFX8-GISEL-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-GISEL-NEXT: s_lshr_b32 s1, s2, 16 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-GISEL-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-GISEL-NEXT: v_add_f16_e32 v1, s1, v1 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-GISEL-NEXT: v_add_f16_e32 v0, s2, v0 -; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-GISEL-NEXT: v_add_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-GISEL-NEXT: ; return to shader part epilog ; ; GFX10-SDAG-LABEL: s_constained_fsub_v2f16_fpexcept_strict: diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -1,10 +1,10 @@ 
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-VI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,SDAG-GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1101 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11 %s ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-VI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX9,GISEL-GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1101 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11 %s ; {{.*}}(SELECT_I4: +// CHECK: GIR_Done, +// CHECK-NEXT: // Label 0: +// CHECK-NEXT: GIM_Reject, +// CHECK-NEXT: }; def SELECT_I4 : I<(outs GPR32:$dst), (ins GPR8:$cond, GPR32:$T, GPR32:$F), []>; def LI : I<(outs GPR32:$dst), (ins i32imm:$src), []>; diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -3696,7 +3696,7 @@ const TreePatternNode *Src, const TreePatternNode *Dst); Expected createAndImportSubInstructionRenderer( action_iterator InsertPt, RuleMatcher &M, const TreePatternNode *Dst, - unsigned TempReg); + const TreePatternNode *Src, unsigned TempReg); Expected createInstructionRenderer(action_iterator InsertPt, RuleMatcher &M, const TreePatternNode *Dst); @@ -3705,14 +3705,12 @@ action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, const TreePatternNode *Src, const TreePatternNode *Dst); - Expected - 
importExplicitUseRenderers(action_iterator InsertPt, RuleMatcher &M, - BuildMIAction &DstMIBuilder, - const llvm::TreePatternNode *Dst); - Expected<action_iterator> - importExplicitUseRenderer(action_iterator InsertPt, RuleMatcher &Rule, - BuildMIAction &DstMIBuilder, - const TreePatternNode *DstChild); + Expected<action_iterator> importExplicitUseRenderers( + action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, + const llvm::TreePatternNode *Dst, const TreePatternNode *Src); + Expected<action_iterator> importExplicitUseRenderer( + action_iterator InsertPt, RuleMatcher &Rule, BuildMIAction &DstMIBuilder, + const TreePatternNode *DstChild, const TreePatternNode *Src); Error importDefaultOperandRenderers(action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, DagInit *DefaultOps) const; @@ -4510,7 +4508,7 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderer( action_iterator InsertPt, RuleMatcher &Rule, BuildMIAction &DstMIBuilder, - const TreePatternNode *DstChild) { + const TreePatternNode *DstChild, const TreePatternNode *Src) { const auto &SubOperand = Rule.getComplexSubOperand(DstChild->getName()); if (SubOperand) { @@ -4580,7 +4578,7 @@ DstMIBuilder.addRenderer<TempRegRenderer>(TempRegID); auto InsertPtOrError = createAndImportSubInstructionRenderer( - ++InsertPt, Rule, DstChild, TempRegID); + ++InsertPt, Rule, DstChild, Src, TempRegID); if (auto Error = InsertPtOrError.takeError()) return std::move(Error); return InsertPtOrError.get(); @@ -4652,6 +4650,16 @@ return failedImport( "Dst pattern child def is an unsupported tablegen class"); } + + // Handle the case where the MVT/register class is omitted in the dest pattern + // but MVT exists in the source pattern. 
+ if (isa<UnsetInit>(DstChild->getLeafValue())) { + for (unsigned NumOp = 0; NumOp < Src->getNumChildren(); NumOp++) + if (Src->getChild(NumOp)->getName() == DstChild->getName()) { + DstMIBuilder.addRenderer<CopyRenderer>(Src->getChild(NumOp)->getName()); + return InsertPt; + } + } return failedImport("Dst pattern child is an unsupported kind"); } @@ -4682,8 +4690,9 @@ .takeError()) return std::move(Error); - if (auto Error = importExplicitUseRenderers(InsertPt, M, DstMIBuilder, Dst) - .takeError()) + if (auto Error = + importExplicitUseRenderers(InsertPt, M, DstMIBuilder, Dst, Src) + .takeError()) return std::move(Error); return DstMIBuilder; @@ -4692,7 +4701,7 @@ Expected<action_iterator> GlobalISelEmitter::createAndImportSubInstructionRenderer( const action_iterator InsertPt, RuleMatcher &M, const TreePatternNode *Dst, - unsigned TempRegID) { + const TreePatternNode *Src, unsigned TempRegID) { auto InsertPtOrError = createInstructionRenderer(InsertPt, M, Dst); // TODO: Assert there's exactly one result. @@ -4706,8 +4715,8 @@ // Assign the result to TempReg. 
DstMIBuilder.addRenderer<TempRegRenderer>(TempRegID, true); - InsertPtOrError = - importExplicitUseRenderers(InsertPtOrError.get(), M, DstMIBuilder, Dst); + InsertPtOrError = importExplicitUseRenderers(InsertPtOrError.get(), M, + DstMIBuilder, Dst, Src); if (auto Error = InsertPtOrError.takeError()) return std::move(Error); @@ -4869,7 +4878,7 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderers( action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, - const llvm::TreePatternNode *Dst) { + const llvm::TreePatternNode *Dst, const llvm::TreePatternNode *Src) { const CodeGenInstruction *DstI = DstMIBuilder.getCGI(); CodeGenInstruction *OrigDstI = &Target.getInstruction(Dst->getOperator()); @@ -4898,7 +4907,7 @@ InsertPt, *ExtractSrcTy, TempRegID); auto InsertPtOrError = createAndImportSubInstructionRenderer( - ++InsertPt, M, ValChild, TempRegID); + ++InsertPt, M, ValChild, Src, TempRegID); if (auto Error = InsertPtOrError.takeError()) return std::move(Error); @@ -4955,7 +4964,7 @@ CodeGenSubRegIndex *SubIdx = CGRegs.getSubRegIdx(SubRegInit->getDef()); auto InsertPtOrError = - importExplicitUseRenderer(InsertPt, M, DstMIBuilder, ValChild); + importExplicitUseRenderer(InsertPt, M, DstMIBuilder, ValChild, Src); if (auto Error = InsertPtOrError.takeError()) return std::move(Error); InsertPt = InsertPtOrError.get(); @@ -5024,7 +5033,7 @@ } auto InsertPtOrError = importExplicitUseRenderer(InsertPt, M, DstMIBuilder, - Dst->getChild(Child)); + Dst->getChild(Child), Src); if (auto Error = InsertPtOrError.takeError()) return std::move(Error); InsertPt = InsertPtOrError.get();