diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -684,6 +684,14 @@
   ///   (G_*ADDO x, 0) -> x + no carry out
   bool matchAddOBy0(MachineInstr &MI, BuildFnTy &MatchInfo);
 
+  /// Match:
+  ///   (G_*ADDE x, y, 0) -> (G_*ADDO x, y)
+  ///   (G_*SUBE x, y, 0) -> (G_*SUBO x, y)
+  /// i.e. an add/sub with carry-in whose carry-in operand is a constant zero
+  /// (scalar or splat). \p MatchInfo rewrites \p MI in place instead of
+  /// building a replacement, so apply it with applyBuildFnNoErase.
+  bool matchAddEToAddO(MachineInstr &MI, BuildFnTy &MatchInfo);
+
   /// Transform (fadd x, fneg(y)) -> (fsub x, y)
   ///           (fadd fneg(x), y) -> (fsub y, x)
   ///           (fsub x, fneg(y)) -> (fadd x, y)
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -812,6 +812,16 @@
    [{ return Helper.matchAddOBy0(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
 
+// Transform (uadde x, y, 0) -> (uaddo x, y)
+//           (sadde x, y, 0) -> (saddo x, y)
+//           (usube x, y, 0) -> (usubo x, y)
+//           (ssube x, y, 0) -> (ssubo x, y)
+def adde_to_addo : GICombineRule<
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_UADDE, G_SADDE, G_USUBE, G_SSUBE):$root,
+   [{ return Helper.matchAddEToAddO(*${root}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFnNoErase(*${root}, ${matchinfo}); }])>;
+
 def mulh_to_lshr : GICombineRule<
   (defs root:$root),
   (match (wip_match_opcode G_UMULH):$root,
@@ -940,7 +950,8 @@
 
 def const_combines : GICombineGroup<[constant_fp_op, const_ptradd_to_i2p,
   overlapping_and, mulo_by_2, mulo_by_0,
-  addo_by_0, combine_minmax_nan]>;
+  addo_by_0, adde_to_addo,
+  combine_minmax_nan]>;
 
 def known_bits_simplifications : GICombineGroup<[
   redundant_and, redundant_sext_inreg, redundant_or, urem_pow2_to_mask,
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
--- 
a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -4794,6 +4794,51 @@
   return true;
 }
 
+bool CombinerHelper::matchAddEToAddO(MachineInstr &MI, BuildFnTy &MatchInfo) {
+  // (G_*ADDE x, y, 0) -> (G_*ADDO x, y)
+  // (G_*SUBE x, y, 0) -> (G_*SUBO x, y)
+  //
+  // Returns true and fills MatchInfo when operand 4 of MI (the trailing 0 in
+  // the patterns above) is a constant zero or a splat of zero.
+  assert(MI.getOpcode() == TargetOpcode::G_UADDE ||
+         MI.getOpcode() == TargetOpcode::G_SADDE ||
+         MI.getOpcode() == TargetOpcode::G_USUBE ||
+         MI.getOpcode() == TargetOpcode::G_SSUBE);
+
+  // Pick the replacement opcode while matching so the apply step can never
+  // see an uninitialized value; an unexpected opcode simply does not match.
+  unsigned NewOpcode;
+  switch (MI.getOpcode()) {
+  case TargetOpcode::G_UADDE:
+    NewOpcode = TargetOpcode::G_UADDO;
+    break;
+  case TargetOpcode::G_SADDE:
+    NewOpcode = TargetOpcode::G_SADDO;
+    break;
+  case TargetOpcode::G_USUBE:
+    NewOpcode = TargetOpcode::G_USUBO;
+    break;
+  case TargetOpcode::G_SSUBE:
+    NewOpcode = TargetOpcode::G_SSUBO;
+    break;
+  default:
+    return false;
+  }
+
+  if (!mi_match(MI.getOperand(4).getReg(), MRI, m_SpecificICstOrSplat(0)))
+    return false;
+
+  // MI is rewritten in place rather than replaced. Only MI is captured by
+  // reference; NewOpcode is copied because the lambda runs after we return.
+  MatchInfo = [=, &MI](MachineIRBuilder &B) {
+    Observer.changingInstr(MI);
+    MI.setDesc(B.getTII().get(NewOpcode));
+    MI.removeOperand(4);
+    Observer.changedInstr(MI);
+  };
+  return true;
+}
+
 MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) {
   assert(MI.getOpcode() == TargetOpcode::G_UDIV);
   auto &UDiv = cast<GenericMachineInstr>(MI);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -4206,13 +4206,11 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] +; GFX6-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3] ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_addc_u32_e64 v1, 
s[6:7], v0, v1, s[6:7] -; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0 +; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4222,13 +4220,11 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] +; GFX8-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3] ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_addc_u32_e64 v1, s[6:7], v0, v1, s[6:7] -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0 +; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4238,13 +4234,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-NEXT: v_addc_co_u32_e64 v1, s[6:7], v0, v1, s[6:7] -; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4256,10 +4250,9 @@ ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: 
v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3] -; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s5, 0x80000000, v6, s5 +; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo @@ -4272,10 +4265,9 @@ ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3] -; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, s1, 0x80000000, v6, s1 +; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4293,16 +4285,14 @@ ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 ; GFX6-NEXT: s_ashr_i32 s2, s5, 31 -; GFX6-NEXT: s_mov_b32 s3, 0 -; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX6-NEXT: s_cmp_lg_u32 s3, 0 -; GFX6-NEXT: s_addc_u32 s1, s2, 0x80000000 +; GFX6-NEXT: s_add_u32 s3, s2, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX6-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-NEXT: v_readfirstlane_b32 s1, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -4316,16 +4306,14 @@ ; 
GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 ; GFX8-NEXT: s_ashr_i32 s2, s5, 31 -; GFX8-NEXT: s_mov_b32 s3, 0 -; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX8-NEXT: s_cmp_lg_u32 s3, 0 -; GFX8-NEXT: s_addc_u32 s1, s2, 0x80000000 +; GFX8-NEXT: s_add_u32 s3, s2, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: ; return to shader part epilog @@ -4339,16 +4327,14 @@ ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 ; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_mov_b32 s3, 0 -; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX9-NEXT: s_cmp_lg_u32 s3, 0 -; GFX9-NEXT: s_addc_u32 s1, s2, 0x80000000 +; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog @@ -4358,15 +4344,13 @@ ; GFX10-NEXT: s_add_u32 s4, s0, s2 ; GFX10-NEXT: s_addc_u32 s5, s1, s3 ; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], s[0:1] -; 
GFX10-NEXT: s_mov_b32 s3, 0 -; GFX10-NEXT: s_ashr_i32 s0, s5, 31 +; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[4:5], s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: s_xor_b32 s2, s2, s1 -; GFX10-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10-NEXT: s_ashr_i32 s0, s5, 31 +; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX10-NEXT: s_xor_b32 s2, s2, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 -; GFX10-NEXT: s_addc_u32 s1, s0, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s2 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 @@ -4377,14 +4361,12 @@ ; GFX11-NEXT: s_add_u32 s4, s0, s2 ; GFX11-NEXT: s_addc_u32 s5, s1, s3 ; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0 -; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], s[0:1] -; GFX11-NEXT: s_mov_b32 s3, 0 -; GFX11-NEXT: s_ashr_i32 s0, s5, 31 +; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[4:5], s[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: s_xor_b32 s2, s2, s1 -; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_ashr_i32 s0, s5, 31 +; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX11-NEXT: s_xor_b32 s2, s2, s6 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 -; GFX11-NEXT: s_addc_u32 s1, s0, 0x80000000 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s1, s2 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 @@ -4399,13 +4381,11 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc -; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] -; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1] +; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] +; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1] ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX6-NEXT: s_mov_b64 s[2:3], 0 -; GFX6-NEXT: v_addc_u32_e64 v1, s[2:3], v0, v1, s[2:3] -; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0 +; 
GFX6-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX6-NEXT: ; return to shader part epilog @@ -4415,13 +4395,11 @@ ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc -; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] -; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1] +; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] +; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1] ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], v0, v1, s[2:3] -; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0 +; GFX8-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: ; return to shader part epilog @@ -4431,13 +4409,11 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc -; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_addc_co_u32_e64 v1, s[2:3], v0, v1, s[2:3] -; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog @@ -4449,8 +4425,7 @@ ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1] -; GFX10-NEXT: s_mov_b32 s1, 0 -; GFX10-NEXT: 
v_add_co_ci_u32_e64 v1, s1, 0x80000000, v4, s1 +; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4463,8 +4438,7 @@ ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1] -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, s1, 0x80000000, v4, s1 +; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: ; return to shader part epilog @@ -4479,13 +4453,11 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] -; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0 +; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX6-NEXT: s_mov_b64 s[0:1], 0 -; GFX6-NEXT: v_addc_u32_e64 v1, s[0:1], v0, v1, s[0:1] -; GFX6-NEXT: s_xor_b64 vcc, s[2:3], vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0 +; GFX6-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX6-NEXT: ; return to shader part epilog @@ -4495,13 +4467,11 @@ ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] -; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0 +; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: 
v_addc_u32_e64 v1, s[0:1], v0, v1, s[0:1] -; GFX8-NEXT: s_xor_b64 vcc, s[2:3], vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0 +; GFX8-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: ; return to shader part epilog @@ -4511,13 +4481,11 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0 +; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_addc_co_u32_e64 v1, s[0:1], v0, v1, s[0:1] -; GFX9-NEXT: s_xor_b64 vcc, s[2:3], vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog @@ -4526,12 +4494,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[0:1], 0 -; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0x80000000, v4, s0 -; GFX10-NEXT: s_xor_b32 vcc_lo, s1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX10-NEXT: ; return to shader part epilog @@ -4540,12 +4507,11 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; 
GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[0:1], 0 -; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, s0, 0x80000000, v4, s0 -; GFX11-NEXT: s_xor_b32 vcc_lo, s1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) @@ -4559,22 +4525,21 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v0, v4 ; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v1, v5, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1] -; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1] +; GFX6-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[4:5] ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GFX6-NEXT: v_bfrev_b32_e32 v10, 1 -; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_addc_u32_e64 v1, s[8:9], v0, v10, s[6:7] -; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1 +; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v2, v6 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v3, v7, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3] -; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3] +; GFX6-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[6:7] ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX6-NEXT: v_addc_u32_e64 v3, s[6:7], v2, v10, s[6:7] -; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v2 +; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX6-NEXT: 
v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4584,22 +4549,21 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v0, v4 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v1, v5, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1] -; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1] +; GFX8-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[4:5] ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GFX8-NEXT: v_bfrev_b32_e32 v10, 1 -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_addc_u32_e64 v1, s[8:9], v0, v10, s[6:7] -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v0, v1 +; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v2, v6 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v3, v7, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3] -; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[6:7] ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX8-NEXT: v_addc_u32_e64 v3, s[6:7], v2, v10, s[6:7] -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x80000000, v2 +; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4609,22 +4573,21 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v1, v5, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[4:5] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GFX9-NEXT: v_bfrev_b32_e32 v10, 1 -; GFX9-NEXT: 
s_mov_b64 s[6:7], 0 -; GFX9-NEXT: v_addc_co_u32_e64 v1, s[8:9], v0, v10, s[6:7] -; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1 +; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v7, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[6:7] ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[6:7], v2, v10, s[6:7] -; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 0x80000000, v2 +; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4637,19 +4600,18 @@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo ; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v2, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo +; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[4:5] ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v11 -; GFX10-NEXT: v_cmp_gt_i64_e64 s7, 0, v[6:7] -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s6, 0x80000000, v0, s5 -; GFX10-NEXT: v_cmp_lt_i64_e64 s6, v[10:11], v[2:3] -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s5, 0x80000000, v4, s5 +; GFX10-NEXT: v_cmp_gt_i64_e64 s6, 0, v[6:7] +; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v12 +; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3] +; GFX10-NEXT: v_add_co_u32 v3, s7, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo +; 
GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo -; GFX10-NEXT: s_xor_b32 vcc_lo, s7, s6 +; GFX10-NEXT: s_xor_b32 vcc_lo, s6, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4662,18 +4624,17 @@ ; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo ; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v2, v6 ; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo +; GFX11-NEXT: v_ashrrev_i32_e32 v12, 31, v9 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] -; GFX11-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[4:5] ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v11 -; GFX11-NEXT: v_cmp_gt_i64_e64 s3, 0, v[6:7] -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, s2, 0x80000000, v0, s1 -; GFX11-NEXT: v_cmp_lt_i64_e64 s2, v[10:11], v[2:3] -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s1, 0x80000000, v4, s1 +; GFX11-NEXT: v_cmp_lt_i64_e64 s1, v[10:11], v[2:3] +; GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0, v[6:7] +; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v12 +; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1 -; GFX11-NEXT: s_xor_b32 vcc_lo, s3, s2 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v12 :: v_dual_cndmask_b32 v1, v9, v1 +; GFX11-NEXT: s_xor_b32 vcc_lo, s2, s1 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v10, v4 :: v_dual_cndmask_b32 v3, v11, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) @@ -4690,35 +4651,31 @@ ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: s_ashr_i32 s4, s9, 31 -; GFX6-NEXT: s_mov_b32 s5, 0 -; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX6-NEXT: s_cmp_lg_u32 s5, 0 +; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000 ; GFX6-NEXT: 
v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: s_addc_u32 s1, s4, 0x80000000 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_mov_b32_e32 v3, s9 +; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc ; GFX6-NEXT: s_add_u32 s0, s2, s6 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: s_addc_u32 s1, s3, s7 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 ; GFX6-NEXT: s_ashr_i32 s4, s1, 31 -; GFX6-NEXT: s_mov_b32 s5, 0 -; GFX6-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX6-NEXT: s_cmp_lg_u32 s5, 0 -; GFX6-NEXT: s_addc_u32 s3, s4, 0x80000000 +; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v4, s0 ; GFX6-NEXT: v_mov_b32_e32 v5, s1 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX6-NEXT: v_readfirstlane_b32 s0, v4 -; GFX6-NEXT: v_readfirstlane_b32 s1, v2 +; GFX6-NEXT: s_xor_b64 vcc, s[2:3], vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX6-NEXT: v_readfirstlane_b32 s0, v2 +; GFX6-NEXT: v_readfirstlane_b32 s1, v3 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0 ; GFX6-NEXT: v_readfirstlane_b32 s3, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -4732,35 +4689,31 @@ ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX8-NEXT: s_ashr_i32 s4, s9, 31 -; GFX8-NEXT: s_mov_b32 s5, 0 -; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX8-NEXT: s_cmp_lg_u32 s5, 0 +; GFX8-NEXT: s_add_u32 s5, s4, 
0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: s_addc_u32 s1, s4, 0x80000000 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc ; GFX8-NEXT: s_add_u32 s0, s2, s6 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: s_addc_u32 s1, s3, s7 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 ; GFX8-NEXT: s_ashr_i32 s4, s1, 31 -; GFX8-NEXT: s_mov_b32 s5, 0 -; GFX8-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX8-NEXT: s_cmp_lg_u32 s5, 0 -; GFX8-NEXT: s_addc_u32 s3, s4, 0x80000000 +; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX8-NEXT: v_readfirstlane_b32 s0, v4 -; GFX8-NEXT: v_readfirstlane_b32 s1, v2 +; GFX8-NEXT: s_xor_b64 vcc, s[2:3], vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8-NEXT: v_readfirstlane_b32 s0, v2 +; GFX8-NEXT: v_readfirstlane_b32 s1, v3 ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_readfirstlane_b32 s3, v1 ; GFX8-NEXT: ; return to shader part epilog @@ -4774,35 +4727,31 @@ ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX9-NEXT: s_ashr_i32 s4, s9, 31 -; GFX9-NEXT: s_mov_b32 s5, 0 -; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX9-NEXT: s_cmp_lg_u32 s5, 0 +; GFX9-NEXT: 
s_add_u32 s5, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: s_addc_u32 s1, s4, 0x80000000 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc ; GFX9-NEXT: s_add_u32 s0, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: s_addc_u32 s1, s3, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 ; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_mov_b32 s5, 0 -; GFX9-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX9-NEXT: s_cmp_lg_u32 s5, 0 -; GFX9-NEXT: s_addc_u32 s3, s4, 0x80000000 +; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX9-NEXT: v_readfirstlane_b32 s0, v4 -; GFX9-NEXT: v_readfirstlane_b32 s1, v2 +; GFX9-NEXT: s_xor_b64 vcc, s[2:3], vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9-NEXT: v_readfirstlane_b32 s0, v2 +; GFX9-NEXT: v_readfirstlane_b32 s1, v3 ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_readfirstlane_b32 s3, v1 ; GFX9-NEXT: ; return to shader part epilog @@ -4812,29 +4761,26 @@ ; GFX10-NEXT: s_add_u32 s8, s0, s4 ; GFX10-NEXT: s_addc_u32 s9, s1, s5 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[4:5], 0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[0:1] -; GFX10-NEXT: s_mov_b32 s10, 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s10, 
s[8:9], s[0:1] ; GFX10-NEXT: s_ashr_i32 s0, s9, 31 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: s_xor_b32 s8, s4, s1 -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8 -; GFX10-NEXT: s_addc_u32 s1, s0, 0x80000000 +; GFX10-NEXT: s_xor_b32 s8, s4, s10 ; GFX10-NEXT: s_add_u32 s4, s2, s6 ; GFX10-NEXT: s_addc_u32 s5, s3, s7 ; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3] ; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[6:7], 0 -; GFX10-NEXT: s_ashr_i32 s0, s5, 31 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8 +; GFX10-NEXT: s_ashr_i32 s0, s5, 31 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8 +; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000 ; GFX10-NEXT: s_xor_b32 s2, s3, s2 -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2 -; GFX10-NEXT: s_addc_u32 s1, s0, 0x80000000 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, s2 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 @@ -4845,27 +4791,24 @@ ; GFX11-NEXT: s_add_u32 s8, s0, s4 ; GFX11-NEXT: s_addc_u32 s9, s1, s5 ; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[4:5], 0 -; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[0:1] -; GFX11-NEXT: s_mov_b32 s10, 0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s10, s[8:9], s[0:1] ; GFX11-NEXT: s_ashr_i32 s0, s9, 31 ; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 -; GFX11-NEXT: s_xor_b32 s8, s4, s1 -; GFX11-NEXT: s_cmp_lg_u32 s10, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8 -; GFX11-NEXT: s_addc_u32 s1, s0, 0x80000000 +; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX11-NEXT: s_xor_b32 s8, s4, s10 ; GFX11-NEXT: s_add_u32 s4, s2, s6 ; GFX11-NEXT: s_addc_u32 s5, s3, s7 ; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX11-NEXT: 
v_cmp_lt_i64_e64 s2, s[4:5], s[2:3] ; GFX11-NEXT: v_cmp_lt_i64_e64 s3, s[6:7], 0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8 ; GFX11-NEXT: s_ashr_i32 s0, s5, 31 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8 +; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000 ; GFX11-NEXT: s_xor_b32 s2, s3, s2 -; GFX11-NEXT: s_cmp_lg_u32 s10, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2 -; GFX11-NEXT: s_addc_u32 s1, s0, 0x80000000 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s1, s2 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 ; GFX11-NEXT: v_readfirstlane_b32 s2, v2 ; GFX11-NEXT: v_readfirstlane_b32 s3, v3 @@ -4895,23 +4838,18 @@ ; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] -; GFX6-NEXT: s_ashr_i32 s0, s9, 31 -; GFX6-NEXT: s_mov_b32 s1, 0 -; GFX6-NEXT: s_cmp_lg_u32 s1, 0 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_addc_u32 s1, s0, 0 -; GFX6-NEXT: s_addc_u32 s2, s0, 0 +; GFX6-NEXT: s_ashr_i32 s0, s9, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_addc_u32 s3, s0, 0x80000000 +; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NEXT: v_mov_b32_e32 v4, s5 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: v_mov_b32_e32 v4, s8 ; GFX6-NEXT: v_mov_b32_e32 v5, s9 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc @@ -4946,25 +4884,21 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] -; GFX8-NEXT: s_ashr_i32 s0, s9, 31 -; GFX8-NEXT: s_mov_b32 s1, 0 
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: s_addc_u32 s1, s0, 0 -; GFX8-NEXT: s_addc_u32 s2, s0, 0 +; GFX8-NEXT: s_ashr_i32 s0, s9, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_addc_u32 s3, s0, 0x80000000 +; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc @@ -4999,25 +4933,21 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] -; GFX9-NEXT: s_ashr_i32 s0, s9, 31 -; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: s_addc_u32 s1, s0, 0 -; GFX9-NEXT: s_addc_u32 s2, s0, 0 +; GFX9-NEXT: s_ashr_i32 s0, s9, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_addc_u32 s3, s0, 0x80000000 +; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 
+; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v4, s8 ; GFX9-NEXT: v_mov_b32_e32 v5, s9 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc @@ -5049,24 +4979,22 @@ ; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10PLUS-NEXT: s_and_b32 s1, 1, s1 ; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 -; GFX10PLUS-NEXT: s_mov_b32 s1, 0 ; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 -; GFX10PLUS-NEXT: s_ashr_i32 s0, s9, 31 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10PLUS-NEXT: v_mov_b32_e32 v2, s5 -; GFX10PLUS-NEXT: s_addc_u32 s1, s0, 0 +; GFX10PLUS-NEXT: s_ashr_i32 s0, s9, 31 +; GFX10PLUS-NEXT: s_add_u32 s1, s0, 0x80000000 ; GFX10PLUS-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10PLUS-NEXT: v_mov_b32_e32 v1, s4 -; GFX10PLUS-NEXT: s_addc_u32 s2, s0, 0 -; GFX10PLUS-NEXT: s_addc_u32 s3, s0, 0x80000000 +; GFX10PLUS-NEXT: s_mov_b32 s3, s0 +; GFX10PLUS-NEXT: s_mov_b32 s2, s0 ; GFX10PLUS-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10PLUS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10PLUS-NEXT: v_mov_b32_e32 v0, s8 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v2, s1, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v3, s1, vcc_lo ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v2 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v0 @@ -5097,18 +5025,15 @@ ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, 0, vcc -; GFX6-NEXT: s_mov_b64 vcc, 0 ; GFX6-NEXT: v_xor_b32_e32 v2, v2, v6 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v3, vcc -; GFX6-NEXT: v_bfrev_b32_e32 v7, 1 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v3, v7, vcc +; GFX6-NEXT: 
v_bfrev_b32_e32 v6, 1 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v3, v6 ; GFX6-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: saddsat_i128_sv: @@ -5131,18 +5056,15 @@ ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, 0, vcc -; GFX8-NEXT: s_mov_b64 vcc, 0 ; GFX8-NEXT: v_xor_b32_e32 v2, v2, v6 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v3, vcc -; GFX8-NEXT: v_bfrev_b32_e32 v7, 1 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v3, v7, vcc +; GFX8-NEXT: v_bfrev_b32_e32 v6, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v3, v6 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: saddsat_i128_sv: @@ -5165,49 +5087,69 @@ ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, 0, vcc -; GFX9-NEXT: s_mov_b64 vcc, 0 ; GFX9-NEXT: v_xor_b32_e32 v2, v2, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v3, vcc -; GFX9-NEXT: v_bfrev_b32_e32 v7, 1 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v3, v7, vcc +; GFX9-NEXT: v_bfrev_b32_e32 v6, 1 +; 
GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v3, v6 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: saddsat_i128_sv: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v3, vcc_lo -; GFX10PLUS-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[4:5] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5] -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] -; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v3, 31, v5 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo -; GFX10PLUS-NEXT: s_mov_b32 vcc_lo, 0 -; GFX10PLUS-NEXT: v_xor_b32_e32 v2, v2, v6 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v3, vcc_lo -; GFX10PLUS-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 s0, 0, v2 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0x80000000, v3, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, v3, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v4, v2, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v5, v7, 
s0 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: saddsat_i128_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v3, vcc_lo +; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[4:5] +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5] +; GFX10-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc_lo +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v2, v2, v6 +; GFX10-NEXT: v_add_co_u32 v6, s0, 0x80000000, v3 +; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: saddsat_i128_sv: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v3, vcc_lo +; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[4:5] +; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5] +; GFX11-NEXT: 
v_cndmask_b32_e32 v6, v7, v6, vcc_lo +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v5 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo +; GFX11-NEXT: v_xor_b32_e32 v2, v2, v6 +; GFX11-NEXT: v_add_co_u32 v6, null, 0x80000000, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_cndmask_b32 v3, v5, v6 +; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs) %cast = bitcast i128 %result to <4 x float> ret <4 x float> %cast @@ -5229,22 +5171,19 @@ ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] -; GFX6-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0 -; GFX6-NEXT: s_mov_b64 vcc, 0 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v7 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc +; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX6-NEXT: ; return to shader part epilog ; @@ -5271,17 +5210,14 @@ ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 ; GFX8-NEXT: 
v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v7 -; GFX8-NEXT: s_mov_b64 vcc, 0 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc -; GFX8-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc +; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX8-NEXT: ; return to shader part epilog ; @@ -5308,52 +5244,75 @@ ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v7 -; GFX9-NEXT: s_mov_b64 vcc, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v1, vcc -; GFX9-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: saddsat_i128_vs: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_add_co_u32 v4, vcc_lo, v0, s0 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo 
-; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo -; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10PLUS-NEXT: s_cmp_eq_u64 s[2:3], 0 -; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 -; GFX10PLUS-NEXT: s_cselect_b32 s0, 1, 0 -; GFX10PLUS-NEXT: s_and_b32 s0, 1, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1 -; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0 -; GFX10PLUS-NEXT: s_mov_b32 vcc_lo, 0 -; GFX10PLUS-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v1, 31, v7 -; GFX10PLUS-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v1, vcc_lo -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v4, v1, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v5, v2, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v6, v3, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v7, v8, s0 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: saddsat_i128_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] +; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10-NEXT: s_and_b32 s0, 1, s0 +; GFX10-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0 +; GFX10-NEXT: v_add_co_u32 v3, s0, 0x80000000, v2 +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: saddsat_i128_vs: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, s0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] +; GFX11-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 +; GFX11-NEXT: s_cselect_b32 s0, 1, 0 +; GFX11-NEXT: s_and_b32 s0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0 +; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: 
v_cndmask_b32_e32 v1, v5, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs) %cast = bitcast i128 %result to <4 x float> ret <4 x float> %cast @@ -5373,48 +5332,43 @@ ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[16:17], v[2:3] +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v17 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v17 -; GFX6-NEXT: s_mov_b64 vcc, 0 -; GFX6-NEXT: v_addc_u32_e64 v2, s[4:5], 0, v1, vcc -; GFX6-NEXT: v_addc_u32_e64 v3, s[4:5], 0, v1, s[4:5] -; GFX6-NEXT: v_addc_u32_e64 v10, s[4:5], v1, v18, s[4:5] +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v18 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v8, v1, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v1, v9, v2, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v2, v16, v3, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v3, v17, v10, s[4:5] -; GFX6-NEXT: v_add_i32_e64 v8, s[4:5], v4, v12 -; GFX6-NEXT: v_addc_u32_e64 v9, s[4:5], v5, v13, s[4:5] -; GFX6-NEXT: v_addc_u32_e64 v10, s[4:5], v6, v14, s[4:5] -; GFX6-NEXT: v_addc_u32_e64 v11, s[4:5], v7, v15, s[4:5] -; GFX6-NEXT: v_cmp_lt_u64_e64 s[4:5], v[8:9], v[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[6:7] -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], v[10:11], v[6:7] -; GFX6-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[4:5] -; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15] -; 
GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[14:15] -; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, 0, s[4:5] +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v4, v12 +; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v5, v13, vcc +; GFX6-NEXT: v_addc_u32_e32 v10, vcc, v6, v14, vcc +; GFX6-NEXT: v_addc_u32_e32 v11, vcc, v7, v15, vcc +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] +; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v11 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15] +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc ; GFX6-NEXT: v_xor_b32_e32 v4, v5, v4 -; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v11 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v5, vcc -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc -; GFX6-NEXT: v_addc_u32_e32 v12, vcc, v5, v18, vcc +; GFX6-NEXT: v_add_i32_e32 v7, vcc, v6, v18 ; GFX6-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v6, v10, v7, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_v2i128: @@ -5430,48 +5384,43 @@ ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; 
GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[16:17], v[2:3] +; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v17 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11] ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v17 -; GFX8-NEXT: s_mov_b64 vcc, 0 -; GFX8-NEXT: v_addc_u32_e64 v2, s[4:5], 0, v1, vcc -; GFX8-NEXT: v_addc_u32_e64 v3, s[4:5], 0, v1, s[4:5] -; GFX8-NEXT: v_addc_u32_e64 v10, s[4:5], v1, v18, s[4:5] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v18 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v8, v1, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v9, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v16, v3, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v17, v10, s[4:5] -; GFX8-NEXT: v_add_u32_e64 v8, s[4:5], v4, v12 -; GFX8-NEXT: v_addc_u32_e64 v9, s[4:5], v5, v13, s[4:5] -; GFX8-NEXT: v_addc_u32_e64 v10, s[4:5], v6, v14, s[4:5] -; GFX8-NEXT: v_addc_u32_e64 v11, s[4:5], v7, v15, s[4:5] -; GFX8-NEXT: v_cmp_lt_u64_e64 s[4:5], v[8:9], v[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], v[10:11], v[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[4:5] -; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, 0, s[4:5] +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v12 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v5, 
v13, vcc +; GFX8-NEXT: v_addc_u32_e32 v10, vcc, v6, v14, vcc +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v7, v15, vcc +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] +; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc ; GFX8-NEXT: v_xor_b32_e32 v4, v5, v4 -; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v11 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v5, vcc -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc -; GFX8-NEXT: v_addc_u32_e32 v12, vcc, v5, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v6, v18 ; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_v2i128: @@ -5487,48 +5436,43 @@ ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[16:17], v[2:3] +; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11] ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v17 -; GFX9-NEXT: s_mov_b64 vcc, 0 -; GFX9-NEXT: v_addc_co_u32_e64 
v2, s[4:5], 0, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, v1, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e64 v10, s[4:5], v1, v18, s[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v18 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v1, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v9, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v16, v3, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v17, v10, s[4:5] -; GFX9-NEXT: v_add_co_u32_e64 v8, s[4:5], v4, v12 -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], v5, v13, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e64 v10, s[4:5], v6, v14, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], v7, v15, s[4:5] -; GFX9-NEXT: v_cmp_lt_u64_e64 s[4:5], v[8:9], v[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GFX9-NEXT: v_cmp_eq_u64_e64 s[4:5], v[10:11], v[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[4:5] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GFX9-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, 0, s[4:5] +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v4, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v5, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v6, v14, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v7, v15, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] +; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v11 +; GFX9-NEXT: 
v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc ; GFX9-NEXT: v_xor_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v11 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v5, v18, vcc +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v6, v18 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_saddsat_v2i128: @@ -5543,49 +5487,44 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[16:17], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[16:17], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11] +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v12, vcc_lo, v4, v12 +; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v5, v13, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, v6, v14, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, v7, v15, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e64 v1, v18, 0, vcc_lo -; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v4, v12 -; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v5, v13, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v6, 
v14, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v7, v15, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e64 s5, v[10:11], v[4:5] +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[12:13], v[4:5] ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v17 -; GFX10-NEXT: v_cmp_eq_u64_e64 s6, v[12:13], v[6:7] -; GFX10-NEXT: s_mov_b32 vcc_lo, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s5 -; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[12:13], v[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[6:7] ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v13 -; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s4, 0, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 -; GFX10-NEXT: v_cmp_gt_i64_e64 s5, 0, v[14:15] -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5 -; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v3, s6 -; GFX10-NEXT: v_cmp_eq_u64_e64 s6, 0, v[14:15] -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s4, 0, v1, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, 0, s6 -; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s4, 0x80000000, v1, s4 -; GFX10-NEXT: v_xor_b32_e32 v3, v3, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v1, s5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v2, s5 -; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0x80000000, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v16, v4, s5 -; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v17, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v10, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v11, v7, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v12, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v13, v9, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[14:15] 
+; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[6:7] +; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v19 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v17 +; GFX10-NEXT: v_add_co_u32 v7, s5, 0x80000000, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo +; GFX10-NEXT: v_add_co_u32 v4, s4, 0x80000000, v3 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v16, v3, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v5, 1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v17, v4, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v12, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v13, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v18, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v19, v7, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_saddsat_v2i128: @@ -5600,49 +5539,43 @@ ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[16:17], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11] -; GFX11-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[16:17], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11] +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v4, v12 +; GFX11-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v5, v13, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, v6, v14, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, v7, v15, vcc_lo ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] -; GFX11-NEXT: v_cndmask_b32_e64 v1, v18, 0, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v4, v12 -; GFX11-NEXT: 
v_add_co_ci_u32_e32 v11, vcc_lo, v5, v13, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v6, v14, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v7, v15, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e64 s1, v[10:11], v[4:5] +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[12:13], v[4:5] ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v17 -; GFX11-NEXT: v_cmp_eq_u64_e64 s2, v[12:13], v[6:7] -; GFX11-NEXT: s_mov_b32 vcc_lo, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s1 -; GFX11-NEXT: v_cmp_lt_i64_e64 s1, v[12:13], v[6:7] +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[6:7] +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[14:15] +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[6:7] +; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v17 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v13 -; GFX11-NEXT: v_add_co_ci_u32_e64 v2, s0, 0, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s1 -; GFX11-NEXT: v_cmp_gt_i64_e64 s1, 0, v[14:15] -; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v6, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s1 -; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v3, s2 -; GFX11-NEXT: v_cmp_eq_u64_e64 s2, 0, v[14:15] -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, 0, v1, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, 0, s2 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s0, 0x80000000, v1, s0 -; GFX11-NEXT: v_xor_b32_e32 v3, v3, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v8, v1, s1 -; GFX11-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v6, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v1, v9, v2, s1 -; GFX11-NEXT: v_and_b32_e32 v3, 1, v3 -; 
GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0x80000000, v6, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v16, v4, s1 -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v17, v5, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v10, v6, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v11, v7, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v13, v9, s0 +; GFX11-NEXT: v_add_co_u32 v7, null, 0x80000000, v6 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_add_co_u32 v4, null, 0x80000000, v3 +; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v3, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v3 :: v_dual_and_b32 v5, 1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v3, v17, v4, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v5 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v12, v6, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v13, v6, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v18, v6, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v19, v7, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) ret <2 x i128> %result @@ -5669,23 +5602,19 @@ ; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] -; GFX6-NEXT: s_ashr_i32 s0, s17, 31 -; GFX6-NEXT: s_mov_b32 s1, 0 -; GFX6-NEXT: s_cmp_lg_u32 s1, 0 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_addc_u32 s1, s0, 0 -; GFX6-NEXT: s_addc_u32 s2, s0, 0 +; GFX6-NEXT: s_ashr_i32 s0, s17, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_addc_u32 s3, s0, 0x80000000 +; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: v_mov_b32_e32 v3, s8 ; GFX6-NEXT: v_mov_b32_e32 v4, s9 ; GFX6-NEXT: v_cmp_ne_u32_e32 
vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s16 ; GFX6-NEXT: v_mov_b32_e32 v3, s17 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc @@ -5708,23 +5637,18 @@ ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[14:15], 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5] -; GFX6-NEXT: s_ashr_i32 s4, s3, 31 -; GFX6-NEXT: s_mov_b32 s5, 0 -; GFX6-NEXT: s_cmp_lg_u32 s5, 0 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_addc_u32 s5, s4, 0 -; GFX6-NEXT: s_addc_u32 s6, s4, 0 +; GFX6-NEXT: s_ashr_i32 s4, s3, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_addc_u32 s7, s4, 0x80000000 +; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s0 ; GFX6-NEXT: v_mov_b32_e32 v8, s1 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: v_mov_b32_e32 v8, s2 ; GFX6-NEXT: v_mov_b32_e32 v9, s3 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc @@ -5763,25 +5687,21 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] -; GFX8-NEXT: s_ashr_i32 s0, s17, 31 -; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: s_addc_u32 s1, s0, 0 -; GFX8-NEXT: s_addc_u32 s2, s0, 0 +; GFX8-NEXT: s_ashr_i32 s0, s17, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 
1, v0 -; GFX8-NEXT: s_addc_u32 s3, s0, 0x80000000 +; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s8 ; GFX8-NEXT: v_mov_b32_e32 v4, s9 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s16 ; GFX8-NEXT: v_mov_b32_e32 v3, s17 ; GFX8-NEXT: s_add_u32 s0, s4, s12 @@ -5808,25 +5728,21 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GFX8-NEXT: s_and_b32 s4, 1, s6 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5] -; GFX8-NEXT: s_ashr_i32 s4, s3, 31 -; GFX8-NEXT: s_mov_b32 s5, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: s_cmp_lg_u32 s5, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5] ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: s_addc_u32 s5, s4, 0 -; GFX8-NEXT: s_addc_u32 s6, s4, 0 +; GFX8-NEXT: s_ashr_i32 s4, s3, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_addc_u32 s7, s4, 0x80000000 +; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: v_mov_b32_e32 v8, s1 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: v_mov_b32_e32 v8, s2 ; GFX8-NEXT: v_mov_b32_e32 v9, s3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc @@ -5865,25 +5781,21 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 ; GFX9-NEXT: 
v_cmp_ne_u32_e64 s[0:1], 0, s0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] -; GFX9-NEXT: s_ashr_i32 s0, s17, 31 -; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: s_addc_u32 s1, s0, 0 -; GFX9-NEXT: s_addc_u32 s2, s0, 0 +; GFX9-NEXT: s_ashr_i32 s0, s17, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_addc_u32 s3, s0, 0x80000000 +; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-NEXT: v_mov_b32_e32 v4, s9 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: s_add_u32 s0, s4, s12 @@ -5910,25 +5822,21 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GFX9-NEXT: s_and_b32 s4, 1, s6 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5] -; GFX9-NEXT: s_ashr_i32 s4, s3, 31 -; GFX9-NEXT: s_mov_b32 s5, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: s_cmp_lg_u32 s5, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5] ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: s_addc_u32 s5, s4, 0 -; GFX9-NEXT: s_addc_u32 s6, s4, 0 +; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_addc_u32 s7, s4, 0x80000000 +; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_mov_b32_e32 v8, s1 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: 
v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: v_mov_b32_e32 v8, s2 ; GFX9-NEXT: v_mov_b32_e32 v9, s3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc @@ -5951,7 +5859,6 @@ ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1] ; GFX10-NEXT: s_addc_u32 s17, s3, s11 ; GFX10-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] -; GFX10-NEXT: v_mov_b32_e32 v5, s17 ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3] @@ -5963,69 +5870,66 @@ ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: s_and_b32 s1, 1, s1 -; GFX10-NEXT: s_ashr_i32 s2, s17, 31 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 -; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: s_cmp_lg_u32 s0, 0 -; GFX10-NEXT: s_addc_u32 s1, s2, 0 -; GFX10-NEXT: s_addc_u32 s10, s2, 0 -; GFX10-NEXT: s_addc_u32 s3, s2, 0x80000000 -; GFX10-NEXT: s_add_u32 s12, s4, s12 -; GFX10-NEXT: s_addc_u32 s13, s5, s13 -; GFX10-NEXT: s_addc_u32 s18, s6, s14 -; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[12:13], s[4:5] -; GFX10-NEXT: s_addc_u32 s19, s7, s15 -; GFX10-NEXT: v_cmp_lt_i64_e64 s5, s[14:15], 0 -; GFX10-NEXT: s_cmp_eq_u64 s[18:19], s[6:7] +; GFX10-NEXT: s_ashr_i32 s0, s17, 31 +; GFX10-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX10-NEXT: s_add_u32 s10, s4, s12 +; GFX10-NEXT: s_addc_u32 s11, s5, s13 +; GFX10-NEXT: s_addc_u32 s12, s6, s14 +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[10:11], s[4:5] +; GFX10-NEXT: s_addc_u32 s13, s7, s15 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX10-NEXT: s_cselect_b32 s0, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[18:19], s[6:7] -; GFX10-NEXT: s_and_b32 s0, 1, s0 
-; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 -; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: s_cmp_eq_u64 s[12:13], s[6:7] ; GFX10-NEXT: v_mov_b32_e32 v1, s8 -; GFX10-NEXT: s_and_b32 s4, 1, s4 +; GFX10-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[12:13], s[6:7] +; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[14:15], 0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_mov_b32 s2, s0 +; GFX10-NEXT: v_mov_b32_e32 v6, s11 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 +; GFX10-NEXT: s_and_b32 s4, 1, s8 +; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6 +; GFX10-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX10-NEXT: s_and_b32 s5, 1, s5 +; GFX10-NEXT: v_mov_b32_e32 v7, s13 +; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, v2, s0 -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, s9 -; GFX10-NEXT: v_mov_b32_e32 v6, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v7, s19 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, 0, s0 -; GFX10-NEXT: v_mov_b32_e32 v4, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s1, vcc_lo -; GFX10-NEXT: s_mov_b32 s1, 0 -; GFX10-NEXT: s_ashr_i32 s0, s19, 31 -; GFX10-NEXT: v_xor_b32_e32 v2, v3, v2 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s3, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v5, s12 -; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX10-NEXT: s_addc_u32 s1, s0, 0 -; GFX10-NEXT: s_addc_u32 s2, s0, 0 -; GFX10-NEXT: s_addc_u32 s3, s0, 0x80000000 -; GFX10-NEXT: 
v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: v_mov_b32_e32 v0, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, 0, s4 +; GFX10-NEXT: v_mov_b32_e32 v5, s17 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v3, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, s10 +; GFX10-NEXT: s_ashr_i32 s0, s13, 31 +; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_mov_b32 s2, s0 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, s12 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s1, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s0, v1 -; GFX10-NEXT: v_readfirstlane_b32 s1, v0 -; GFX10-NEXT: v_readfirstlane_b32 s2, v3 +; GFX10-NEXT: v_readfirstlane_b32 s1, v2 +; GFX10-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10-NEXT: v_readfirstlane_b32 s3, v4 ; GFX10-NEXT: v_readfirstlane_b32 s4, v5 ; GFX10-NEXT: v_readfirstlane_b32 s5, v6 -; GFX10-NEXT: v_readfirstlane_b32 s6, v2 +; GFX10-NEXT: v_readfirstlane_b32 s6, v3 ; GFX10-NEXT: v_readfirstlane_b32 s7, v7 ; GFX10-NEXT: ; return to shader part epilog ; @@ -6037,7 +5941,6 @@ ; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1] ; GFX11-NEXT: s_addc_u32 s17, s3, s11 ; GFX11-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] -; GFX11-NEXT: v_mov_b32_e32 v5, s17 ; GFX11-NEXT: s_cselect_b32 s18, 1, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3] @@ -6049,68 +5952,65 @@ ; GFX11-NEXT: s_cselect_b32 s1, 1, 0 ; 
GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX11-NEXT: s_and_b32 s1, 1, s1 -; GFX11-NEXT: s_ashr_i32 s2, s17, 31 ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 -; GFX11-NEXT: s_addc_u32 s1, s2, 0 -; GFX11-NEXT: s_addc_u32 s10, s2, 0 -; GFX11-NEXT: s_addc_u32 s3, s2, 0x80000000 -; GFX11-NEXT: s_add_u32 s12, s4, s12 -; GFX11-NEXT: s_addc_u32 s13, s5, s13 -; GFX11-NEXT: s_addc_u32 s18, s6, s14 -; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[12:13], s[4:5] -; GFX11-NEXT: s_addc_u32 s19, s7, s15 -; GFX11-NEXT: v_cmp_lt_i64_e64 s5, s[14:15], 0 -; GFX11-NEXT: s_cmp_eq_u64 s[18:19], s[6:7] +; GFX11-NEXT: s_ashr_i32 s0, s17, 31 +; GFX11-NEXT: v_mov_b32_e32 v2, s9 +; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX11-NEXT: s_add_u32 s10, s4, s12 +; GFX11-NEXT: s_addc_u32 s11, s5, s13 +; GFX11-NEXT: s_addc_u32 s12, s6, s14 +; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[10:11], s[4:5] +; GFX11-NEXT: s_addc_u32 s13, s7, s15 ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX11-NEXT: s_cmp_eq_u64 s[12:13], s[6:7] ; GFX11-NEXT: v_mov_b32_e32 v1, s8 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 -; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[18:19], s[6:7] -; GFX11-NEXT: s_cselect_b32 s0, 1, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 -; GFX11-NEXT: s_and_b32 s0, 1, s0 -; GFX11-NEXT: s_cmp_eq_u64 s[14:15], 0 -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; GFX11-NEXT: s_cselect_b32 s8, 1, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 -; GFX11-NEXT: s_cselect_b32 s4, 1, 0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[12:13], s[6:7] +; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[14:15], 0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: v_dual_mov_b32 v7, s13 :: v_dual_mov_b32 v6, s11 +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 +; GFX11-NEXT: s_and_b32 s4, 1, s8 +; GFX11-NEXT: s_cmp_eq_u64 s[14:15], 0 +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6 
+; GFX11-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX11-NEXT: s_and_b32 s5, 1, s5 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: s_and_b32 s4, 1, s4 -; GFX11-NEXT: v_mov_b32_e32 v6, s13 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v3, v2, s0 -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s4 +; GFX11-NEXT: v_cmp_ne_u32_e64 s4, 0, s5 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, s9 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v4, 0, s0 -; GFX11-NEXT: v_mov_b32_e32 v4, s16 -; GFX11-NEXT: s_ashr_i32 s0, s19, 31 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo -; GFX11-NEXT: v_xor_b32_e32 v2, v3, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v4, s10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v5, s3, vcc_lo -; GFX11-NEXT: v_mov_b32_e32 v5, s12 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s1, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, s18 -; GFX11-NEXT: s_addc_u32 s1, s0, 0 -; GFX11-NEXT: s_addc_u32 s2, s0, 0 -; GFX11-NEXT: s_addc_u32 s3, s0, 0x80000000 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v5, 0, s4 +; GFX11-NEXT: v_mov_b32_e32 v5, s17 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo +; GFX11-NEXT: v_xor_b32_e32 v3, v4, v3 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v5, s1, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v5, s10 +; GFX11-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX11-NEXT: s_ashr_i32 s0, s13, 31 +; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, s12 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, 
s1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s1, vcc_lo ; GFX11-NEXT: v_readfirstlane_b32 s0, v1 -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: v_readfirstlane_b32 s2, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-NEXT: v_readfirstlane_b32 s3, v4 ; GFX11-NEXT: v_readfirstlane_b32 s4, v5 ; GFX11-NEXT: v_readfirstlane_b32 s5, v6 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-NEXT: v_readfirstlane_b32 s6, v3 ; GFX11-NEXT: v_readfirstlane_b32 s7, v7 ; GFX11-NEXT: ; return to shader part epilog %result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -4192,13 +4192,11 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 ; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3] ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_addc_u32_e64 v1, s[6:7], v0, v1, s[6:7] -; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0 +; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4208,13 +4206,11 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2 ; 
GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3] ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_addc_u32_e64 v1, s[6:7], v0, v1, s[6:7] -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0 +; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4224,13 +4220,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-NEXT: v_addc_co_u32_e64 v1, s[6:7], v0, v1, s[6:7] -; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4242,10 +4236,9 @@ ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[2:3] -; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s5, 0x80000000, v6, s5 +; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, 
v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo @@ -4258,10 +4251,9 @@ ; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[2:3] -; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, s1, 0x80000000, v6, s1 +; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4279,16 +4271,14 @@ ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 ; GFX6-NEXT: s_ashr_i32 s2, s5, 31 -; GFX6-NEXT: s_mov_b32 s3, 0 -; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX6-NEXT: s_cmp_lg_u32 s3, 0 -; GFX6-NEXT: s_addc_u32 s1, s2, 0x80000000 +; GFX6-NEXT: s_add_u32 s3, s2, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX6-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-NEXT: v_readfirstlane_b32 s1, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -4302,16 +4292,14 @@ ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 ; GFX8-NEXT: s_ashr_i32 s2, s5, 31 -; GFX8-NEXT: s_mov_b32 s3, 0 -; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX8-NEXT: s_cmp_lg_u32 s3, 0 -; GFX8-NEXT: s_addc_u32 s1, s2, 0x80000000 +; GFX8-NEXT: s_add_u32 s3, s2, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 
+; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: ; return to shader part epilog @@ -4325,16 +4313,14 @@ ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 ; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_mov_b32 s3, 0 -; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX9-NEXT: s_cmp_lg_u32 s3, 0 -; GFX9-NEXT: s_addc_u32 s1, s2, 0x80000000 +; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog @@ -4344,15 +4330,13 @@ ; GFX10-NEXT: s_sub_u32 s4, s0, s2 ; GFX10-NEXT: s_subb_u32 s5, s1, s3 ; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[2:3], 0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], s[0:1] -; GFX10-NEXT: s_mov_b32 s3, 0 -; GFX10-NEXT: s_ashr_i32 s0, s5, 31 +; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[4:5], s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: s_xor_b32 s2, s2, s1 -; GFX10-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10-NEXT: s_ashr_i32 s0, s5, 31 +; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX10-NEXT: s_xor_b32 s2, s2, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 -; 
GFX10-NEXT: s_addc_u32 s1, s0, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s2 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 @@ -4363,14 +4347,12 @@ ; GFX11-NEXT: s_sub_u32 s4, s0, s2 ; GFX11-NEXT: s_subb_u32 s5, s1, s3 ; GFX11-NEXT: v_cmp_gt_i64_e64 s2, s[2:3], 0 -; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], s[0:1] -; GFX11-NEXT: s_mov_b32 s3, 0 -; GFX11-NEXT: s_ashr_i32 s0, s5, 31 +; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[4:5], s[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: s_xor_b32 s2, s2, s1 -; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_ashr_i32 s0, s5, 31 +; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX11-NEXT: s_xor_b32 s2, s2, s6 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 -; GFX11-NEXT: s_addc_u32 s1, s0, 0x80000000 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s1, s2 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 @@ -4385,13 +4367,11 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s0, v0 ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc -; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] -; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], 0, v[0:1] +; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1] ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX6-NEXT: s_mov_b64 s[2:3], 0 -; GFX6-NEXT: v_addc_u32_e64 v1, s[2:3], v0, v1, s[2:3] -; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0 +; GFX6-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX6-NEXT: ; return to shader part epilog @@ -4401,13 +4381,11 @@ ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc -; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] -; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], 0, v[0:1] +; 
GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1] ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], v0, v1, s[2:3] -; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0 +; GFX8-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: ; return to shader part epilog @@ -4417,13 +4395,11 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v1, vcc -; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], 0, v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_addc_co_u32_e64 v1, s[2:3], v0, v1, s[2:3] -; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog @@ -4435,8 +4411,7 @@ ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] -; GFX10-NEXT: s_mov_b32 s1, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s1, 0x80000000, v4, s1 +; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4449,8 +4424,7 @@ ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] -; GFX11-NEXT: s_mov_b32 s1, 0 
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, s1, 0x80000000, v4, s1 +; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: ; return to shader part epilog @@ -4465,13 +4439,11 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v0 ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] -; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], s[0:1], 0 +; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] +; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX6-NEXT: s_mov_b64 s[0:1], 0 -; GFX6-NEXT: v_addc_u32_e64 v1, s[0:1], v0, v1, s[0:1] -; GFX6-NEXT: s_xor_b64 vcc, s[2:3], vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0 +; GFX6-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX6-NEXT: ; return to shader part epilog @@ -4481,13 +4453,11 @@ ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v0 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] -; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], s[0:1], 0 +; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] +; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_addc_u32_e64 v1, s[0:1], v0, v1, s[0:1] -; GFX8-NEXT: s_xor_b64 vcc, s[2:3], vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0 +; GFX8-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: ; return to shader part epilog @@ -4497,13 +4467,11 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: 
v_subb_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[0:1], 0 +; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_addc_co_u32_e64 v1, s[0:1], v0, v1, s[0:1] -; GFX9-NEXT: s_xor_b64 vcc, s[2:3], vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog @@ -4512,12 +4480,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[0:1], 0 -; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0x80000000, v4, s0 -; GFX10-NEXT: s_xor_b32 vcc_lo, s1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX10-NEXT: ; return to shader part epilog @@ -4526,12 +4493,11 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 ; GFX11-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[0:1], 0 -; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, s0, 0x80000000, v4, s0 -; GFX11-NEXT: s_xor_b32 vcc_lo, s1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: 
v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) @@ -4545,22 +4511,21 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v0, v4 ; GFX6-NEXT: v_subb_u32_e32 v9, vcc, v1, v5, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1] -; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[4:5] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[4:5] ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GFX6-NEXT: v_bfrev_b32_e32 v10, 1 -; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_addc_u32_e64 v1, s[8:9], v0, v10, s[6:7] -; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1 +; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v2, v6 ; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v3, v7, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3] -; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[6:7] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[6:7] ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX6-NEXT: v_addc_u32_e64 v3, s[6:7], v2, v10, s[6:7] -; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v2 +; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4570,22 +4535,21 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v0, v4 ; GFX8-NEXT: v_subb_u32_e32 v9, vcc, v1, v5, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1] -; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[4:5] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[4:5] ; 
GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GFX8-NEXT: v_bfrev_b32_e32 v10, 1 -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_addc_u32_e64 v1, s[8:9], v0, v10, s[6:7] -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v0, v1 +; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v2, v6 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v3, v7, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3] -; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[6:7] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[6:7] ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX8-NEXT: v_addc_u32_e64 v3, s[6:7], v2, v10, s[6:7] -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x80000000, v2 +; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4595,22 +4559,21 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v4 ; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v1, v5, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[4:5] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[4:5] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GFX9-NEXT: v_bfrev_b32_e32 v10, 1 -; GFX9-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-NEXT: v_addc_co_u32_e64 v1, s[8:9], v0, v10, s[6:7] -; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1 +; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v2, v6 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v7, vcc -; 
GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[6:7] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[6:7] ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[6:7], v2, v10, s[6:7] -; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 0x80000000, v2 +; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4623,19 +4586,18 @@ ; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, v2, v6 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo +; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[4:5] ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v11 -; GFX10-NEXT: v_cmp_lt_i64_e64 s7, 0, v[6:7] -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s6, 0x80000000, v0, s5 -; GFX10-NEXT: v_cmp_lt_i64_e64 s6, v[10:11], v[2:3] -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s5, 0x80000000, v4, s5 +; GFX10-NEXT: v_cmp_lt_i64_e64 s6, 0, v[6:7] +; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v12 +; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3] +; GFX10-NEXT: v_add_co_u32 v3, s7, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo -; GFX10-NEXT: s_xor_b32 vcc_lo, s7, s6 +; GFX10-NEXT: s_xor_b32 vcc_lo, s6, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4648,18 +4610,17 @@ ; GFX11-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo ; GFX11-NEXT: v_sub_co_u32 v10, vcc_lo, v2, 
v6 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo +; GFX11-NEXT: v_ashrrev_i32_e32 v12, 31, v9 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] -; GFX11-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[4:5] ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v11 -; GFX11-NEXT: v_cmp_lt_i64_e64 s3, 0, v[6:7] -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, s2, 0x80000000, v0, s1 -; GFX11-NEXT: v_cmp_lt_i64_e64 s2, v[10:11], v[2:3] -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s1, 0x80000000, v4, s1 +; GFX11-NEXT: v_cmp_lt_i64_e64 s1, v[10:11], v[2:3] +; GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0, v[6:7] +; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v12 +; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1 -; GFX11-NEXT: s_xor_b32 vcc_lo, s3, s2 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v12 :: v_dual_cndmask_b32 v1, v9, v1 +; GFX11-NEXT: s_xor_b32 vcc_lo, s2, s1 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v10, v4 :: v_dual_cndmask_b32 v3, v11, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) @@ -4676,35 +4637,31 @@ ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: s_ashr_i32 s4, s9, 31 -; GFX6-NEXT: s_mov_b32 s5, 0 -; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX6-NEXT: s_cmp_lg_u32 s5, 0 +; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: s_addc_u32 s1, s4, 0x80000000 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_mov_b32_e32 v3, s9 +; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc ; GFX6-NEXT: s_sub_u32 s0, s2, s6 ; GFX6-NEXT: v_mov_b32_e32 
v0, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: s_subb_u32 s1, s3, s7 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 ; GFX6-NEXT: s_ashr_i32 s4, s1, 31 -; GFX6-NEXT: s_mov_b32 s5, 0 -; GFX6-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX6-NEXT: s_cmp_lg_u32 s5, 0 -; GFX6-NEXT: s_addc_u32 s3, s4, 0x80000000 +; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v4, s0 ; GFX6-NEXT: v_mov_b32_e32 v5, s1 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX6-NEXT: v_readfirstlane_b32 s0, v4 -; GFX6-NEXT: v_readfirstlane_b32 s1, v2 +; GFX6-NEXT: s_xor_b64 vcc, s[2:3], vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX6-NEXT: v_readfirstlane_b32 s0, v2 +; GFX6-NEXT: v_readfirstlane_b32 s1, v3 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0 ; GFX6-NEXT: v_readfirstlane_b32 s3, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -4718,35 +4675,31 @@ ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX8-NEXT: s_ashr_i32 s4, s9, 31 -; GFX8-NEXT: s_mov_b32 s5, 0 -; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX8-NEXT: s_cmp_lg_u32 s5, 0 +; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: s_addc_u32 s1, s4, 0x80000000 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc ; GFX8-NEXT: s_sub_u32 s0, s2, s6 ; 
GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: s_subb_u32 s1, s3, s7 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 ; GFX8-NEXT: s_ashr_i32 s4, s1, 31 -; GFX8-NEXT: s_mov_b32 s5, 0 -; GFX8-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX8-NEXT: s_cmp_lg_u32 s5, 0 -; GFX8-NEXT: s_addc_u32 s3, s4, 0x80000000 +; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX8-NEXT: v_readfirstlane_b32 s0, v4 -; GFX8-NEXT: v_readfirstlane_b32 s1, v2 +; GFX8-NEXT: s_xor_b64 vcc, s[2:3], vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8-NEXT: v_readfirstlane_b32 s0, v2 +; GFX8-NEXT: v_readfirstlane_b32 s1, v3 ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_readfirstlane_b32 s3, v1 ; GFX8-NEXT: ; return to shader part epilog @@ -4760,35 +4713,31 @@ ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX9-NEXT: s_ashr_i32 s4, s9, 31 -; GFX9-NEXT: s_mov_b32 s5, 0 -; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX9-NEXT: s_cmp_lg_u32 s5, 0 +; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: s_addc_u32 s1, s4, 0x80000000 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc ; GFX9-NEXT: 
s_sub_u32 s0, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: s_subb_u32 s1, s3, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 ; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_mov_b32 s5, 0 -; GFX9-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX9-NEXT: s_cmp_lg_u32 s5, 0 -; GFX9-NEXT: s_addc_u32 s3, s4, 0x80000000 +; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX9-NEXT: v_readfirstlane_b32 s0, v4 -; GFX9-NEXT: v_readfirstlane_b32 s1, v2 +; GFX9-NEXT: s_xor_b64 vcc, s[2:3], vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9-NEXT: v_readfirstlane_b32 s0, v2 +; GFX9-NEXT: v_readfirstlane_b32 s1, v3 ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_readfirstlane_b32 s3, v1 ; GFX9-NEXT: ; return to shader part epilog @@ -4798,29 +4747,26 @@ ; GFX10-NEXT: s_sub_u32 s8, s0, s4 ; GFX10-NEXT: s_subb_u32 s9, s1, s5 ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, s[4:5], 0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[0:1] -; GFX10-NEXT: s_mov_b32 s10, 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s10, s[8:9], s[0:1] ; GFX10-NEXT: s_ashr_i32 s0, s9, 31 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: s_xor_b32 s8, s4, s1 -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8 -; GFX10-NEXT: s_addc_u32 s1, s0, 0x80000000 +; GFX10-NEXT: s_xor_b32 s8, s4, s10 ; GFX10-NEXT: s_sub_u32 s4, s2, s6 ; GFX10-NEXT: s_subb_u32 s5, s3, s7 ; 
GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3] ; GFX10-NEXT: v_cmp_gt_i64_e64 s3, s[6:7], 0 -; GFX10-NEXT: s_ashr_i32 s0, s5, 31 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8 +; GFX10-NEXT: s_ashr_i32 s0, s5, 31 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8 +; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000 ; GFX10-NEXT: s_xor_b32 s2, s3, s2 -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2 -; GFX10-NEXT: s_addc_u32 s1, s0, 0x80000000 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, s2 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 @@ -4831,27 +4777,24 @@ ; GFX11-NEXT: s_sub_u32 s8, s0, s4 ; GFX11-NEXT: s_subb_u32 s9, s1, s5 ; GFX11-NEXT: v_cmp_gt_i64_e64 s4, s[4:5], 0 -; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[0:1] -; GFX11-NEXT: s_mov_b32 s10, 0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s10, s[8:9], s[0:1] ; GFX11-NEXT: s_ashr_i32 s0, s9, 31 ; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 -; GFX11-NEXT: s_xor_b32 s8, s4, s1 -; GFX11-NEXT: s_cmp_lg_u32 s10, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8 -; GFX11-NEXT: s_addc_u32 s1, s0, 0x80000000 +; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX11-NEXT: s_xor_b32 s8, s4, s10 ; GFX11-NEXT: s_sub_u32 s4, s2, s6 ; GFX11-NEXT: s_subb_u32 s5, s3, s7 ; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3] ; GFX11-NEXT: v_cmp_gt_i64_e64 s3, s[6:7], 0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8 ; GFX11-NEXT: s_ashr_i32 s0, s5, 31 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8 +; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000 ; GFX11-NEXT: s_xor_b32 s2, s3, s2 -; GFX11-NEXT: s_cmp_lg_u32 s10, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2 -; GFX11-NEXT: s_addc_u32 s1, s0, 0x80000000 -; GFX11-NEXT: 
v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s1, s2 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 ; GFX11-NEXT: v_readfirstlane_b32 s2, v2 ; GFX11-NEXT: v_readfirstlane_b32 s3, v3 @@ -4882,24 +4825,19 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[6:7], 0 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX6-NEXT: s_ashr_i32 s0, s11, 31 -; GFX6-NEXT: s_mov_b32 s1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX6-NEXT: s_cmp_lg_u32 s1, 0 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_addc_u32 s1, s0, 0 -; GFX6-NEXT: s_addc_u32 s2, s0, 0 +; GFX6-NEXT: s_ashr_i32 s0, s11, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_addc_u32 s3, s0, 0x80000000 +; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: v_mov_b32_e32 v3, s8 ; GFX6-NEXT: v_mov_b32_e32 v4, s9 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: v_mov_b32_e32 v4, s10 ; GFX6-NEXT: v_mov_b32_e32 v5, s11 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc @@ -4937,24 +4875,20 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_ashr_i32 s0, s11, 31 -; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: s_addc_u32 s1, s0, 0 -; GFX8-NEXT: s_addc_u32 s2, s0, 0 +; GFX8-NEXT: s_ashr_i32 s0, s11, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_addc_u32 s3, s0, 0x80000000 +; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: 
v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s8 ; GFX8-NEXT: v_mov_b32_e32 v4, s9 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc @@ -4992,24 +4926,20 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_ashr_i32 s0, s11, 31 -; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: s_addc_u32 s1, s0, 0 -; GFX9-NEXT: s_addc_u32 s2, s0, 0 +; GFX9-NEXT: s_ashr_i32 s0, s11, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_addc_u32 s3, s0, 0x80000000 +; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-NEXT: v_mov_b32_e32 v4, s9 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v4, s10 ; GFX9-NEXT: v_mov_b32_e32 v5, s11 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc @@ -5041,26 +4971,24 @@ ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 ; GFX10-NEXT: s_ashr_i32 s0, s11, 31 ; GFX10-NEXT: s_and_b32 s1, 1, s1 +; GFX10-NEXT: s_mov_b32 s3, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: s_mov_b32 s1, 0 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10-NEXT: 
s_add_u32 s1, s0, 0x80000000 +; GFX10-NEXT: s_mov_b32 s2, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, s9 ; GFX10-NEXT: v_mov_b32_e32 v3, s11 -; GFX10-NEXT: s_addc_u32 s1, s0, 0 -; GFX10-NEXT: s_addc_u32 s2, s0, 0 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, s8 -; GFX10-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s10 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-NEXT: v_readfirstlane_b32 s1, v2 ; GFX10-NEXT: v_readfirstlane_b32 s2, v0 @@ -5088,25 +5016,22 @@ ; GFX11-NEXT: s_cselect_b32 s1, 1, 0 ; GFX11-NEXT: s_ashr_i32 s0, s11, 31 ; GFX11-NEXT: s_and_b32 s1, 1, s1 +; GFX11-NEXT: s_mov_b32 s3, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v2 :: v_dual_mov_b32 v2, s9 ; GFX11-NEXT: v_mov_b32_e32 v3, s11 -; GFX11-NEXT: s_addc_u32 s1, s0, 0 -; GFX11-NEXT: s_addc_u32 s2, s0, 0 ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, s8 -; GFX11-NEXT: s_addc_u32 s3, s0, 0x80000000 -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_and_b32 v0, 1, v0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s10 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, 
s3, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s1, vcc_lo ; GFX11-NEXT: v_readfirstlane_b32 s0, v1 ; GFX11-NEXT: v_readfirstlane_b32 s1, v2 ; GFX11-NEXT: v_readfirstlane_b32 s2, v0 @@ -5137,19 +5062,16 @@ ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX6-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v7 -; GFX6-NEXT: s_mov_b64 vcc, 0 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v8 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc +; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX6-NEXT: ; return to shader part epilog ; @@ -5173,19 +5095,16 @@ ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3] ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX8-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v7 -; GFX8-NEXT: s_mov_b64 vcc, 0 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc ; GFX8-NEXT: v_xor_b32_e32 v0, v0, v8 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc +; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX8-NEXT: ; return to shader part epilog ; @@ -5209,53 +5128,75 @@ ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX9-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v7 -; GFX9-NEXT: s_mov_b64 vcc, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v1, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: ssubsat_i128_sv: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_sub_co_u32 v4, vcc_lo, s0, v0 -; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo -; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo -; GFX10PLUS-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[0:1] -; 
GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[6:7] -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v1, 31, v7 -; GFX10PLUS-NEXT: s_mov_b32 vcc_lo, 0 -; GFX10PLUS-NEXT: v_xor_b32_e32 v0, v0, v8 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v1, vcc_lo -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; GFX10PLUS-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v4, v1, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v5, v2, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v6, v3, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v7, v8, s0 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: ssubsat_i128_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, s0, v0 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo +; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5] +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[6:7] +; GFX10-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 
v1, v0, vcc_lo +; GFX10-NEXT: v_add_co_u32 v3, s0, 0x80000000, v2 +; GFX10-NEXT: v_xor_b32_e32 v0, v0, v8 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ssubsat_i128_sv: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, s0, v0 +; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo +; GFX11-NEXT: v_sub_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo +; GFX11-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo +; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5] +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7] +; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[6:7] +; GFX11-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v2 +; GFX11-NEXT: v_xor_b32_e32 v0, v0, v8 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs) %cast = bitcast i128 %result to <4 x float> ret <4 x float> %cast @@ -5277,7 +5218,6 @@ ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], 
v[2:3] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] -; GFX6-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 @@ -5285,16 +5225,14 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v7 -; GFX6-NEXT: s_mov_b64 vcc, 0 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX6-NEXT: ; return to shader part epilog ; @@ -5318,22 +5256,19 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 -; GFX8-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s4 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v7 -; GFX8-NEXT: s_mov_b64 vcc, 0 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc +; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX8-NEXT: ; return to shader part epilog ; @@ -5357,59 +5292,84 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 -; GFX9-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s4 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v7 -; GFX9-NEXT: s_mov_b64 vcc, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: ssubsat_i128_vs: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_sub_co_u32 v4, vcc_lo, v0, s0 -; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo -; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo -; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10PLUS-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], 0 -; GFX10PLUS-NEXT: s_cmp_eq_u64 
s[2:3], 0 -; GFX10PLUS-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 -; GFX10PLUS-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 -; GFX10PLUS-NEXT: s_and_b32 s0, 1, s4 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo -; GFX10PLUS-NEXT: s_mov_b32 vcc_lo, 0 -; GFX10PLUS-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v1, 31, v7 -; GFX10PLUS-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v1, vcc_lo -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v4, v1, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v5, v2, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v6, v3, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v7, v8, s0 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: ssubsat_i128_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, s0 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] +; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], 0 +; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 +; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], 0 +; GFX10-NEXT: 
v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 +; GFX10-NEXT: s_and_b32 s0, 1, s4 +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: v_add_co_u32 v3, s0, 0x80000000, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ssubsat_i128_vs: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, s0 +; GFX11-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo +; GFX11-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo +; GFX11-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] +; GFX11-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], 0 +; GFX11-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11-NEXT: s_cselect_b32 s4, 1, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 +; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], 0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 +; GFX11-NEXT: s_and_b32 s0, 1, s4 +; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo +; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: 
v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs) %cast = bitcast i128 %result to <4 x float> ret <4 x float> %cast @@ -5437,44 +5397,39 @@ ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v19 -; GFX6-NEXT: s_mov_b64 vcc, 0 -; GFX6-NEXT: v_addc_u32_e64 v2, s[4:5], 0, v1, vcc -; GFX6-NEXT: v_addc_u32_e64 v3, s[4:5], 0, v1, s[4:5] -; GFX6-NEXT: v_addc_u32_e64 v8, s[4:5], v1, v20, s[4:5] +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v19 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v20 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v16, v1, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v1, v17, v2, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v2, v18, v3, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v3, v19, v8, s[4:5] -; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v4, v12 -; GFX6-NEXT: v_subb_u32_e64 v9, s[4:5], v5, v13, s[4:5] -; GFX6-NEXT: v_subb_u32_e64 v10, s[4:5], v6, v14, s[4:5] -; GFX6-NEXT: v_subb_u32_e64 v11, s[4:5], v7, v15, s[4:5] -; GFX6-NEXT: v_cmp_lt_u64_e64 s[4:5], v[8:9], v[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[6:7] -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], v[10:11], v[6:7] -; GFX6-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[4:5] -; GFX6-NEXT: v_cmp_lt_u64_e64 s[4:5], 0, v[12:13] -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[14:15] -; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[14:15] -; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[4:5] 
+; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v4, v12 +; GFX6-NEXT: v_subb_u32_e32 v9, vcc, v5, v13, vcc +; GFX6-NEXT: v_subb_u32_e32 v10, vcc, v6, v14, vcc +; GFX6-NEXT: v_subb_u32_e32 v11, vcc, v7, v15, vcc +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] +; GFX6-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[12:13] +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[14:15] +; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; GFX6-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX6-NEXT: v_xor_b32_e32 v4, v5, v4 -; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v11 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v5, vcc -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc -; GFX6-NEXT: v_addc_u32_e32 v12, vcc, v5, v20, vcc +; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v11 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, v6, v20 ; GFX6-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v6, v10, v7, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v2i128: @@ -5498,44 +5453,39 @@ ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: 
v_ashrrev_i32_e32 v1, 31, v19 -; GFX8-NEXT: s_mov_b64 vcc, 0 -; GFX8-NEXT: v_addc_u32_e64 v2, s[4:5], 0, v1, vcc -; GFX8-NEXT: v_addc_u32_e64 v3, s[4:5], 0, v1, s[4:5] -; GFX8-NEXT: v_addc_u32_e64 v8, s[4:5], v1, v20, s[4:5] +; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v19 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v20 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v16, v1, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v17, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v18, v3, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v19, v8, s[4:5] -; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v4, v12 -; GFX8-NEXT: v_subb_u32_e64 v9, s[4:5], v5, v13, s[4:5] -; GFX8-NEXT: v_subb_u32_e64 v10, s[4:5], v6, v14, s[4:5] -; GFX8-NEXT: v_subb_u32_e64 v11, s[4:5], v7, v15, s[4:5] -; GFX8-NEXT: v_cmp_lt_u64_e64 s[4:5], v[8:9], v[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], v[10:11], v[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[4:5] -; GFX8-NEXT: v_cmp_lt_u64_e64 s[4:5], 0, v[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[4:5] +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v4, v12 +; GFX8-NEXT: v_subb_u32_e32 v9, vcc, v5, v13, vcc +; GFX8-NEXT: v_subb_u32_e32 v10, vcc, v6, v14, vcc +; GFX8-NEXT: v_subb_u32_e32 v11, vcc, v7, v15, vcc +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: 
v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[12:13] +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX8-NEXT: v_xor_b32_e32 v4, v5, v4 -; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v11 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v5, vcc -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc -; GFX8-NEXT: v_addc_u32_e32 v12, vcc, v5, v20, vcc +; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v11 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v6, v20 ; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v2i128: @@ -5559,44 +5509,39 @@ ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v19 -; GFX9-NEXT: s_mov_b64 vcc, 0 -; GFX9-NEXT: v_addc_co_u32_e64 v2, s[4:5], 0, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, v1, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e64 v8, s[4:5], v1, v20, s[4:5] +; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v19 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v20 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v16, v1, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v17, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v18, 
v3, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v19, v8, s[4:5] -; GFX9-NEXT: v_sub_co_u32_e64 v8, s[4:5], v4, v12 -; GFX9-NEXT: v_subb_co_u32_e64 v9, s[4:5], v5, v13, s[4:5] -; GFX9-NEXT: v_subb_co_u32_e64 v10, s[4:5], v6, v14, s[4:5] -; GFX9-NEXT: v_subb_co_u32_e64 v11, s[4:5], v7, v15, s[4:5] -; GFX9-NEXT: v_cmp_lt_u64_e64 s[4:5], v[8:9], v[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GFX9-NEXT: v_cmp_eq_u64_e64 s[4:5], v[10:11], v[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[4:5] -; GFX9-NEXT: v_cmp_lt_u64_e64 s[4:5], 0, v[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GFX9-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[4:5] +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v4, v12 +; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v5, v13, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v6, v14, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, v7, v15, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX9-NEXT: v_xor_b32_e32 v4, v5, v4 -; GFX9-NEXT: 
v_ashrrev_i32_e32 v5, 31, v11 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v5, v20, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v11 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v6, v20 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ssubsat_v2i128: @@ -5611,53 +5556,48 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[8:9] -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[8:9] +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[10:11] +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v4, v12 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v5, v13, vcc_lo -; GFX10-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, v6, v14, vcc_lo -; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v7, v15, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e64 s5, v[8:9], v[4:5] +; GFX10-NEXT: v_sub_co_ci_u32_e32 v20, vcc_lo, v6, v14, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v21, vcc_lo, v7, v15, vcc_lo +; GFX10-NEXT: 
v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[4:5] ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v19 -; GFX10-NEXT: v_cmp_eq_u64_e64 s6, v[10:11], v[6:7] -; GFX10-NEXT: s_mov_b32 vcc_lo, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s5 -; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[20:21], v[6:7] ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v11 -; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s4, 0, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 -; GFX10-NEXT: v_cmp_lt_u64_e64 s5, 0, v[12:13] -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5 -; GFX10-NEXT: v_cmp_lt_i64_e64 s5, 0, v[14:15] -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 -; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v3, s6 -; GFX10-NEXT: v_cmp_eq_u64_e64 s6, 0, v[14:15] -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s4, 0, v1, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v12, v5, s6 -; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v6, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s4, 0x80000000, v1, s4 -; GFX10-NEXT: v_xor_b32_e32 v3, v3, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0x80000000, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v16, v1, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v17, v2, s5 -; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v4, s5 -; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v5, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[12:13] +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[14:15] +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u64_e32 
vcc_lo, v[20:21], v[6:7] +; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v21 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v19 +; GFX10-NEXT: v_add_co_u32 v7, s5, 0x80000000, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo +; GFX10-NEXT: v_add_co_u32 v4, s4, 0x80000000, v3 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v3, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v5, 1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v4, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v7, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v12, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v11, v13, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v20, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v21, v7, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_ssubsat_v2i128: @@ -5672,53 +5612,47 @@ ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[8:9] -; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[10:11] -; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] -; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[8:9] +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[10:11] +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: v_sub_co_u32 v8, vcc_lo, v4, 
v12 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v5, v13, vcc_lo -; GFX11-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, v6, v14, vcc_lo -; GFX11-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v7, v15, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e64 s1, v[8:9], v[4:5] +; GFX11-NEXT: v_sub_co_ci_u32_e32 v20, vcc_lo, v6, v14, vcc_lo +; GFX11-NEXT: v_sub_co_ci_u32_e32 v21, vcc_lo, v7, v15, vcc_lo +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] +; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[4:5] ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v19 -; GFX11-NEXT: v_cmp_eq_u64_e64 s2, v[10:11], v[6:7] -; GFX11-NEXT: s_mov_b32 vcc_lo, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s1 -; GFX11-NEXT: v_cmp_lt_i64_e64 s1, v[10:11], v[6:7] +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[20:21], v[6:7] +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[12:13] +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[14:15] +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[20:21], v[6:7] +; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v21 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v19 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v11 -; GFX11-NEXT: v_add_co_ci_u32_e64 v2, s0, 0, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s1 -; GFX11-NEXT: v_cmp_lt_u64_e64 s1, 0, v[12:13] -; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v6, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s1 -; GFX11-NEXT: v_cmp_lt_i64_e64 s1, 0, v[14:15] -; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, 1, s1 -; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v3, s2 -; GFX11-NEXT: v_cmp_eq_u64_e64 s2, 0, v[14:15] -; 
GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, 0, v1, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v12, v5, s2 -; GFX11-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v6, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s0, 0x80000000, v1, s0 -; GFX11-NEXT: v_xor_b32_e32 v3, v3, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0x80000000, v6, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, v16, v1, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v17, v2, s1 -; GFX11-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v18, v4, s1 -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v19, v5, s1 +; GFX11-NEXT: v_add_co_u32 v7, null, 0x80000000, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_add_co_u32 v4, null, 0x80000000, v3 +; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v16, v3, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v3 :: v_dual_and_b32 v5, 1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v4, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v5 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v6, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v7, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v10, v12, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v11, v13, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v6, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v20, v6, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v21, v7, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) ret <2 x i128> %result @@ -5746,24 +5680,20 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[10:11], 0 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX6-NEXT: s_ashr_i32 s0, s19, 31 -; GFX6-NEXT: s_mov_b32 s1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX6-NEXT: s_cmp_lg_u32 s1, 0 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_addc_u32 s1, s0, 0 -; 
GFX6-NEXT: s_addc_u32 s2, s0, 0 +; GFX6-NEXT: s_ashr_i32 s0, s19, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_addc_u32 s3, s0, 0x80000000 +; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: v_mov_b32_e32 v3, s16 ; GFX6-NEXT: v_mov_b32_e32 v4, s17 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s18 ; GFX6-NEXT: v_mov_b32_e32 v3, s19 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc @@ -5787,24 +5717,19 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[14:15], 0 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] -; GFX6-NEXT: s_ashr_i32 s4, s3, 31 -; GFX6-NEXT: s_mov_b32 s5, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX6-NEXT: s_cmp_lg_u32 s5, 0 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_addc_u32 s5, s4, 0 -; GFX6-NEXT: s_addc_u32 s6, s4, 0 +; GFX6-NEXT: s_ashr_i32 s4, s3, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_addc_u32 s7, s4, 0x80000000 +; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s0 ; GFX6-NEXT: v_mov_b32_e32 v8, s1 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: v_mov_b32_e32 v8, s2 ; GFX6-NEXT: v_mov_b32_e32 v9, s3 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc @@ -5846,24 +5771,20 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 ; GFX8-NEXT: 
v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_ashr_i32 s0, s19, 31 -; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: s_addc_u32 s1, s0, 0 -; GFX8-NEXT: s_addc_u32 s2, s0, 0 +; GFX8-NEXT: s_ashr_i32 s0, s19, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_addc_u32 s3, s0, 0x80000000 +; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s16 ; GFX8-NEXT: v_mov_b32_e32 v4, s17 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: v_mov_b32_e32 v3, s19 ; GFX8-NEXT: s_sub_u32 s0, s4, s12 @@ -5893,24 +5814,20 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX8-NEXT: s_and_b32 s4, 1, s6 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX8-NEXT: s_ashr_i32 s4, s3, 31 -; GFX8-NEXT: s_mov_b32 s5, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: s_cmp_lg_u32 s5, 0 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: s_addc_u32 s5, s4, 0 -; GFX8-NEXT: s_addc_u32 s6, s4, 0 +; GFX8-NEXT: s_ashr_i32 s4, s3, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_addc_u32 s7, s4, 0x80000000 +; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: v_mov_b32_e32 v8, s1 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 
v3, s5 ; GFX8-NEXT: v_mov_b32_e32 v8, s2 ; GFX8-NEXT: v_mov_b32_e32 v9, s3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc @@ -5952,24 +5869,20 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_ashr_i32 s0, s19, 31 -; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: s_addc_u32 s1, s0, 0 -; GFX9-NEXT: s_addc_u32 s2, s0, 0 +; GFX9-NEXT: s_ashr_i32 s0, s19, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_addc_u32 s3, s0, 0x80000000 +; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s16 ; GFX9-NEXT: v_mov_b32_e32 v4, s17 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: s_sub_u32 s0, s4, s12 @@ -5999,24 +5912,20 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX9-NEXT: s_and_b32 s4, 1, s6 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX9-NEXT: s_ashr_i32 s4, s3, 31 -; GFX9-NEXT: s_mov_b32 s5, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: s_cmp_lg_u32 s5, 0 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: s_addc_u32 s5, s4, 0 -; GFX9-NEXT: s_addc_u32 s6, s4, 0 +; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_addc_u32 s7, s4, 0x80000000 +; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_mov_b32_e32 v8, s1 ; GFX9-NEXT: v_cmp_ne_u32_e32 
vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: v_mov_b32_e32 v8, s2 ; GFX9-NEXT: v_mov_b32_e32 v9, s3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc @@ -6046,35 +5955,31 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s0, 1, s20 ; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 ; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: s_mov_b32 s20, 0 ; GFX10-NEXT: s_and_b32 s1, 1, s1 ; GFX10-NEXT: s_ashr_i32 s0, s19, 31 -; GFX10-NEXT: s_cmp_lg_u32 s20, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: s_addc_u32 s1, s0, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 -; GFX10-NEXT: s_addc_u32 s2, s0, 0 -; GFX10-NEXT: s_addc_u32 s3, s0, 0x80000000 +; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000 ; GFX10-NEXT: s_sub_u32 s8, s4, s12 ; GFX10-NEXT: s_subb_u32 s9, s5, s13 -; GFX10-NEXT: s_subb_u32 s10, s6, s14 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 ; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[8:9], s[4:5] +; GFX10-NEXT: s_subb_u32 s10, s6, s14 ; GFX10-NEXT: s_subb_u32 s11, s7, s15 +; GFX10-NEXT: s_mov_b32 s3, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo ; GFX10-NEXT: s_cmp_eq_u64 s[10:11], s[6:7] -; GFX10-NEXT: v_mov_b32_e32 v2, s17 -; GFX10-NEXT: v_mov_b32_e32 v7, s11 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[10:11], s[6:7] ; GFX10-NEXT: v_cmp_gt_u64_e64 s6, s[12:13], 0 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, s16 ; GFX10-NEXT: s_cselect_b32 s16, 1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s17 ; 
GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 ; GFX10-NEXT: s_and_b32 s4, 1, s16 ; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0 @@ -6084,9 +5989,11 @@ ; GFX10-NEXT: s_cselect_b32 s5, 1, 0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_and_b32 s5, 1, s5 +; GFX10-NEXT: s_mov_b32 s2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 +; GFX10-NEXT: v_mov_b32_e32 v7, s11 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s18 @@ -6094,22 +6001,21 @@ ; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; GFX10-NEXT: v_xor_b32_e32 v3, v4, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo -; GFX10-NEXT: s_ashr_i32 s0, s11, 31 -; GFX10-NEXT: s_cmp_lg_u32 s20, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s1, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s1, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX10-NEXT: v_mov_b32_e32 v5, s8 -; GFX10-NEXT: s_addc_u32 s1, s0, 0 +; GFX10-NEXT: s_ashr_i32 s0, s11, 31 +; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, s10 -; GFX10-NEXT: s_addc_u32 s2, s0, 0 -; GFX10-NEXT: s_addc_u32 s3, s0, 0x80000000 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_mov_b32 s2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s3, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s1, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-NEXT: v_readfirstlane_b32 s1, v2 ; GFX10-NEXT: v_readfirstlane_b32 s2, v0 @@ -6137,32 +6043,31 @@ ; GFX11-NEXT: 
s_cmp_eq_u64 s[10:11], 0 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 ; GFX11-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0 -; GFX11-NEXT: s_cselect_b32 s1, 1, 0 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX11-NEXT: s_mov_b32 s20, 0 -; GFX11-NEXT: s_and_b32 s1, 1, s1 +; GFX11-NEXT: s_cselect_b32 s1, 1, 0 ; GFX11-NEXT: s_ashr_i32 s0, s19, 31 -; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_and_b32 s1, 1, s1 +; GFX11-NEXT: s_mov_b32 s3, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX11-NEXT: s_addc_u32 s1, s0, 0 -; GFX11-NEXT: s_addc_u32 s2, s0, 0 -; GFX11-NEXT: s_addc_u32 s3, s0, 0x80000000 +; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000 ; GFX11-NEXT: s_sub_u32 s8, s4, s12 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo ; GFX11-NEXT: s_subb_u32 s9, s5, s13 ; GFX11-NEXT: s_subb_u32 s10, s6, s14 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo ; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[8:9], s[4:5] ; GFX11-NEXT: s_subb_u32 s11, s7, s15 -; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_cmp_eq_u64 s[10:11], s[6:7] +; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_cselect_b32 s16, 1, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 ; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[10:11], s[6:7] ; GFX11-NEXT: v_cmp_gt_u64_e64 s6, s[12:13], 0 -; GFX11-NEXT: v_dual_mov_b32 v7, s11 :: v_dual_and_b32 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_cselect_b32 s16, 1, 0 +; GFX11-NEXT: v_mov_b32_e32 v7, s11 ; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 ; GFX11-NEXT: s_and_b32 s4, 1, s16 ; GFX11-NEXT: s_cmp_eq_u64 s[14:15], 0 @@ -6181,22 +6086,21 @@ ; GFX11-NEXT: v_xor_b32_e32 v3, v4, v3 ; GFX11-NEXT: v_mov_b32_e32 v0, s18 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo -; GFX11-NEXT: s_ashr_i32 s0, s11, 31 -; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo 
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v5, s1, vcc_lo ; GFX11-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s1, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v5, s3, vcc_lo ; GFX11-NEXT: v_mov_b32_e32 v5, s8 +; GFX11-NEXT: s_ashr_i32 s0, s11, 31 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, s10 -; GFX11-NEXT: s_addc_u32 s1, s0, 0 -; GFX11-NEXT: s_addc_u32 s2, s0, 0 -; GFX11-NEXT: s_addc_u32 s3, s0, 0x80000000 +; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s3, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s2, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s1, vcc_lo ; GFX11-NEXT: v_readfirstlane_b32 s0, v1 ; GFX11-NEXT: v_readfirstlane_b32 s1, v2 ; GFX11-NEXT: v_readfirstlane_b32 s2, v0