diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -4184,10 +4184,12 @@ const LLT HalfTy = LLT::scalar(NewBitSize); const LLT CondTy = LLT::scalar(1); - if (const MachineInstr *KShiftAmt = - getOpcodeDef(TargetOpcode::G_CONSTANT, Amt, MRI)) { + if (auto VRegAndVal = + getConstantVRegValWithLookThrough(Amt, MRI, true, false)) { + auto *DefMI = MRI.getVRegDef(VRegAndVal->VReg); return narrowScalarShiftByConstant( - MI, KShiftAmt->getOperand(1).getCImm()->getValue(), HalfTy, ShiftAmtTy); + MI, DefMI->getOperand(1).getCImm()->getValue(), HalfTy, + MRI.getType(VRegAndVal->VReg)); } // TODO: Expand with known bits. diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -5338,59 +5338,38 @@ ; GFX6-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1] -; GFX6-NEXT: s_movk_i32 s2, 0x7f -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0 -; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[6:7], 0 -; GFX6-NEXT: s_sub_i32 s6, s2, 64 -; GFX6-NEXT: s_sub_i32 s4, 64, s2 -; GFX6-NEXT: s_cmp_lt_u32 s2, 64 -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s2, 0 -; GFX6-NEXT: s_cselect_b32 s13, 1, 0 +; GFX6-NEXT: s_ashr_i32 s3, s11, 31 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX6-NEXT: s_ashr_i64 s[0:1], s[10:11], s2 -; GFX6-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 -; GFX6-NEXT: s_lshl_b64 s[4:5], s[10:11], s4 -; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX6-NEXT: s_ashr_i32 s4, s11, 31 -; GFX6-NEXT: s_ashr_i64 s[6:7], 
s[10:11], s6 -; GFX6-NEXT: s_and_b32 s12, s12, 1 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX6-NEXT: s_and_b32 s6, s13, 1 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] -; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX6-NEXT: s_add_u32 s2, s2, 0 -; GFX6-NEXT: s_cselect_b32 s4, 1, 0 -; GFX6-NEXT: s_and_b32 s4, s4, 1 -; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_addc_u32 s3, s3, 0 -; GFX6-NEXT: s_cselect_b32 s4, 1, 0 -; GFX6-NEXT: s_and_b32 s4, s4, 1 -; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_addc_u32 s0, s0, 0 +; GFX6-NEXT: s_add_u32 s0, s3, 0 +; GFX6-NEXT: s_cselect_b32 s1, 1, 0 +; GFX6-NEXT: s_and_b32 s1, s1, 1 +; GFX6-NEXT: s_cmp_lg_u32 s1, 0 +; GFX6-NEXT: s_addc_u32 s1, s3, 0 +; GFX6-NEXT: s_cselect_b32 s2, 1, 0 +; GFX6-NEXT: s_and_b32 s2, s2, 1 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[6:7], 0 +; GFX6-NEXT: s_addc_u32 s2, s3, 0 ; GFX6-NEXT: s_cselect_b32 s4, 1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_and_b32 s4, s4, 1 ; GFX6-NEXT: s_cmp_lg_u32 s4, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_addc_u32 s1, s1, 0x80000000 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s3 +; GFX6-NEXT: s_addc_u32 s3, s3, 0x80000000 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: v_mov_b32_e32 v3, s8 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_mov_b32_e32 v4, s9 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: v_mov_b32_e32 v4, s10 +; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: v_mov_b32_e32 v5, s11 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ 
-5435,55 +5414,34 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 -; GFX8-NEXT: s_movk_i32 s2, 0x7f -; GFX8-NEXT: s_sub_i32 s6, s2, 64 -; GFX8-NEXT: s_sub_i32 s4, 64, s2 -; GFX8-NEXT: s_cmp_lt_u32 s2, 64 -; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s2, 0 -; GFX8-NEXT: s_cselect_b32 s13, 1, 0 +; GFX8-NEXT: s_ashr_i32 s3, s11, 31 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_ashr_i64 s[0:1], s[10:11], s2 -; GFX8-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 -; GFX8-NEXT: s_lshl_b64 s[4:5], s[10:11], s4 -; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX8-NEXT: s_ashr_i32 s4, s11, 31 -; GFX8-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX8-NEXT: s_and_b32 s12, s12, 1 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_and_b32 s6, s13, 1 -; GFX8-NEXT: s_cmp_lg_u32 s6, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] -; GFX8-NEXT: s_mov_b32 s5, s4 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_add_u32 s2, s2, 0 -; GFX8-NEXT: s_cselect_b32 s4, 1, 0 -; GFX8-NEXT: s_and_b32 s4, s4, 1 -; GFX8-NEXT: s_cmp_lg_u32 s4, 0 -; GFX8-NEXT: s_addc_u32 s3, s3, 0 -; GFX8-NEXT: s_cselect_b32 s4, 1, 0 -; GFX8-NEXT: s_and_b32 s4, s4, 1 -; GFX8-NEXT: s_cmp_lg_u32 s4, 0 -; GFX8-NEXT: s_addc_u32 s0, s0, 0 +; GFX8-NEXT: s_add_u32 s0, s3, 0 +; GFX8-NEXT: s_cselect_b32 s1, 1, 0 +; GFX8-NEXT: s_and_b32 s1, s1, 1 +; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_and_b32 s2, s2, 1 +; GFX8-NEXT: s_cmp_lg_u32 s2, 0 +; GFX8-NEXT: s_addc_u32 s2, s3, 0 ; GFX8-NEXT: s_cselect_b32 s4, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_and_b32 s4, s4, 1 ; GFX8-NEXT: s_cmp_lg_u32 s4, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_addc_u32 s1, s1, 0x80000000 -; GFX8-NEXT: 
v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: s_addc_u32 s3, s3, 0x80000000 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s8 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s9 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -5528,55 +5486,34 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 -; GFX9-NEXT: s_movk_i32 s2, 0x7f -; GFX9-NEXT: s_sub_i32 s6, s2, 64 -; GFX9-NEXT: s_sub_i32 s4, 64, s2 -; GFX9-NEXT: s_cmp_lt_u32 s2, 64 -; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s2, 0 -; GFX9-NEXT: s_cselect_b32 s13, 1, 0 +; GFX9-NEXT: s_ashr_i32 s3, s11, 31 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_ashr_i64 s[0:1], s[10:11], s2 -; GFX9-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[10:11], s4 -; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX9-NEXT: s_ashr_i32 s4, s11, 31 -; GFX9-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX9-NEXT: s_and_b32 s12, s12, 1 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_and_b32 s6, s13, 1 -; GFX9-NEXT: s_cmp_lg_u32 s6, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] -; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_add_u32 s2, s2, 0 -; GFX9-NEXT: s_cselect_b32 s4, 1, 0 -; GFX9-NEXT: s_and_b32 s4, s4, 1 -; GFX9-NEXT: s_cmp_lg_u32 s4, 0 -; GFX9-NEXT: s_addc_u32 s3, s3, 0 -; GFX9-NEXT: s_cselect_b32 s4, 1, 0 -; GFX9-NEXT: 
s_and_b32 s4, s4, 1 -; GFX9-NEXT: s_cmp_lg_u32 s4, 0 -; GFX9-NEXT: s_addc_u32 s0, s0, 0 +; GFX9-NEXT: s_add_u32 s0, s3, 0 +; GFX9-NEXT: s_cselect_b32 s1, 1, 0 +; GFX9-NEXT: s_and_b32 s1, s1, 1 +; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: s_addc_u32 s1, s3, 0 +; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_and_b32 s2, s2, 1 +; GFX9-NEXT: s_cmp_lg_u32 s2, 0 +; GFX9-NEXT: s_addc_u32 s2, s3, 0 ; GFX9-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_and_b32 s4, s4, 1 ; GFX9-NEXT: s_cmp_lg_u32 s4, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: s_addc_u32 s3, s3, 0x80000000 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, s9 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_mov_b32_e32 v5, s11 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -5590,9 +5527,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_add_u32 s8, s0, s4 ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: s_movk_i32 s12, 0x7f -; GFX10-NEXT: s_and_b32 s9, s9, 1 ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_and_b32 s9, s9, 1 ; GFX10-NEXT: s_cmp_lg_u32 s9, 0 ; GFX10-NEXT: s_addc_u32 s9, s1, s5 ; GFX10-NEXT: s_cselect_b32 s10, 1, 0 @@ -5616,57 +5552,37 @@ ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_sub_i32 s13, s12, 64 -; GFX10-NEXT: s_and_b32 s14, 1, s1 -; GFX10-NEXT: s_sub_i32 s2, 64, s12 +; GFX10-NEXT: 
s_ashr_i32 s3, s11, 31 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[6:7], 0 -; GFX10-NEXT: s_cmp_lt_u32 s12, 64 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s14 -; GFX10-NEXT: s_cselect_b32 s15, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s12, 0 -; GFX10-NEXT: s_cselect_b32 s16, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s12 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[10:11], s2 -; GFX10-NEXT: s_ashr_i64 s[4:5], s[10:11], s12 -; GFX10-NEXT: s_and_b32 s12, s15, 1 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: s_ashr_i32 s2, s11, 31 -; GFX10-NEXT: s_ashr_i64 s[6:7], s[10:11], s13 -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10-NEXT: s_mov_b32 s3, s2 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] -; GFX10-NEXT: s_and_b32 s6, s16, 1 +; GFX10-NEXT: s_and_b32 s0, 1, s1 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: s_add_u32 s0, s3, 0 +; GFX10-NEXT: s_cselect_b32 s1, 1, 0 +; GFX10-NEXT: s_and_b32 s1, s1, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s6, 0 +; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, s9 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 +; GFX10-NEXT: s_addc_u32 s1, s3, 0 +; GFX10-NEXT: s_cselect_b32 s2, 1, 0 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] -; GFX10-NEXT: s_add_u32 s0, s0, 0 -; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: s_and_b32 s2, s2, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, s8 -; GFX10-NEXT: s_and_b32 s4, s4, 1 +; GFX10-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_addc_u32 s2, s3, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_b32 s4, s4, 1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: 
s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: s_addc_u32 s2, s2, 0 -; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: s_addc_u32 s3, s3, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo -; GFX10-NEXT: s_and_b32 s4, s4, 1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo -; GFX10-NEXT: s_addc_u32 s3, s3, 0x80000000 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s3, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 @@ -5686,55 +5602,30 @@ ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v6, v2, vcc ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc ; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] -; GFX6-NEXT: s_movk_i32 s0, 0x7f ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[6:7] -; GFX6-NEXT: s_sub_i32 s1, s0, 64 ; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[6:7] -; GFX6-NEXT: s_sub_i32 s2, 64, s0 ; GFX6-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc ; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, 0, v[0:1] -; GFX6-NEXT: s_cmp_lt_u32 s0, 64 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] -; GFX6-NEXT: s_cselect_b32 s3, 1, 0 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX6-NEXT: v_lshl_b64 v[2:3], v[6:7], s2 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX6-NEXT: v_xor_b32_e32 v10, v0, v8 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], s0 -; GFX6-NEXT: s_cmp_eq_u32 s0, 0 -; GFX6-NEXT: s_cselect_b32 s4, 1, 0 -; GFX6-NEXT: v_ashr_i64 v[8:9], v[6:7], s0 -; GFX6-NEXT: s_and_b32 s0, 1, s3 -; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_ashr_i64 v[0:1], v[6:7], s1 -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX6-NEXT: s_and_b32 s0, 1, s4 -; 
GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX6-NEXT: s_and_b32 s0, 1, s3 -; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v7 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v11, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v11, v9, vcc -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v7 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0, v1 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX6-NEXT: v_xor_b32_e32 v0, v0, v8 ; GFX6-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc -; GFX6-NEXT: v_and_b32_e32 v8, 1, v10 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc +; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: saddsat_i128_sv: @@ -5747,55 +5638,30 @@ ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v6, v2, vcc ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] -; GFX8-NEXT: s_movk_i32 s0, 0x7f ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[6:7] -; GFX8-NEXT: s_sub_i32 s1, s0, 64 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], 
v[6:7] -; GFX8-NEXT: s_sub_i32 s2, 64, s0 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, 0, v[0:1] -; GFX8-NEXT: s_cmp_lt_u32 s0, 64 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] -; GFX8-NEXT: s_cselect_b32 s3, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[2:3], s2, v[6:7] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX8-NEXT: v_xor_b32_e32 v10, v0, v8 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], s0, v[4:5] -; GFX8-NEXT: s_cmp_eq_u32 s0, 0 -; GFX8-NEXT: s_cselect_b32 s4, 1, 0 -; GFX8-NEXT: v_ashrrev_i64 v[8:9], s0, v[6:7] -; GFX8-NEXT: s_and_b32 s0, 1, s3 -; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX8-NEXT: v_ashrrev_i64 v[0:1], s1, v[6:7] -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_and_b32 s0, 1, s4 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_and_b32 s0, 1, s3 -; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v11, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v9, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v7 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0, v1 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v0, v8 ; GFX8-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc -; GFX8-NEXT: v_and_b32_e32 v8, 1, v10 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; 
GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: saddsat_i128_sv: @@ -5808,55 +5674,30 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v2, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v3, vcc ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] -; GFX9-NEXT: s_movk_i32 s0, 0x7f ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[6:7] -; GFX9-NEXT: s_sub_i32 s1, s0, 64 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[6:7] -; GFX9-NEXT: s_sub_i32 s2, 64, s0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, 0, v[0:1] -; GFX9-NEXT: s_cmp_lt_u32 s0, 64 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] -; GFX9-NEXT: s_cselect_b32 s3, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[2:3], s2, v[6:7] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-NEXT: v_xor_b32_e32 v10, v0, v8 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], s0, v[4:5] -; GFX9-NEXT: s_cmp_eq_u32 s0, 0 -; GFX9-NEXT: s_cselect_b32 s4, 1, 0 -; GFX9-NEXT: v_ashrrev_i64 v[8:9], s0, v[6:7] -; GFX9-NEXT: s_and_b32 s0, 1, s3 -; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-NEXT: v_ashrrev_i64 v[0:1], s1, v[6:7] -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_and_b32 s0, 1, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: 
s_and_b32 s0, 1, s3 -; GFX9-NEXT: v_ashrrev_i32_e32 v11, 31, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v7 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NEXT: v_xor_b32_e32 v0, v0, v8 ; GFX9-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc -; GFX9-NEXT: v_and_b32_e32 v8, 1, v10 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v8, vcc +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: saddsat_i128_sv: @@ -5867,48 +5708,25 @@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5] -; GFX10-NEXT: s_movk_i32 s0, 0x7f -; GFX10-NEXT: s_sub_i32 s1, 64, s0 -; GFX10-NEXT: v_lshrrev_b64 v[15:16], s0, v[4:5] ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[6:7] -; GFX10-NEXT: 
v_cndmask_b32_e32 v10, v9, v8, vcc_lo ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, 0, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[8:9], s1, v[6:7] -; GFX10-NEXT: s_sub_i32 s1, s0, 64 -; GFX10-NEXT: s_cmp_lt_u32 s0, 64 -; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[6:7] -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] -; GFX10-NEXT: v_or_b32_e32 v8, v15, v8 -; GFX10-NEXT: v_or_b32_e32 v9, v16, v9 -; GFX10-NEXT: v_ashrrev_i32_e32 v15, 31, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[6:7] +; GFX10-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] -; GFX10-NEXT: v_ashrrev_i64 v[2:3], s1, v[6:7] -; GFX10-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc_lo -; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s0, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo -; GFX10-NEXT: s_and_b32 s0, 1, s1 -; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 -; GFX10-NEXT: v_xor_b32_e32 v9, v11, v10 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v15, v0, s0 -; GFX10-NEXT: v_and_b32_e32 v8, 1, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v15, v1, s0 -; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc_lo +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v7 +; GFX10-NEXT: v_xor_b32_e32 v0, v3, v8 +; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v1, 0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 
0, v1, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0x80000000, v1, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v3, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 @@ -5933,7 +5751,7 @@ ; GFX6-NEXT: v_cmp_lt_u64_e64 s[0:1], s[0:1], 0 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3] -; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v7 +; GFX6-NEXT: v_bfrev_b32_e32 v8, 1 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc @@ -5941,44 +5759,19 @@ ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[2:3], 0 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX6-NEXT: s_movk_i32 s0, 0x7f ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX6-NEXT: s_sub_i32 s2, 64, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, 64 -; GFX6-NEXT: v_xor_b32_e32 v10, v1, v0 -; GFX6-NEXT: s_cmp_lt_u32 s0, 64 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], s0 -; GFX6-NEXT: v_lshl_b64 v[2:3], v[6:7], s2 -; GFX6-NEXT: s_cselect_b32 s3, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s0, 0 -; GFX6-NEXT: s_cselect_b32 s4, 1, 0 -; GFX6-NEXT: v_ashr_i64 v[8:9], v[6:7], s0 -; GFX6-NEXT: s_and_b32 s0, 1, s3 -; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_ashr_i64 v[0:1], v[6:7], s1 -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX6-NEXT: s_and_b32 s0, 1, s4 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX6-NEXT: s_and_b32 s0, 1, s3 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v11, v8, vcc -; GFX6-NEXT: 
v_cndmask_b32_e32 v3, v11, v9, vcc -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc -; GFX6-NEXT: v_and_b32_e32 v8, 1, v10 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v7 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0, v1 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc +; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: saddsat_i128_vs: @@ -6001,48 +5794,23 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 -; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v7 +; GFX8-NEXT: v_bfrev_b32_e32 v8, 1 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s4 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_movk_i32 s0, 0x7f ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: s_sub_i32 s2, 64, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, 64 -; GFX8-NEXT: v_xor_b32_e32 v10, v1, v0 -; GFX8-NEXT: s_cmp_lt_u32 s0, 64 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], s0, v[4:5] -; GFX8-NEXT: v_lshlrev_b64 v[2:3], s2, v[6:7] -; GFX8-NEXT: s_cselect_b32 s3, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s0, 0 -; GFX8-NEXT: s_cselect_b32 s4, 1, 0 -; GFX8-NEXT: v_ashrrev_i64 v[8:9], s0, v[6:7] 
-; GFX8-NEXT: s_and_b32 s0, 1, s3 -; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX8-NEXT: v_ashrrev_i64 v[0:1], s1, v[6:7] -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_and_b32 s0, 1, s4 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_and_b32 s0, 1, s3 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v11, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v9, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc -; GFX8-NEXT: v_and_b32_e32 v8, 1, v10 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v7 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0, v1 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: saddsat_i128_vs: @@ -6065,111 +5833,63 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 -; GFX9-NEXT: v_ashrrev_i32_e32 v11, 31, v7 +; GFX9-NEXT: 
v_bfrev_b32_e32 v8, 1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s4 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_movk_i32 s0, 0x7f ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: s_sub_i32 s2, 64, s0 -; GFX9-NEXT: s_sub_i32 s1, s0, 64 -; GFX9-NEXT: v_xor_b32_e32 v10, v1, v0 -; GFX9-NEXT: s_cmp_lt_u32 s0, 64 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], s0, v[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[2:3], s2, v[6:7] -; GFX9-NEXT: s_cselect_b32 s3, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s0, 0 -; GFX9-NEXT: s_cselect_b32 s4, 1, 0 -; GFX9-NEXT: v_ashrrev_i64 v[8:9], s0, v[6:7] -; GFX9-NEXT: s_and_b32 s0, 1, s3 -; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-NEXT: v_ashrrev_i64 v[0:1], s1, v[6:7] -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_and_b32 s0, 1, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_and_b32 s0, 1, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc -; GFX9-NEXT: v_and_b32_e32 v8, 1, v10 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v7 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, 
vcc +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v8, vcc +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: saddsat_i128_vs: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v10, v3 +; GFX10-NEXT: v_mov_b32_e32 v13, v2 +; GFX10-NEXT: v_mov_b32_e32 v14, v3 ; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0 -; GFX10-NEXT: v_add_co_u32_e64 v15, vcc_lo, v5, s0 +; GFX10-NEXT: v_add_co_u32_e64 v11, vcc_lo, v5, s0 ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, s1, v6, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, s1, v6, vcc_lo ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, s2, v9, vcc_lo -; GFX10-NEXT: s_and_b32 s1, 1, s4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, s3, v10, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[15:16], v[5:6] +; GFX10-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, s2, v13, vcc_lo +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, s3, v14, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[11:12], v[5:6] ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[2:3], 0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v20 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[19:20], v[9:10] -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[19:20], v[9:10] +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[15:16], v[13:14] ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 -; GFX10-NEXT: s_movk_i32 s0, 0x7f -; GFX10-NEXT: s_sub_i32 s2, 64, s0 
-; GFX10-NEXT: v_cndmask_b32_e32 v10, v1, v0, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], s0, v[15:16] -; GFX10-NEXT: v_lshlrev_b64 v[2:3], s2, v[19:20] -; GFX10-NEXT: s_sub_i32 s1, s0, 64 -; GFX10-NEXT: s_cmp_lt_u32 s0, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v9, v8, vcc_lo -; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 -; GFX10-NEXT: v_ashrrev_i64 v[8:9], s1, v[19:20] -; GFX10-NEXT: s_cmp_eq_u32 s0, 0 -; GFX10-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX10-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[19:20] -; GFX10-NEXT: s_and_b32 s0, 1, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo -; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo +; GFX10-NEXT: s_and_b32 s0, 1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[15:16], v[13:14] +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 -; GFX10-NEXT: v_xor_b32_e32 v9, v11, v10 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v15, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v16, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v0, s0 -; GFX10-NEXT: v_and_b32_e32 v8, 1, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v7, v1, s0 -; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v16 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v1, 0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 
v9, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v15, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v16, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v19, v8, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v20, v9, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v12, v3, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v15, v8, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v16, v9, s0 ; GFX10-NEXT: ; return to shader part epilog %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs) %cast = bitcast i128 %result to <4 x float> @@ -6185,103 +5905,58 @@ ; GFX6-NEXT: v_addc_u32_e32 v18, vcc, v2, v10, vcc ; GFX6-NEXT: v_addc_u32_e32 v19, vcc, v3, v11, vcc ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[0:1] -; GFX6-NEXT: s_movk_i32 s6, 0x7f +; GFX6-NEXT: v_bfrev_b32_e32 v20, 1 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[18:19], v[2:3] -; GFX6-NEXT: s_sub_i32 s7, s6, 64 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[18:19], v[2:3] -; GFX6-NEXT: s_sub_i32 s8, 64, s6 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, 0, v[8:9] -; GFX6-NEXT: s_cmp_lt_u32 s6, 64 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11] -; GFX6-NEXT: s_cselect_b32 s4, 1, 0 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; GFX6-NEXT: s_cmp_eq_u32 s6, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX6-NEXT: v_xor_b32_e32 v10, v1, v0 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[16:17], s6 -; GFX6-NEXT: v_lshl_b64 v[2:3], v[18:19], s8 -; GFX6-NEXT: s_cselect_b32 s5, 1, 0 -; GFX6-NEXT: s_and_b32 s4, 1, s4 -; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_ashr_i64 v[0:1], v[18:19], s7 -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX6-NEXT: s_and_b32 s4, 1, s5 -; GFX6-NEXT: v_ashr_i64 v[8:9], v[18:19], s6 -; GFX6-NEXT: v_cndmask_b32_e32 
v0, v0, v2, vcc -; GFX6-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v19 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v16, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v11, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v11, v9, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5] -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_bfrev_b32_e32 v20, 1 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v20, vcc -; GFX6-NEXT: v_and_b32_e32 v8, 1, v10 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v19 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0, v1 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc +; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v1, v20, vcc +; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v18, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v19, v9, vcc ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v4, v12 ; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v5, v13, vcc ; GFX6-NEXT: v_addc_u32_e32 v10, vcc, v6, v14, vcc ; GFX6-NEXT: v_addc_u32_e32 v11, vcc, v7, v15, vcc ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GFX6-NEXT: s_cmp_lt_u32 s6, 64 ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] -; GFX6-NEXT: s_cselect_b32 s4, 1, 0 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] -; GFX6-NEXT: s_cmp_eq_u32 s6, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v5, v4, 
vcc ; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, 0, v[12:13] -; GFX6-NEXT: s_cselect_b32 s5, 1, 0 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15] -; GFX6-NEXT: v_ashr_i64 v[12:13], v[10:11], s6 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] -; GFX6-NEXT: s_and_b32 s5, 1, s5 ; GFX6-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX6-NEXT: v_xor_b32_e32 v14, v5, v4 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[8:9], s6 -; GFX6-NEXT: v_lshl_b64 v[6:7], v[10:11], s8 -; GFX6-NEXT: s_and_b32 s6, 1, s4 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v6 -; GFX6-NEXT: v_or_b32_e32 v7, v5, v7 -; GFX6-NEXT: v_ashr_i64 v[4:5], v[10:11], s7 -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 -; GFX6-NEXT: s_and_b32 s4, 1, s4 -; GFX6-NEXT: v_ashrrev_i32_e32 v15, 31, v11 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX6-NEXT: v_cndmask_b32_e32 v6, v15, v12, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, v15, v13, vcc -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0, v4 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v7, v20, vcc -; GFX6-NEXT: v_and_b32_e32 v12, 1, v14 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc +; GFX6-NEXT: v_xor_b32_e32 v4, v5, v4 +; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v11 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, 0, v5 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v12, vcc, 0, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v13, vcc, v5, v20, vcc +; GFX6-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_cmp_ne_u32_e32 
vcc, 0, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v7, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, v11, v13, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_v2i128: @@ -6292,103 +5967,58 @@ ; GFX8-NEXT: v_addc_u32_e32 v18, vcc, v2, v10, vcc ; GFX8-NEXT: v_addc_u32_e32 v19, vcc, v3, v11, vcc ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[0:1] -; GFX8-NEXT: s_movk_i32 s6, 0x7f +; GFX8-NEXT: v_bfrev_b32_e32 v20, 1 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[18:19], v[2:3] -; GFX8-NEXT: s_sub_i32 s7, s6, 64 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[18:19], v[2:3] -; GFX8-NEXT: s_sub_i32 s8, 64, s6 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, 0, v[8:9] -; GFX8-NEXT: s_cmp_lt_u32 s6, 64 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11] -; GFX8-NEXT: s_cselect_b32 s4, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; GFX8-NEXT: s_cmp_eq_u32 s6, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v10, v1, v0 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], s6, v[16:17] -; GFX8-NEXT: v_lshlrev_b64 v[2:3], s8, v[18:19] -; GFX8-NEXT: s_cselect_b32 s5, 1, 0 -; GFX8-NEXT: s_and_b32 s4, 1, s4 -; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX8-NEXT: v_ashrrev_i64 v[0:1], s7, v[18:19] -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX8-NEXT: s_and_b32 s4, 1, s5 -; GFX8-NEXT: v_ashrrev_i64 v[8:9], s6, v[18:19] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v19 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v16, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v11, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, 
v11, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_bfrev_b32_e32 v20, 1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v20, vcc -; GFX8-NEXT: v_and_b32_e32 v8, 1, v10 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v19 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0, v1 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v1, v20, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v9, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v12 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v5, v13, vcc ; GFX8-NEXT: v_addc_u32_e32 v10, vcc, v6, v14, vcc ; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v7, v15, vcc ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GFX8-NEXT: s_cmp_lt_u32 s6, 64 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] -; GFX8-NEXT: s_cselect_b32 s4, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] -; GFX8-NEXT: s_cmp_eq_u32 s6, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, 0, v[12:13] -; GFX8-NEXT: s_cselect_b32 s5, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15] -; GFX8-NEXT: v_ashrrev_i64 v[12:13], s6, v[10:11] ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: 
v_cmp_eq_u64_e32 vcc, 0, v[14:15] -; GFX8-NEXT: s_and_b32 s5, 1, s5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX8-NEXT: v_xor_b32_e32 v14, v5, v4 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], s6, v[8:9] -; GFX8-NEXT: v_lshlrev_b64 v[6:7], s8, v[10:11] -; GFX8-NEXT: s_and_b32 s6, 1, s4 -; GFX8-NEXT: v_or_b32_e32 v6, v4, v6 -; GFX8-NEXT: v_or_b32_e32 v7, v5, v7 -; GFX8-NEXT: v_ashrrev_i64 v[4:5], s7, v[10:11] -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 -; GFX8-NEXT: s_and_b32 s4, 1, s4 -; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v15, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v13, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0, v4 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v20, vcc -; GFX8-NEXT: v_and_b32_e32 v12, 1, v14 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc +; GFX8-NEXT: v_xor_b32_e32 v4, v5, v4 +; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v11 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0, v5 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v5, vcc +; GFX8-NEXT: v_addc_u32_e32 v13, vcc, v5, v20, vcc +; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v13, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_v2i128: @@ 
-6399,103 +6029,58 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, v2, v10, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, v3, v11, vcc ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[0:1] -; GFX9-NEXT: s_movk_i32 s6, 0x7f +; GFX9-NEXT: v_bfrev_b32_e32 v20, 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[18:19], v[2:3] -; GFX9-NEXT: s_sub_i32 s7, s6, 64 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[18:19], v[2:3] -; GFX9-NEXT: s_sub_i32 s8, 64, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, 0, v[8:9] -; GFX9-NEXT: s_cmp_lt_u32 s6, 64 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11] -; GFX9-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; GFX9-NEXT: s_cmp_eq_u32 s6, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_xor_b32_e32 v10, v1, v0 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], s6, v[16:17] -; GFX9-NEXT: v_lshlrev_b64 v[2:3], s8, v[18:19] -; GFX9-NEXT: s_cselect_b32 s5, 1, 0 -; GFX9-NEXT: s_and_b32 s4, 1, s4 -; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-NEXT: v_ashrrev_i64 v[0:1], s7, v[18:19] -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX9-NEXT: s_and_b32 s4, 1, s5 -; GFX9-NEXT: v_ashrrev_i64 v[8:9], s6, v[18:19] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_ashrrev_i32_e32 v11, 31, v19 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v16, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5] -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_bfrev_b32_e32 v20, 1 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc -; GFX9-NEXT: 
v_addc_co_u32_e32 v3, vcc, v3, v20, vcc -; GFX9-NEXT: v_and_b32_e32 v8, 1, v10 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v19 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v1, v20, vcc +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v9, vcc ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v4, v12 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v5, v13, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v6, v14, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v7, v15, vcc ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GFX9-NEXT: s_cmp_lt_u32 s6, 64 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] -; GFX9-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] -; GFX9-NEXT: s_cmp_eq_u32 s6, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, 0, v[12:13] -; GFX9-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15] -; GFX9-NEXT: v_ashrrev_i64 v[12:13], s6, v[10:11] ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] -; GFX9-NEXT: s_and_b32 s5, 1, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX9-NEXT: v_xor_b32_e32 v14, v5, v4 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], s6, v[8:9] -; GFX9-NEXT: v_lshlrev_b64 
v[6:7], s8, v[10:11] -; GFX9-NEXT: s_and_b32 s6, 1, s4 -; GFX9-NEXT: v_or_b32_e32 v6, v4, v6 -; GFX9-NEXT: v_or_b32_e32 v7, v5, v7 -; GFX9-NEXT: v_ashrrev_i64 v[4:5], s7, v[10:11] -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 -; GFX9-NEXT: s_and_b32 s4, 1, s4 -; GFX9-NEXT: v_ashrrev_i32_e32 v15, 31, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v13, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v20, vcc -; GFX9-NEXT: v_and_b32_e32 v12, 1, v14 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc +; GFX9-NEXT: v_xor_b32_e32 v4, v5, v4 +; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v11 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 0, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v5, v20, vcc +; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v13, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_saddsat_v2i128: @@ -6506,110 +6091,67 @@ ; GFX10-NEXT: v_mov_b32_e32 v23, v1 ; GFX10-NEXT: v_mov_b32_e32 v20, v2 ; GFX10-NEXT: v_mov_b32_e32 v21, v3 -; GFX10-NEXT: s_movk_i32 s5, 0x7f +; GFX10-NEXT: v_mov_b32_e32 v29, v4 ; 
GFX10-NEXT: v_add_co_u32_e64 v16, vcc_lo, v22, v8 -; GFX10-NEXT: s_sub_i32 s6, 64, s5 +; GFX10-NEXT: v_mov_b32_e32 v30, v5 ; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, v23, v9, vcc_lo -; GFX10-NEXT: s_sub_i32 s7, s5, 64 +; GFX10-NEXT: v_mov_b32_e32 v24, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, v20, v10, vcc_lo -; GFX10-NEXT: s_cmp_lt_u32 s5, 64 +; GFX10-NEXT: v_mov_b32_e32 v25, v7 ; GFX10-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, v21, v11, vcc_lo ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[22:23] -; GFX10-NEXT: v_mov_b32_e32 v26, v4 -; GFX10-NEXT: v_mov_b32_e32 v27, v5 -; GFX10-NEXT: v_mov_b32_e32 v24, v6 -; GFX10-NEXT: v_lshlrev_b64 v[2:3], s6, v[18:19] -; GFX10-NEXT: v_mov_b32_e32 v25, v7 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[20:21] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[20:21] -; GFX10-NEXT: v_cndmask_b32_e32 v20, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, 0, v[8:9] -; GFX10-NEXT: v_lshrrev_b64 v[0:1], s5, v[16:17] ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11] -; GFX10-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX10-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX10-NEXT: v_ashrrev_i64 v[0:1], s5, v[18:19] ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[20:21] +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] -; GFX10-NEXT: v_ashrrev_i32_e32 v11, 31, v19 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc_lo -; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 -; GFX10-NEXT: v_ashrrev_i64 v[8:9], s7, v[18:19] -; GFX10-NEXT: s_cmp_eq_u32 s5, 0 -; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: s_and_b32 s8, 1, vcc_lo -; GFX10-NEXT: s_and_b32 s4, 1, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo -; 
GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 -; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s8 -; GFX10-NEXT: v_xor_b32_e32 v9, v10, v20 -; GFX10-NEXT: s_cmp_lt_u32 s5, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v11, v1, s4 -; GFX10-NEXT: v_and_b32_e32 v8, 1, v9 -; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, 0, v0, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v21, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_add_co_u32_e64 v8, s4, v26, v12 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v27, v13, s4 -; GFX10-NEXT: v_add_co_ci_u32_e64 v10, s4, v24, v14, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v11, s4, v25, v15, s4 -; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[8:9], v[26:27] -; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v20, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[3:4], s5, v[8:9] -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v31, vcc_lo, v29, v12 +; GFX10-NEXT: v_add_co_ci_u32_e32 v32, vcc_lo, v30, v13, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v24, v14, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v25, v15, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[31:32], v[29:30] +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v19 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_cmp_eq_u64_e64 s5, v[10:11], v[24:25] +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[10:11], v[24:25] -; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4 +; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, 
v1, 0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 ; GFX10-NEXT: v_cmp_gt_u64_e64 s4, 0, v[12:13] -; GFX10-NEXT: v_lshlrev_b64 v[12:13], s6, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s4 ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[14:15] -; GFX10-NEXT: v_or_b32_e32 v12, v3, v12 -; GFX10-NEXT: v_or_b32_e32 v13, v4, v13 -; GFX10-NEXT: v_ashrrev_i64 v[3:4], s5, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s4 -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, v[10:11], v[24:25] -; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v5, s4 -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, 0, v[14:15] -; GFX10-NEXT: v_ashrrev_i64 v[5:6], s7, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e64 v14, v18, v17, s4 -; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s5, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v12, s4 -; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4 -; GFX10-NEXT: s_and_b32 s5, 1, s6 -; GFX10-NEXT: s_and_b32 s6, 1, s4 -; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5 -; GFX10-NEXT: v_xor_b32_e32 v7, v14, v7 -; GFX10-NEXT: v_ashrrev_i32_e32 v18, 31, v11 -; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v9, s4 -; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v18, v3, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v18, v4, s5 -; GFX10-NEXT: v_add_co_u32_e64 v5, s4, v5, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s4, 0, v6, s4 -; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v7 -; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, 0, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v21, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v12, s4, 0x80000000, v4, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v6, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v7, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v11, v12, s5 +; GFX10-NEXT: 
v_cndmask_b32_e64 v13, 0, 1, s4 +; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v4, s5 +; GFX10-NEXT: v_cmp_eq_u64_e64 s5, 0, v[14:15] +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0x80000000, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v17, v3, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v13, v12, s5 +; GFX10-NEXT: v_xor_b32_e32 v4, v7, v0 +; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v16, v2, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v5, s4 +; GFX10-NEXT: v_and_b32_e32 v3, 1, v4 +; GFX10-NEXT: v_add_co_u32_e64 v4, vcc_lo, v7, 0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v7, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v3 +; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, 0x80000000, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v31, v4, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v32, v5, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v12, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v11, v14, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) ret <2 x i128> %result @@ -6635,74 +6177,53 @@ ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_addc_u32 s19, s3, s11 -; GFX6-NEXT: s_movk_i32 s20, 0x7f ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[0:1] ; GFX6-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], 0 -; GFX6-NEXT: s_sub_i32 s21, s20, 64 -; GFX6-NEXT: s_sub_i32 s22, 64, s20 -; GFX6-NEXT: s_cmp_lt_u32 s20, 64 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[18:19], v[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX6-NEXT: s_cselect_b32 s23, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s20, 0 ; GFX6-NEXT: 
v_cmp_lt_i64_e64 s[0:1], s[10:11], 0 -; GFX6-NEXT: s_cselect_b32 s24, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 -; GFX6-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 +; GFX6-NEXT: s_ashr_i32 s3, s19, 31 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX6-NEXT: s_add_u32 s0, s3, 0 +; GFX6-NEXT: s_cselect_b32 s1, 1, 0 +; GFX6-NEXT: s_and_b32 s1, s1, 1 +; GFX6-NEXT: s_cmp_lg_u32 s1, 0 +; GFX6-NEXT: s_addc_u32 s1, s3, 0 +; GFX6-NEXT: s_cselect_b32 s2, 1, 0 +; GFX6-NEXT: s_and_b32 s2, s2, 1 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: s_addc_u32 s2, s3, 0 +; GFX6-NEXT: s_cselect_b32 s9, 1, 0 +; GFX6-NEXT: s_and_b32 s9, s9, 1 ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[10:11], 0 -; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX6-NEXT: s_ashr_i32 s8, s19, 31 -; GFX6-NEXT: s_ashr_i64 s[0:1], s[18:19], s20 -; GFX6-NEXT: s_ashr_i64 s[10:11], s[18:19], s21 -; GFX6-NEXT: s_and_b32 s23, s23, 1 -; GFX6-NEXT: s_cmp_lg_u32 s23, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX6-NEXT: s_and_b32 s10, s24, 1 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] -; GFX6-NEXT: s_cmp_lg_u32 s23, 0 -; GFX6-NEXT: s_mov_b32 s9, s8 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] -; GFX6-NEXT: s_add_u32 s2, s2, 0 -; GFX6-NEXT: s_cselect_b32 s8, 1, 0 -; GFX6-NEXT: s_and_b32 s8, s8, 1 -; GFX6-NEXT: s_cmp_lg_u32 s8, 0 -; GFX6-NEXT: s_addc_u32 s3, s3, 0 -; GFX6-NEXT: s_cselect_b32 s8, 1, 0 -; GFX6-NEXT: s_and_b32 s8, s8, 1 -; GFX6-NEXT: s_cmp_lg_u32 s8, 0 -; GFX6-NEXT: s_addc_u32 s0, s0, 0 -; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: s_brev_b32 s8, 1 +; GFX6-NEXT: s_cmp_lg_u32 s9, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX6-NEXT: s_and_b32 s8, s8, 1 +; GFX6-NEXT: s_addc_u32 s3, s3, s8 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_brev_b32 s23, 1 -; GFX6-NEXT: s_cmp_lg_u32 s8, 0 -; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_addc_u32 s1, s1, s23 -; GFX6-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v3, s16 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: s_add_u32 s0, s4, s12 -; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: s_cselect_b32 s1, 1, 0 ; GFX6-NEXT: s_and_b32 s1, s1, 1 ; GFX6-NEXT: s_cmp_lg_u32 s1, 0 +; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_addc_u32 s1, s5, s13 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_cselect_b32 s2, 1, 0 ; GFX6-NEXT: s_and_b32 s2, s2, 1 ; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: v_mov_b32_e32 v2, s3 ; GFX6-NEXT: v_mov_b32_e32 v4, s17 +; GFX6-NEXT: v_mov_b32_e32 v3, s16 +; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_mov_b32_e32 v3, s19 ; GFX6-NEXT: s_addc_u32 s2, s6, s14 ; GFX6-NEXT: s_cselect_b32 s3, 1, 0 @@ -6719,59 +6240,41 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] ; GFX6-NEXT: v_cmp_lt_u64_e64 s[4:5], s[12:13], 0 -; GFX6-NEXT: s_cmp_lt_u32 s20, 64 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s20, 0 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], s[14:15], 0 -; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s20 -; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s22 +; GFX6-NEXT: s_ashr_i32 s7, s3, 31 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] -; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX6-NEXT: s_ashr_i32 s8, s3, 31 -; GFX6-NEXT: s_ashr_i64 s[4:5], s[2:3], s20 -; GFX6-NEXT: s_ashr_i64 s[10:11], s[2:3], s21 -; GFX6-NEXT: s_and_b32 s12, s12, 1 -; 
GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX6-NEXT: s_and_b32 s10, s13, 1 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 -; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] -; GFX6-NEXT: s_mov_b32 s9, s8 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] -; GFX6-NEXT: s_add_u32 s6, s6, 0 -; GFX6-NEXT: s_cselect_b32 s8, 1, 0 -; GFX6-NEXT: s_and_b32 s8, s8, 1 -; GFX6-NEXT: s_cmp_lg_u32 s8, 0 -; GFX6-NEXT: s_addc_u32 s7, s7, 0 -; GFX6-NEXT: s_cselect_b32 s8, 1, 0 -; GFX6-NEXT: s_and_b32 s8, s8, 1 -; GFX6-NEXT: s_cmp_lg_u32 s8, 0 +; GFX6-NEXT: s_add_u32 s4, s7, 0 +; GFX6-NEXT: s_cselect_b32 s5, 1, 0 +; GFX6-NEXT: s_and_b32 s5, s5, 1 +; GFX6-NEXT: s_cmp_lg_u32 s5, 0 +; GFX6-NEXT: s_addc_u32 s5, s7, 0 +; GFX6-NEXT: s_cselect_b32 s6, 1, 0 +; GFX6-NEXT: s_and_b32 s6, s6, 1 +; GFX6-NEXT: s_cmp_lg_u32 s6, 0 ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[14:15], 0 -; GFX6-NEXT: s_addc_u32 s4, s4, 0 -; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: s_addc_u32 s6, s7, 0 +; GFX6-NEXT: s_cselect_b32 s9, 1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_and_b32 s8, s8, 1 -; GFX6-NEXT: s_cmp_lg_u32 s8, 0 +; GFX6-NEXT: s_and_b32 s9, s9, 1 +; GFX6-NEXT: s_cmp_lg_u32 s9, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, s0 ; GFX6-NEXT: v_mov_b32_e32 v8, s1 -; GFX6-NEXT: s_addc_u32 s5, s5, s23 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: s_addc_u32 s7, s7, s8 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, s7 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v8, s2 -; GFX6-NEXT: v_mov_b32_e32 v3, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: v_mov_b32_e32 v9, s3 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; 
GFX6-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; GFX6-NEXT: v_readfirstlane_b32 s0, v5 @@ -6816,68 +6319,47 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: s_movk_i32 s20, 0x7f ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 -; GFX8-NEXT: s_sub_i32 s21, s20, 64 -; GFX8-NEXT: s_sub_i32 s22, 64, s20 -; GFX8-NEXT: s_cmp_lt_u32 s20, 64 -; GFX8-NEXT: s_cselect_b32 s23, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s20, 0 -; GFX8-NEXT: s_cselect_b32 s24, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 -; GFX8-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 +; GFX8-NEXT: s_ashr_i32 s3, s19, 31 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX8-NEXT: s_ashr_i32 s8, s19, 31 -; GFX8-NEXT: s_ashr_i64 s[0:1], s[18:19], s20 -; GFX8-NEXT: s_ashr_i64 s[10:11], s[18:19], s21 -; GFX8-NEXT: s_and_b32 s23, s23, 1 -; GFX8-NEXT: s_cmp_lg_u32 s23, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX8-NEXT: s_and_b32 s10, s24, 1 -; GFX8-NEXT: s_cmp_lg_u32 s10, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] -; GFX8-NEXT: s_cmp_lg_u32 s23, 0 -; GFX8-NEXT: s_mov_b32 s9, s8 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] -; GFX8-NEXT: s_add_u32 s2, s2, 0 -; GFX8-NEXT: s_cselect_b32 s8, 1, 0 -; GFX8-NEXT: s_and_b32 s8, s8, 1 -; GFX8-NEXT: s_cmp_lg_u32 s8, 0 -; GFX8-NEXT: s_addc_u32 s3, s3, 0 -; GFX8-NEXT: s_cselect_b32 s8, 1, 0 -; GFX8-NEXT: s_and_b32 s8, s8, 1 -; GFX8-NEXT: s_cmp_lg_u32 s8, 0 -; GFX8-NEXT: s_addc_u32 s0, s0, 0 -; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: s_add_u32 s0, s3, 0 +; GFX8-NEXT: s_cselect_b32 s1, 1, 0 +; GFX8-NEXT: s_and_b32 s1, s1, 1 +; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_and_b32 s2, s2, 1 +; GFX8-NEXT: s_cmp_lg_u32 s2, 0 +; GFX8-NEXT: s_addc_u32 s2, s3, 0 
+; GFX8-NEXT: s_cselect_b32 s9, 1, 0 +; GFX8-NEXT: s_and_b32 s9, s9, 1 +; GFX8-NEXT: s_brev_b32 s8, 1 +; GFX8-NEXT: s_cmp_lg_u32 s9, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: s_and_b32 s8, s8, 1 +; GFX8-NEXT: s_addc_u32 s3, s3, s8 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: s_brev_b32 s23, 1 -; GFX8-NEXT: s_cmp_lg_u32 s8, 0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_addc_u32 s1, s1, s23 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s16 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: s_add_u32 s0, s4, s12 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: s_cselect_b32 s1, 1, 0 ; GFX8-NEXT: s_and_b32 s1, s1, 1 ; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_addc_u32 s1, s5, s13 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 ; GFX8-NEXT: s_and_b32 s2, s2, 1 ; GFX8-NEXT: s_cmp_lg_u32 s2, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s17 +; GFX8-NEXT: v_mov_b32_e32 v3, s16 ; GFX8-NEXT: s_addc_u32 s2, s6, s14 -; GFX8-NEXT: s_cselect_b32 s3, 1, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_cselect_b32 s3, 1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: v_mov_b32_e32 v3, s19 ; GFX8-NEXT: s_and_b32 s3, s3, 1 @@ -6905,53 +6387,35 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX8-NEXT: s_and_b32 s4, 1, s6 -; GFX8-NEXT: s_cmp_lt_u32 s20, 64 -; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s20, 0 -; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s20 -; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s22 +; GFX8-NEXT: s_ashr_i32 s7, s3, 31 ; 
GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX8-NEXT: s_ashr_i32 s8, s3, 31 -; GFX8-NEXT: s_ashr_i64 s[4:5], s[2:3], s20 -; GFX8-NEXT: s_ashr_i64 s[10:11], s[2:3], s21 -; GFX8-NEXT: s_and_b32 s12, s12, 1 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 -; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX8-NEXT: s_and_b32 s10, s13, 1 -; GFX8-NEXT: s_cmp_lg_u32 s10, 0 -; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] -; GFX8-NEXT: s_mov_b32 s9, s8 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 -; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] -; GFX8-NEXT: s_add_u32 s6, s6, 0 -; GFX8-NEXT: s_cselect_b32 s8, 1, 0 -; GFX8-NEXT: s_and_b32 s8, s8, 1 -; GFX8-NEXT: s_cmp_lg_u32 s8, 0 -; GFX8-NEXT: s_addc_u32 s7, s7, 0 -; GFX8-NEXT: s_cselect_b32 s8, 1, 0 -; GFX8-NEXT: s_and_b32 s8, s8, 1 -; GFX8-NEXT: s_cmp_lg_u32 s8, 0 -; GFX8-NEXT: s_addc_u32 s4, s4, 0 -; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: s_add_u32 s4, s7, 0 +; GFX8-NEXT: s_cselect_b32 s5, 1, 0 +; GFX8-NEXT: s_and_b32 s5, s5, 1 +; GFX8-NEXT: s_cmp_lg_u32 s5, 0 +; GFX8-NEXT: s_addc_u32 s5, s7, 0 +; GFX8-NEXT: s_cselect_b32 s6, 1, 0 +; GFX8-NEXT: s_and_b32 s6, s6, 1 +; GFX8-NEXT: s_cmp_lg_u32 s6, 0 +; GFX8-NEXT: s_addc_u32 s6, s7, 0 +; GFX8-NEXT: s_cselect_b32 s9, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: s_and_b32 s8, s8, 1 -; GFX8-NEXT: s_cmp_lg_u32 s8, 0 +; GFX8-NEXT: s_and_b32 s9, s9, 1 +; GFX8-NEXT: s_cmp_lg_u32 s9, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: v_mov_b32_e32 v8, s1 -; GFX8-NEXT: s_addc_u32 s5, s5, s23 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: s_addc_u32 s7, s7, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: 
v_mov_b32_e32 v8, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v9, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; GFX8-NEXT: v_readfirstlane_b32 s0, v5 @@ -6996,68 +6460,47 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: s_movk_i32 s20, 0x7f ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 -; GFX9-NEXT: s_sub_i32 s21, s20, 64 -; GFX9-NEXT: s_sub_i32 s22, 64, s20 -; GFX9-NEXT: s_cmp_lt_u32 s20, 64 -; GFX9-NEXT: s_cselect_b32 s23, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s20, 0 -; GFX9-NEXT: s_cselect_b32 s24, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 +; GFX9-NEXT: s_ashr_i32 s3, s19, 31 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX9-NEXT: s_ashr_i32 s8, s19, 31 -; GFX9-NEXT: s_ashr_i64 s[0:1], s[18:19], s20 -; GFX9-NEXT: s_ashr_i64 s[10:11], s[18:19], s21 -; GFX9-NEXT: s_and_b32 s23, s23, 1 -; GFX9-NEXT: s_cmp_lg_u32 s23, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX9-NEXT: s_and_b32 s10, s24, 1 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] -; GFX9-NEXT: s_cmp_lg_u32 s23, 0 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] -; GFX9-NEXT: s_add_u32 s2, s2, 0 -; GFX9-NEXT: s_cselect_b32 s8, 1, 0 -; GFX9-NEXT: s_and_b32 s8, s8, 1 -; GFX9-NEXT: s_cmp_lg_u32 s8, 0 -; GFX9-NEXT: s_addc_u32 s3, s3, 0 -; GFX9-NEXT: s_cselect_b32 s8, 1, 0 -; GFX9-NEXT: s_and_b32 s8, s8, 1 -; GFX9-NEXT: s_cmp_lg_u32 s8, 0 -; GFX9-NEXT: s_addc_u32 s0, s0, 0 -; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: s_add_u32 s0, s3, 0 +; GFX9-NEXT: s_cselect_b32 s1, 1, 0 +; GFX9-NEXT: s_and_b32 s1, s1, 
1 +; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: s_addc_u32 s1, s3, 0 +; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_and_b32 s2, s2, 1 +; GFX9-NEXT: s_cmp_lg_u32 s2, 0 +; GFX9-NEXT: s_addc_u32 s2, s3, 0 +; GFX9-NEXT: s_cselect_b32 s9, 1, 0 +; GFX9-NEXT: s_and_b32 s9, s9, 1 +; GFX9-NEXT: s_brev_b32 s8, 1 +; GFX9-NEXT: s_cmp_lg_u32 s9, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: s_and_b32 s8, s8, 1 +; GFX9-NEXT: s_addc_u32 s3, s3, s8 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: s_brev_b32 s23, 1 -; GFX9-NEXT: s_cmp_lg_u32 s8, 0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_addc_u32 s1, s1, s23 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_add_u32 s0, s4, s12 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: s_cselect_b32 s1, 1, 0 ; GFX9-NEXT: s_and_b32 s1, s1, 1 ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_addc_u32 s1, s5, s13 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 ; GFX9-NEXT: s_and_b32 s2, s2, 1 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s16 ; GFX9-NEXT: s_addc_u32 s2, s6, s14 -; GFX9-NEXT: s_cselect_b32 s3, 1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_cselect_b32 s3, 1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: s_and_b32 s3, s3, 1 @@ -7085,53 +6528,35 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX9-NEXT: s_and_b32 s4, 1, s6 -; GFX9-NEXT: s_cmp_lt_u32 s20, 64 -; GFX9-NEXT: s_cselect_b32 s12, 1, 
0 -; GFX9-NEXT: s_cmp_eq_u32 s20, 0 -; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[0:1], s20 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s22 +; GFX9-NEXT: s_ashr_i32 s7, s3, 31 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX9-NEXT: s_ashr_i32 s8, s3, 31 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[2:3], s20 -; GFX9-NEXT: s_ashr_i64 s[10:11], s[2:3], s21 -; GFX9-NEXT: s_and_b32 s12, s12, 1 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 -; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX9-NEXT: s_and_b32 s10, s13, 1 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 -; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 -; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] -; GFX9-NEXT: s_add_u32 s6, s6, 0 -; GFX9-NEXT: s_cselect_b32 s8, 1, 0 -; GFX9-NEXT: s_and_b32 s8, s8, 1 -; GFX9-NEXT: s_cmp_lg_u32 s8, 0 -; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: s_cselect_b32 s8, 1, 0 -; GFX9-NEXT: s_and_b32 s8, s8, 1 -; GFX9-NEXT: s_cmp_lg_u32 s8, 0 -; GFX9-NEXT: s_addc_u32 s4, s4, 0 -; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: s_add_u32 s4, s7, 0 +; GFX9-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-NEXT: s_and_b32 s5, s5, 1 +; GFX9-NEXT: s_cmp_lg_u32 s5, 0 +; GFX9-NEXT: s_addc_u32 s5, s7, 0 +; GFX9-NEXT: s_cselect_b32 s6, 1, 0 +; GFX9-NEXT: s_and_b32 s6, s6, 1 +; GFX9-NEXT: s_cmp_lg_u32 s6, 0 +; GFX9-NEXT: s_addc_u32 s6, s7, 0 +; GFX9-NEXT: s_cselect_b32 s9, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: s_and_b32 s8, s8, 1 -; GFX9-NEXT: s_cmp_lg_u32 s8, 0 +; GFX9-NEXT: s_and_b32 s9, s9, 1 +; GFX9-NEXT: s_cmp_lg_u32 s9, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_mov_b32_e32 v8, s1 -; GFX9-NEXT: s_addc_u32 s5, s5, s23 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_addc_u32 s7, s7, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: 
v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v9, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; GFX9-NEXT: v_readfirstlane_b32 s0, v5 @@ -7146,17 +6571,18 @@ ; ; GFX10-LABEL: s_saddsat_v2i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s28, s0, s8 +; GFX10-NEXT: s_add_u32 s16, s0, s8 ; GFX10-NEXT: s_cselect_b32 s17, 1, 0 ; GFX10-NEXT: s_mov_b32 s46, s0 ; GFX10-NEXT: s_and_b32 s17, s17, 1 ; GFX10-NEXT: s_mov_b32 s47, s1 ; GFX10-NEXT: s_cmp_lg_u32 s17, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_addc_u32 s29, s1, s9 +; GFX10-NEXT: s_addc_u32 s17, s1, s9 ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[28:29], s[46:47] +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[46:47] ; GFX10-NEXT: s_and_b32 s18, s18, 1 +; GFX10-NEXT: v_cmp_lt_u64_e64 s1, s[8:9], 0 ; GFX10-NEXT: s_cmp_lg_u32 s18, 0 ; GFX10-NEXT: s_addc_u32 s30, s2, s10 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 @@ -7166,144 +6592,104 @@ ; GFX10-NEXT: s_addc_u32 s31, s3, s11 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[30:31], s[2:3] ; GFX10-NEXT: s_cmp_eq_u64 s[30:31], s[2:3] -; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[8:9], 0 +; GFX10-NEXT: v_mov_b32_e32 v3, s31 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s0, 1, s20 ; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 -; GFX10-NEXT: s_movk_i32 s20, 0x7f -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 -; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: s_and_b32 s1, 1, s1 -; GFX10-NEXT: s_sub_i32 s21, s20, 64 -; GFX10-NEXT: s_sub_i32 s22, 64, s20 -; 
GFX10-NEXT: s_cmp_lt_u32 s20, 64 +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10-NEXT: s_ashr_i32 s3, s31, 31 +; GFX10-NEXT: s_and_b32 s0, 1, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: s_cselect_b32 s10, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s20, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 -; GFX10-NEXT: s_cselect_b32 s23, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[28:29], s20 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[30:31], s22 -; GFX10-NEXT: s_and_b32 s24, s10, 1 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: s_ashr_i32 s2, s31, 31 -; GFX10-NEXT: s_ashr_i64 s[8:9], s[30:31], s20 -; GFX10-NEXT: s_ashr_i64 s[10:11], s[30:31], s21 -; GFX10-NEXT: s_cmp_lg_u32 s24, 0 -; GFX10-NEXT: s_mov_b32 s3, s2 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] -; GFX10-NEXT: s_and_b32 s10, s23, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s29 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[28:29], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s24, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 +; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[10:11], 0 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: s_add_u32 s0, s3, 0 +; GFX10-NEXT: s_brev_b32 s10, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 +; GFX10-NEXT: s_cselect_b32 s1, 1, 0 +; GFX10-NEXT: s_and_b32 s1, s1, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; GFX10-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s17 +; GFX10-NEXT: s_addc_u32 s1, s3, 0 +; GFX10-NEXT: s_cselect_b32 s2, 1, 0 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] -; GFX10-NEXT: s_add_u32 s0, s0, 0 -; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s28 -; GFX10-NEXT: s_and_b32 s8, s8, 1 +; GFX10-NEXT: s_and_b32 s2, s2, 1 +; GFX10-NEXT: v_mov_b32_e32 v1, s16 +; GFX10-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; 
GFX10-NEXT: s_cmp_lg_u32 s8, 0 -; GFX10-NEXT: s_brev_b32 s23, 1 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_addc_u32 s2, s3, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_mov_b32_e32 v3, s31 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 -; GFX10-NEXT: s_addc_u32 s2, s2, 0 -; GFX10-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10-NEXT: s_addc_u32 s3, s3, s10 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo -; GFX10-NEXT: s_and_b32 s8, s8, 1 +; GFX10-NEXT: s_add_u32 s0, s4, s12 +; GFX10-NEXT: s_cselect_b32 s8, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s8, 0 +; GFX10-NEXT: s_and_b32 s8, s8, 1 ; GFX10-NEXT: v_mov_b32_e32 v2, s30 -; GFX10-NEXT: s_addc_u32 s3, s3, s23 -; GFX10-NEXT: s_add_u32 s0, s4, s12 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 +; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo -; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_addc_u32 s1, s5, s13 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s3, s[0:1], s[4:5] +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] ; GFX10-NEXT: s_and_b32 s8, s8, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 +; GFX10-NEXT: v_cmp_lt_u64_e64 s3, s[12:13], 0 ; GFX10-NEXT: s_addc_u32 s8, s6, s14 ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 ; GFX10-NEXT: s_and_b32 s9, s9, 1 ; GFX10-NEXT: v_mov_b32_e32 v7, s8 ; GFX10-NEXT: s_cmp_lg_u32 s9, 0 ; GFX10-NEXT: s_addc_u32 s9, s7, s15 ; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[6:7] -; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[8:9], s[6:7] +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[6:7] ; GFX10-NEXT: s_cselect_b32 s2, 1, 0 ; GFX10-NEXT: v_mov_b32_e32 v8, s9 ; GFX10-NEXT: 
s_and_b32 s2, 1, s2 ; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 -; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[12:13], 0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3 -; GFX10-NEXT: s_cselect_b32 s3, 1, 0 -; GFX10-NEXT: s_and_b32 s16, 1, s3 -; GFX10-NEXT: s_cmp_lt_u32 s20, 64 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 +; GFX10-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10-NEXT: s_ashr_i32 s5, s9, 31 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s2 -; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[14:15], 0 -; GFX10-NEXT: s_cselect_b32 s10, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s20, 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s16 -; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_lshl_b64 s[4:5], s[8:9], s22 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 -; GFX10-NEXT: s_lshr_b64 s[2:3], s[0:1], s20 -; GFX10-NEXT: s_and_b32 s13, s10, 1 -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX10-NEXT: s_ashr_i32 s4, s9, 31 -; GFX10-NEXT: s_ashr_i64 s[6:7], s[8:9], s20 -; GFX10-NEXT: s_ashr_i64 s[10:11], s[8:9], s21 -; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: s_mov_b32 s5, s4 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX10-NEXT: s_and_b32 s10, s12, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3 +; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[14:15], 0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s3 +; GFX10-NEXT: s_and_b32 s3, 1, s2 +; GFX10-NEXT: s_add_u32 s2, s5, 0 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s3 +; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: s_and_b32 s4, s4, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 +; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: v_mov_b32_e32 v6, s1 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[0:1], s[2:3] -; GFX10-NEXT: s_cmp_lg_u32 s13, 0 +; GFX10-NEXT: s_addc_u32 s3, s5, 0 +; GFX10-NEXT: s_cselect_b32 s4, 1, 0 ; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] -; GFX10-NEXT: 
s_add_u32 s2, s2, 0 -; GFX10-NEXT: s_cselect_b32 s6, 1, 0 +; GFX10-NEXT: s_and_b32 s4, s4, 1 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: s_and_b32 s6, s6, 1 -; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX10-NEXT: s_cmp_lg_u32 s6, 0 +; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX10-NEXT: s_addc_u32 s4, s5, 0 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX10-NEXT: s_and_b32 s6, s6, 1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: s_addc_u32 s4, s4, 0 -; GFX10-NEXT: s_cselect_b32 s6, 1, 0 +; GFX10-NEXT: s_addc_u32 s1, s5, s10 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo -; GFX10-NEXT: s_and_b32 s6, s6, 1 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s6, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo -; GFX10-NEXT: s_addc_u32 s1, s5, s23 -; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v8, s1, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: v_readfirstlane_b32 s4, v4 ; GFX10-NEXT: v_readfirstlane_b32 s5, v5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -5323,59 +5323,38 @@ ; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1] -; GFX6-NEXT: s_movk_i32 s2, 0x7f -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0 -; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[6:7], 0 -; GFX6-NEXT: s_sub_i32 s6, s2, 64 -; GFX6-NEXT: s_sub_i32 s4, 64, s2 -; GFX6-NEXT: s_cmp_lt_u32 s2, 
64 -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s2, 0 -; GFX6-NEXT: s_cselect_b32 s13, 1, 0 +; GFX6-NEXT: s_ashr_i32 s3, s11, 31 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX6-NEXT: s_ashr_i64 s[0:1], s[10:11], s2 -; GFX6-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 -; GFX6-NEXT: s_lshl_b64 s[4:5], s[10:11], s4 -; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX6-NEXT: s_ashr_i32 s4, s11, 31 -; GFX6-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX6-NEXT: s_and_b32 s12, s12, 1 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX6-NEXT: s_and_b32 s6, s13, 1 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] -; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX6-NEXT: s_add_u32 s2, s2, 0 -; GFX6-NEXT: s_cselect_b32 s4, 1, 0 -; GFX6-NEXT: s_and_b32 s4, s4, 1 -; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_addc_u32 s3, s3, 0 -; GFX6-NEXT: s_cselect_b32 s4, 1, 0 -; GFX6-NEXT: s_and_b32 s4, s4, 1 -; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_addc_u32 s0, s0, 0 +; GFX6-NEXT: s_add_u32 s0, s3, 0 +; GFX6-NEXT: s_cselect_b32 s1, 1, 0 +; GFX6-NEXT: s_and_b32 s1, s1, 1 +; GFX6-NEXT: s_cmp_lg_u32 s1, 0 +; GFX6-NEXT: s_addc_u32 s1, s3, 0 +; GFX6-NEXT: s_cselect_b32 s2, 1, 0 +; GFX6-NEXT: s_and_b32 s2, s2, 1 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[6:7], 0 +; GFX6-NEXT: s_addc_u32 s2, s3, 0 ; GFX6-NEXT: s_cselect_b32 s4, 1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_and_b32 s4, s4, 1 ; GFX6-NEXT: s_cmp_lg_u32 s4, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_addc_u32 s1, s1, 0x80000000 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s3 +; GFX6-NEXT: s_addc_u32 s3, s3, 0x80000000 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: 
v_mov_b32_e32 v3, s8 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_mov_b32_e32 v4, s9 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: v_mov_b32_e32 v4, s10 +; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: v_mov_b32_e32 v5, s11 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -5420,55 +5399,34 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 -; GFX8-NEXT: s_movk_i32 s2, 0x7f -; GFX8-NEXT: s_sub_i32 s6, s2, 64 -; GFX8-NEXT: s_sub_i32 s4, 64, s2 -; GFX8-NEXT: s_cmp_lt_u32 s2, 64 -; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s2, 0 -; GFX8-NEXT: s_cselect_b32 s13, 1, 0 +; GFX8-NEXT: s_ashr_i32 s3, s11, 31 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_ashr_i64 s[0:1], s[10:11], s2 -; GFX8-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 -; GFX8-NEXT: s_lshl_b64 s[4:5], s[10:11], s4 -; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX8-NEXT: s_ashr_i32 s4, s11, 31 -; GFX8-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX8-NEXT: s_and_b32 s12, s12, 1 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_and_b32 s6, s13, 1 -; GFX8-NEXT: s_cmp_lg_u32 s6, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] -; GFX8-NEXT: s_mov_b32 s5, s4 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_add_u32 s2, s2, 0 -; GFX8-NEXT: s_cselect_b32 s4, 1, 0 -; GFX8-NEXT: s_and_b32 s4, s4, 1 -; GFX8-NEXT: s_cmp_lg_u32 s4, 0 -; GFX8-NEXT: s_addc_u32 s3, s3, 0 -; GFX8-NEXT: s_cselect_b32 s4, 1, 0 -; GFX8-NEXT: s_and_b32 s4, s4, 1 -; GFX8-NEXT: s_cmp_lg_u32 s4, 0 -; GFX8-NEXT: s_addc_u32 s0, s0, 0 +; GFX8-NEXT: s_add_u32 s0, s3, 0 +; GFX8-NEXT: s_cselect_b32 s1, 1, 0 +; GFX8-NEXT: s_and_b32 
s1, s1, 1 +; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_and_b32 s2, s2, 1 +; GFX8-NEXT: s_cmp_lg_u32 s2, 0 +; GFX8-NEXT: s_addc_u32 s2, s3, 0 ; GFX8-NEXT: s_cselect_b32 s4, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_and_b32 s4, s4, 1 ; GFX8-NEXT: s_cmp_lg_u32 s4, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_addc_u32 s1, s1, 0x80000000 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: s_addc_u32 s3, s3, 0x80000000 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s8 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s9 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -5513,55 +5471,34 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 -; GFX9-NEXT: s_movk_i32 s2, 0x7f -; GFX9-NEXT: s_sub_i32 s6, s2, 64 -; GFX9-NEXT: s_sub_i32 s4, 64, s2 -; GFX9-NEXT: s_cmp_lt_u32 s2, 64 -; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s2, 0 -; GFX9-NEXT: s_cselect_b32 s13, 1, 0 +; GFX9-NEXT: s_ashr_i32 s3, s11, 31 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_ashr_i64 s[0:1], s[10:11], s2 -; GFX9-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[10:11], s4 -; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX9-NEXT: s_ashr_i32 s4, s11, 31 -; GFX9-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX9-NEXT: s_and_b32 s12, s12, 1 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 -; 
GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_and_b32 s6, s13, 1 -; GFX9-NEXT: s_cmp_lg_u32 s6, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] -; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_add_u32 s2, s2, 0 -; GFX9-NEXT: s_cselect_b32 s4, 1, 0 -; GFX9-NEXT: s_and_b32 s4, s4, 1 -; GFX9-NEXT: s_cmp_lg_u32 s4, 0 -; GFX9-NEXT: s_addc_u32 s3, s3, 0 -; GFX9-NEXT: s_cselect_b32 s4, 1, 0 -; GFX9-NEXT: s_and_b32 s4, s4, 1 -; GFX9-NEXT: s_cmp_lg_u32 s4, 0 -; GFX9-NEXT: s_addc_u32 s0, s0, 0 +; GFX9-NEXT: s_add_u32 s0, s3, 0 +; GFX9-NEXT: s_cselect_b32 s1, 1, 0 +; GFX9-NEXT: s_and_b32 s1, s1, 1 +; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: s_addc_u32 s1, s3, 0 +; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_and_b32 s2, s2, 1 +; GFX9-NEXT: s_cmp_lg_u32 s2, 0 +; GFX9-NEXT: s_addc_u32 s2, s3, 0 ; GFX9-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_and_b32 s4, s4, 1 ; GFX9-NEXT: s_cmp_lg_u32 s4, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: s_addc_u32 s3, s3, 0x80000000 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, s9 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_mov_b32_e32 v5, s11 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -5575,9 +5512,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_sub_u32 s8, s0, s4 ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: 
s_movk_i32 s12, 0x7f -; GFX10-NEXT: s_and_b32 s9, s9, 1 ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_and_b32 s9, s9, 1 ; GFX10-NEXT: s_cmp_lg_u32 s9, 0 ; GFX10-NEXT: s_subb_u32 s9, s1, s5 ; GFX10-NEXT: s_cselect_b32 s10, 1, 0 @@ -5601,57 +5537,37 @@ ; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[4:5], 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_sub_i32 s13, s12, 64 -; GFX10-NEXT: s_and_b32 s14, 1, s1 -; GFX10-NEXT: s_sub_i32 s2, 64, s12 +; GFX10-NEXT: s_ashr_i32 s3, s11, 31 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[6:7], 0 -; GFX10-NEXT: s_cmp_lt_u32 s12, 64 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s14 -; GFX10-NEXT: s_cselect_b32 s15, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s12, 0 -; GFX10-NEXT: s_cselect_b32 s16, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s12 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[10:11], s2 -; GFX10-NEXT: s_ashr_i64 s[4:5], s[10:11], s12 -; GFX10-NEXT: s_and_b32 s12, s15, 1 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: s_ashr_i32 s2, s11, 31 -; GFX10-NEXT: s_ashr_i64 s[6:7], s[10:11], s13 -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10-NEXT: s_mov_b32 s3, s2 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] -; GFX10-NEXT: s_and_b32 s6, s16, 1 +; GFX10-NEXT: s_and_b32 s0, 1, s1 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: s_add_u32 s0, s3, 0 +; GFX10-NEXT: s_cselect_b32 s1, 1, 0 +; GFX10-NEXT: s_and_b32 s1, s1, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s6, 0 +; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, s9 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 +; GFX10-NEXT: s_addc_u32 s1, s3, 0 +; GFX10-NEXT: s_cselect_b32 s2, 1, 0 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] -; GFX10-NEXT: 
s_add_u32 s0, s0, 0 -; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: s_and_b32 s2, s2, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, s8 -; GFX10-NEXT: s_and_b32 s4, s4, 1 +; GFX10-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_addc_u32 s2, s3, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_b32 s4, s4, 1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: s_addc_u32 s2, s2, 0 -; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: s_addc_u32 s3, s3, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo -; GFX10-NEXT: s_and_b32 s4, s4, 1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo -; GFX10-NEXT: s_addc_u32 s3, s3, 0x80000000 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s3, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 @@ -5671,55 +5587,30 @@ ; GFX6-NEXT: v_subb_u32_e32 v6, vcc, v6, v2, vcc ; GFX6-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc ; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] -; GFX6-NEXT: s_movk_i32 s0, 0x7f ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[6:7] -; GFX6-NEXT: s_sub_i32 s1, s0, 64 ; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[6:7] -; GFX6-NEXT: s_sub_i32 s2, 64, s0 ; GFX6-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] -; GFX6-NEXT: s_cmp_lt_u32 s0, 64 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3] -; GFX6-NEXT: s_cselect_b32 s3, 1, 0 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; 
GFX6-NEXT: v_lshl_b64 v[2:3], v[6:7], s2 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX6-NEXT: v_xor_b32_e32 v10, v0, v8 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], s0 -; GFX6-NEXT: s_cmp_eq_u32 s0, 0 -; GFX6-NEXT: s_cselect_b32 s4, 1, 0 -; GFX6-NEXT: v_ashr_i64 v[8:9], v[6:7], s0 -; GFX6-NEXT: s_and_b32 s0, 1, s3 -; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_ashr_i64 v[0:1], v[6:7], s1 -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX6-NEXT: s_and_b32 s0, 1, s4 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX6-NEXT: s_and_b32 s0, 1, s3 -; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v7 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v11, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v11, v9, vcc -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v7 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0, v1 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX6-NEXT: v_xor_b32_e32 v0, v0, v8 ; GFX6-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc -; GFX6-NEXT: v_and_b32_e32 v8, 1, v10 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc +; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, 
v7, v8, vcc ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: ssubsat_i128_sv: @@ -5732,55 +5623,30 @@ ; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v6, v2, vcc ; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] -; GFX8-NEXT: s_movk_i32 s0, 0x7f ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[6:7] -; GFX8-NEXT: s_sub_i32 s1, s0, 64 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[6:7] -; GFX8-NEXT: s_sub_i32 s2, 64, s0 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] -; GFX8-NEXT: s_cmp_lt_u32 s0, 64 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3] -; GFX8-NEXT: s_cselect_b32 s3, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[2:3], s2, v[6:7] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX8-NEXT: v_xor_b32_e32 v10, v0, v8 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], s0, v[4:5] -; GFX8-NEXT: s_cmp_eq_u32 s0, 0 -; GFX8-NEXT: s_cselect_b32 s4, 1, 0 -; GFX8-NEXT: v_ashrrev_i64 v[8:9], s0, v[6:7] -; GFX8-NEXT: s_and_b32 s0, 1, s3 -; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX8-NEXT: v_ashrrev_i64 v[0:1], s1, v[6:7] -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_and_b32 s0, 1, s4 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_and_b32 s0, 1, s3 -; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v11, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v9, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: 
v_ashrrev_i32_e32 v1, 31, v7 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0, v1 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v0, v8 ; GFX8-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc -; GFX8-NEXT: v_and_b32_e32 v8, 1, v10 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: ssubsat_i128_sv: @@ -5793,55 +5659,30 @@ ; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v2, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v3, vcc ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] -; GFX9-NEXT: s_movk_i32 s0, 0x7f ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[6:7] -; GFX9-NEXT: s_sub_i32 s1, s0, 64 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[6:7] -; GFX9-NEXT: s_sub_i32 s2, 64, s0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] -; GFX9-NEXT: s_cmp_lt_u32 s0, 64 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3] -; GFX9-NEXT: s_cselect_b32 s3, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[2:3], s2, v[6:7] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-NEXT: v_xor_b32_e32 v10, v0, v8 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 
s0, v[4:5] -; GFX9-NEXT: s_cmp_eq_u32 s0, 0 -; GFX9-NEXT: s_cselect_b32 s4, 1, 0 -; GFX9-NEXT: v_ashrrev_i64 v[8:9], s0, v[6:7] -; GFX9-NEXT: s_and_b32 s0, 1, s3 -; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-NEXT: v_ashrrev_i64 v[0:1], s1, v[6:7] -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_and_b32 s0, 1, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_and_b32 s0, 1, s3 -; GFX9-NEXT: v_ashrrev_i32_e32 v11, 31, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v7 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NEXT: v_xor_b32_e32 v0, v0, v8 ; GFX9-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc -; GFX9-NEXT: v_and_b32_e32 v8, 1, v10 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v8, vcc +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: ssubsat_i128_sv: @@ -5852,48 +5693,25 @@ ; GFX10-NEXT: 
v_sub_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo ; GFX10-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5] -; GFX10-NEXT: s_movk_i32 s0, 0x7f -; GFX10-NEXT: s_sub_i32 s1, 64, s0 -; GFX10-NEXT: v_lshrrev_b64 v[15:16], s0, v[4:5] ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[6:7] -; GFX10-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc_lo ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[8:9], s1, v[6:7] -; GFX10-NEXT: s_sub_i32 s1, s0, 64 -; GFX10-NEXT: s_cmp_lt_u32 s0, 64 -; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[6:7] -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] -; GFX10-NEXT: v_or_b32_e32 v8, v15, v8 -; GFX10-NEXT: v_or_b32_e32 v9, v16, v9 -; GFX10-NEXT: v_ashrrev_i32_e32 v15, 31, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[6:7] +; GFX10-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] -; GFX10-NEXT: v_ashrrev_i64 v[2:3], s1, v[6:7] -; GFX10-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc_lo -; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s0, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo -; GFX10-NEXT: s_and_b32 s0, 1, s1 -; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 -; GFX10-NEXT: v_xor_b32_e32 v9, v11, v10 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v15, v0, s0 -; GFX10-NEXT: v_and_b32_e32 
v8, 1, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v15, v1, s0 -; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc_lo +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v7 +; GFX10-NEXT: v_xor_b32_e32 v0, v3, v8 +; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v1, 0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0x80000000, v1, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v3, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 @@ -5918,7 +5736,7 @@ ; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], 0 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3] -; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v7 +; GFX6-NEXT: v_bfrev_b32_e32 v8, 1 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc @@ -5926,44 +5744,19 @@ ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[2:3], 0 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX6-NEXT: s_movk_i32 s0, 0x7f ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX6-NEXT: s_sub_i32 s2, 64, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, 64 -; GFX6-NEXT: v_xor_b32_e32 v10, v1, v0 -; GFX6-NEXT: s_cmp_lt_u32 s0, 64 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], s0 -; GFX6-NEXT: v_lshl_b64 v[2:3], v[6:7], s2 -; GFX6-NEXT: s_cselect_b32 s3, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s0, 0 -; GFX6-NEXT: s_cselect_b32 s4, 1, 0 -; GFX6-NEXT: v_ashr_i64 v[8:9], v[6:7], s0 -; GFX6-NEXT: s_and_b32 s0, 1, s3 -; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 
-; GFX6-NEXT: v_ashr_i64 v[0:1], v[6:7], s1 -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX6-NEXT: s_and_b32 s0, 1, s4 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX6-NEXT: s_and_b32 s0, 1, s3 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v11, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v11, v9, vcc -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc -; GFX6-NEXT: v_and_b32_e32 v8, 1, v10 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v7 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0, v1 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc +; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: ssubsat_i128_vs: @@ -5986,48 +5779,23 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 -; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v7 +; GFX8-NEXT: v_bfrev_b32_e32 v8, 1 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s4 ; GFX8-NEXT: 
v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_movk_i32 s0, 0x7f ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: s_sub_i32 s2, 64, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, 64 -; GFX8-NEXT: v_xor_b32_e32 v10, v1, v0 -; GFX8-NEXT: s_cmp_lt_u32 s0, 64 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], s0, v[4:5] -; GFX8-NEXT: v_lshlrev_b64 v[2:3], s2, v[6:7] -; GFX8-NEXT: s_cselect_b32 s3, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s0, 0 -; GFX8-NEXT: s_cselect_b32 s4, 1, 0 -; GFX8-NEXT: v_ashrrev_i64 v[8:9], s0, v[6:7] -; GFX8-NEXT: s_and_b32 s0, 1, s3 -; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX8-NEXT: v_ashrrev_i64 v[0:1], s1, v[6:7] -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_and_b32 s0, 1, s4 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_and_b32 s0, 1, s3 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v11, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v9, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc -; GFX8-NEXT: v_and_b32_e32 v8, 1, v10 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v7 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0, v1 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; 
GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: ssubsat_i128_vs: @@ -6050,111 +5818,63 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 -; GFX9-NEXT: v_ashrrev_i32_e32 v11, 31, v7 +; GFX9-NEXT: v_bfrev_b32_e32 v8, 1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s4 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_movk_i32 s0, 0x7f ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: s_sub_i32 s2, 64, s0 -; GFX9-NEXT: s_sub_i32 s1, s0, 64 -; GFX9-NEXT: v_xor_b32_e32 v10, v1, v0 -; GFX9-NEXT: s_cmp_lt_u32 s0, 64 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], s0, v[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[2:3], s2, v[6:7] -; GFX9-NEXT: s_cselect_b32 s3, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s0, 0 -; GFX9-NEXT: s_cselect_b32 s4, 1, 0 -; GFX9-NEXT: v_ashrrev_i64 v[8:9], s0, v[6:7] -; GFX9-NEXT: s_and_b32 s0, 1, s3 -; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-NEXT: v_ashrrev_i64 v[0:1], s1, v[6:7] -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_and_b32 s0, 1, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_and_b32 s0, 1, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc -; 
GFX9-NEXT: v_and_b32_e32 v8, 1, v10 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v7 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v8, vcc +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: ssubsat_i128_vs: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v10, v3 +; GFX10-NEXT: v_mov_b32_e32 v13, v2 +; GFX10-NEXT: v_mov_b32_e32 v14, v3 ; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0 -; GFX10-NEXT: v_sub_co_u32_e64 v15, vcc_lo, v5, s0 +; GFX10-NEXT: v_sub_co_u32_e64 v11, vcc_lo, v5, s0 ; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], 0 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v16, vcc_lo, s1, v6, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v12, vcc_lo, s1, v6, vcc_lo ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v19, vcc_lo, s2, v9, vcc_lo -; GFX10-NEXT: s_and_b32 s1, 1, s4 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v20, vcc_lo, s3, v10, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[15:16], v[5:6] +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v15, vcc_lo, s2, v13, vcc_lo +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v16, vcc_lo, s3, v14, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[11:12], v[5:6] ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 ; 
GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], 0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v20 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[19:20], v[9:10] -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[19:20], v[9:10] +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[15:16], v[13:14] ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 -; GFX10-NEXT: s_movk_i32 s0, 0x7f -; GFX10-NEXT: s_sub_i32 s2, 64, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v1, v0, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], s0, v[15:16] -; GFX10-NEXT: v_lshlrev_b64 v[2:3], s2, v[19:20] -; GFX10-NEXT: s_sub_i32 s1, s0, 64 -; GFX10-NEXT: s_cmp_lt_u32 s0, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v9, v8, vcc_lo -; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 -; GFX10-NEXT: v_ashrrev_i64 v[8:9], s1, v[19:20] -; GFX10-NEXT: s_cmp_eq_u32 s0, 0 -; GFX10-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX10-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[19:20] -; GFX10-NEXT: s_and_b32 s0, 1, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo -; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo +; GFX10-NEXT: s_and_b32 s0, 1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[15:16], v[13:14] +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 -; GFX10-NEXT: v_xor_b32_e32 v9, v11, v10 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v15, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v16, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v0, s0 -; GFX10-NEXT: v_and_b32_e32 v8, 1, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v7, v1, s0 -; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; 
GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v16 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v1, 0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v15, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v16, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v19, v8, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v20, v9, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v12, v3, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v15, v8, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v16, v9, s0 ; GFX10-NEXT: ; return to shader part epilog %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs) %cast = bitcast i128 %result to <4 x float> @@ -6170,103 +5890,58 @@ ; GFX6-NEXT: v_subb_u32_e32 v18, vcc, v2, v10, vcc ; GFX6-NEXT: v_subb_u32_e32 v19, vcc, v3, v11, vcc ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[0:1] -; GFX6-NEXT: s_movk_i32 s6, 0x7f +; GFX6-NEXT: v_bfrev_b32_e32 v20, 1 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[18:19], v[2:3] -; GFX6-NEXT: s_sub_i32 s7, s6, 64 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[18:19], v[2:3] -; GFX6-NEXT: s_sub_i32 s8, 64, s6 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[8:9] -; GFX6-NEXT: s_cmp_lt_u32 s6, 64 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[10:11] -; GFX6-NEXT: s_cselect_b32 s4, 1, 0 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; GFX6-NEXT: s_cmp_eq_u32 s6, 0 ; 
GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX6-NEXT: v_xor_b32_e32 v10, v1, v0 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[16:17], s6 -; GFX6-NEXT: v_lshl_b64 v[2:3], v[18:19], s8 -; GFX6-NEXT: s_cselect_b32 s5, 1, 0 -; GFX6-NEXT: s_and_b32 s4, 1, s4 -; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_ashr_i64 v[0:1], v[18:19], s7 -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX6-NEXT: s_and_b32 s4, 1, s5 -; GFX6-NEXT: v_ashr_i64 v[8:9], v[18:19], s6 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v19 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v16, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v11, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v11, v9, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5] -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_bfrev_b32_e32 v20, 1 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v20, vcc -; GFX6-NEXT: v_and_b32_e32 v8, 1, v10 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v19 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0, v1 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc +; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v1, v20, vcc +; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v18, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v19, v9, vcc ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v4, v12 ; GFX6-NEXT: 
v_subb_u32_e32 v9, vcc, v5, v13, vcc ; GFX6-NEXT: v_subb_u32_e32 v10, vcc, v6, v14, vcc ; GFX6-NEXT: v_subb_u32_e32 v11, vcc, v7, v15, vcc ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GFX6-NEXT: s_cmp_lt_u32 s6, 64 ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] -; GFX6-NEXT: s_cselect_b32 s4, 1, 0 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] -; GFX6-NEXT: s_cmp_eq_u32 s6, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[12:13] -; GFX6-NEXT: s_cselect_b32 s5, 1, 0 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[14:15] -; GFX6-NEXT: v_ashr_i64 v[12:13], v[10:11], s6 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] -; GFX6-NEXT: s_and_b32 s5, 1, s5 ; GFX6-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX6-NEXT: v_xor_b32_e32 v14, v5, v4 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[8:9], s6 -; GFX6-NEXT: v_lshl_b64 v[6:7], v[10:11], s8 -; GFX6-NEXT: s_and_b32 s6, 1, s4 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v6 -; GFX6-NEXT: v_or_b32_e32 v7, v5, v7 -; GFX6-NEXT: v_ashr_i64 v[4:5], v[10:11], s7 -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 -; GFX6-NEXT: s_and_b32 s4, 1, s4 -; GFX6-NEXT: v_ashrrev_i32_e32 v15, 31, v11 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX6-NEXT: v_cndmask_b32_e32 v6, v15, v12, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, v15, v13, vcc -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0, v4 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v7, v20, vcc -; GFX6-NEXT: v_and_b32_e32 v12, 1, v14 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; 
GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc +; GFX6-NEXT: v_xor_b32_e32 v4, v5, v4 +; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v11 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, 0, v5 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v12, vcc, 0, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v13, vcc, v5, v20, vcc +; GFX6-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v7, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, v11, v13, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v2i128: @@ -6277,103 +5952,58 @@ ; GFX8-NEXT: v_subb_u32_e32 v18, vcc, v2, v10, vcc ; GFX8-NEXT: v_subb_u32_e32 v19, vcc, v3, v11, vcc ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[0:1] -; GFX8-NEXT: s_movk_i32 s6, 0x7f +; GFX8-NEXT: v_bfrev_b32_e32 v20, 1 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[18:19], v[2:3] -; GFX8-NEXT: s_sub_i32 s7, s6, 64 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[18:19], v[2:3] -; GFX8-NEXT: s_sub_i32 s8, 64, s6 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[8:9] -; GFX8-NEXT: s_cmp_lt_u32 s6, 64 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[10:11] -; GFX8-NEXT: s_cselect_b32 s4, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; GFX8-NEXT: s_cmp_eq_u32 s6, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v10, v1, v0 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], s6, v[16:17] -; GFX8-NEXT: v_lshlrev_b64 v[2:3], s8, v[18:19] -; GFX8-NEXT: s_cselect_b32 s5, 1, 0 -; GFX8-NEXT: s_and_b32 s4, 1, s4 -; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 
-; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX8-NEXT: v_ashrrev_i64 v[0:1], s7, v[18:19] -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX8-NEXT: s_and_b32 s4, 1, s5 -; GFX8-NEXT: v_ashrrev_i64 v[8:9], s6, v[18:19] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v19 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v16, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v11, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_bfrev_b32_e32 v20, 1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v20, vcc -; GFX8-NEXT: v_and_b32_e32 v8, 1, v10 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v19 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0, v1 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v1, v20, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v9, vcc ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v4, v12 ; GFX8-NEXT: v_subb_u32_e32 v9, vcc, v5, v13, vcc ; GFX8-NEXT: v_subb_u32_e32 v10, vcc, v6, v14, vcc ; GFX8-NEXT: v_subb_u32_e32 v11, vcc, v7, v15, vcc ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GFX8-NEXT: s_cmp_lt_u32 s6, 64 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: 
v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] -; GFX8-NEXT: s_cselect_b32 s4, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] -; GFX8-NEXT: s_cmp_eq_u32 s6, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[12:13] -; GFX8-NEXT: s_cselect_b32 s5, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[14:15] -; GFX8-NEXT: v_ashrrev_i64 v[12:13], s6, v[10:11] ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] -; GFX8-NEXT: s_and_b32 s5, 1, s5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX8-NEXT: v_xor_b32_e32 v14, v5, v4 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], s6, v[8:9] -; GFX8-NEXT: v_lshlrev_b64 v[6:7], s8, v[10:11] -; GFX8-NEXT: s_and_b32 s6, 1, s4 -; GFX8-NEXT: v_or_b32_e32 v6, v4, v6 -; GFX8-NEXT: v_or_b32_e32 v7, v5, v7 -; GFX8-NEXT: v_ashrrev_i64 v[4:5], s7, v[10:11] -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 -; GFX8-NEXT: s_and_b32 s4, 1, s4 -; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v15, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v13, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0, v4 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v20, vcc -; GFX8-NEXT: v_and_b32_e32 v12, 1, v14 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc +; GFX8-NEXT: v_xor_b32_e32 v4, v5, v4 +; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v11 +; 
GFX8-NEXT: v_add_u32_e32 v6, vcc, 0, v5 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v5, vcc +; GFX8-NEXT: v_addc_u32_e32 v13, vcc, v5, v20, vcc +; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v13, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v2i128: @@ -6384,103 +6014,58 @@ ; GFX9-NEXT: v_subb_co_u32_e32 v18, vcc, v2, v10, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v19, vcc, v3, v11, vcc ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[0:1] -; GFX9-NEXT: s_movk_i32 s6, 0x7f +; GFX9-NEXT: v_bfrev_b32_e32 v20, 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[18:19], v[2:3] -; GFX9-NEXT: s_sub_i32 s7, s6, 64 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[18:19], v[2:3] -; GFX9-NEXT: s_sub_i32 s8, 64, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[8:9] -; GFX9-NEXT: s_cmp_lt_u32 s6, 64 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[10:11] -; GFX9-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; GFX9-NEXT: s_cmp_eq_u32 s6, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_xor_b32_e32 v10, v1, v0 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], s6, v[16:17] -; GFX9-NEXT: v_lshlrev_b64 v[2:3], s8, v[18:19] -; GFX9-NEXT: s_cselect_b32 s5, 1, 0 -; GFX9-NEXT: s_and_b32 s4, 1, s4 -; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-NEXT: v_ashrrev_i64 v[0:1], s7, v[18:19] -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX9-NEXT: s_and_b32 s4, 1, s5 -; GFX9-NEXT: v_ashrrev_i64 v[8:9], s6, v[18:19] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; 
GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_ashrrev_i32_e32 v11, 31, v19 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v16, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5] -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_bfrev_b32_e32 v20, 1 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v20, vcc -; GFX9-NEXT: v_and_b32_e32 v8, 1, v10 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v19 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v1, v20, vcc +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v9, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v4, v12 ; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v5, v13, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v6, v14, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, v7, v15, vcc ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GFX9-NEXT: s_cmp_lt_u32 s6, 64 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] -; GFX9-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] -; GFX9-NEXT: s_cmp_eq_u32 s6, 0 ; GFX9-NEXT: 
v_cndmask_b32_e32 v4, v5, v4, vcc ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[12:13] -; GFX9-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[14:15] -; GFX9-NEXT: v_ashrrev_i64 v[12:13], s6, v[10:11] ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] -; GFX9-NEXT: s_and_b32 s5, 1, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX9-NEXT: v_xor_b32_e32 v14, v5, v4 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], s6, v[8:9] -; GFX9-NEXT: v_lshlrev_b64 v[6:7], s8, v[10:11] -; GFX9-NEXT: s_and_b32 s6, 1, s4 -; GFX9-NEXT: v_or_b32_e32 v6, v4, v6 -; GFX9-NEXT: v_or_b32_e32 v7, v5, v7 -; GFX9-NEXT: v_ashrrev_i64 v[4:5], s7, v[10:11] -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 -; GFX9-NEXT: s_and_b32 s4, 1, s4 -; GFX9-NEXT: v_ashrrev_i32_e32 v15, 31, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v13, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v20, vcc -; GFX9-NEXT: v_and_b32_e32 v12, 1, v14 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc +; GFX9-NEXT: v_xor_b32_e32 v4, v5, v4 +; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v11 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 0, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v5, v20, vcc +; 
GFX9-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v13, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ssubsat_v2i128: @@ -6491,110 +6076,67 @@ ; GFX10-NEXT: v_mov_b32_e32 v23, v1 ; GFX10-NEXT: v_mov_b32_e32 v20, v2 ; GFX10-NEXT: v_mov_b32_e32 v21, v3 -; GFX10-NEXT: s_movk_i32 s5, 0x7f +; GFX10-NEXT: v_mov_b32_e32 v29, v4 ; GFX10-NEXT: v_sub_co_u32_e64 v16, vcc_lo, v22, v8 -; GFX10-NEXT: s_sub_i32 s6, 64, s5 +; GFX10-NEXT: v_mov_b32_e32 v30, v5 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v17, vcc_lo, v23, v9, vcc_lo -; GFX10-NEXT: s_sub_i32 s7, s5, 64 +; GFX10-NEXT: v_mov_b32_e32 v24, v6 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v18, vcc_lo, v20, v10, vcc_lo -; GFX10-NEXT: s_cmp_lt_u32 s5, 64 +; GFX10-NEXT: v_mov_b32_e32 v25, v7 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v19, vcc_lo, v21, v11, vcc_lo ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[22:23] -; GFX10-NEXT: v_mov_b32_e32 v26, v4 -; GFX10-NEXT: v_mov_b32_e32 v27, v5 -; GFX10-NEXT: v_mov_b32_e32 v24, v6 -; GFX10-NEXT: v_lshlrev_b64 v[2:3], s6, v[18:19] -; GFX10-NEXT: v_mov_b32_e32 v25, v7 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[20:21] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[20:21] -; GFX10-NEXT: v_cndmask_b32_e32 v20, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[8:9] -; GFX10-NEXT: v_lshrrev_b64 v[0:1], s5, v[16:17] ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[10:11] -; GFX10-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX10-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX10-NEXT: v_ashrrev_i64 v[0:1], s5, v[18:19] ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo +; 
GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[20:21] +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] -; GFX10-NEXT: v_ashrrev_i32_e32 v11, 31, v19 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc_lo -; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 -; GFX10-NEXT: v_ashrrev_i64 v[8:9], s7, v[18:19] -; GFX10-NEXT: s_cmp_eq_u32 s5, 0 -; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: s_and_b32 s8, 1, vcc_lo -; GFX10-NEXT: s_and_b32 s4, 1, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 -; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s8 -; GFX10-NEXT: v_xor_b32_e32 v9, v10, v20 -; GFX10-NEXT: s_cmp_lt_u32 s5, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v11, v1, s4 -; GFX10-NEXT: v_and_b32_e32 v8, 1, v9 -; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, 0, v0, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v21, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_sub_co_u32_e64 v8, s4, v26, v12 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v9, s4, v27, v13, s4 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v10, s4, v24, v14, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc_lo -; GFX10-NEXT: v_sub_co_ci_u32_e64 v11, s4, v25, v15, s4 -; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[8:9], v[26:27] -; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v20, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[3:4], s5, v[8:9] -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo +; GFX10-NEXT: v_sub_co_u32_e64 v31, vcc_lo, v29, v12 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v32, vcc_lo, v30, v13, vcc_lo +; 
GFX10-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, v24, v14, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v25, v15, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[31:32], v[29:30] +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v19 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_cmp_eq_u64_e64 s5, v[10:11], v[24:25] +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[10:11], v[24:25] -; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4 +; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v1, 0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 ; GFX10-NEXT: v_cmp_lt_u64_e64 s4, 0, v[12:13] -; GFX10-NEXT: v_lshlrev_b64 v[12:13], s6, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s4 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[14:15] -; GFX10-NEXT: v_or_b32_e32 v12, v3, v12 -; GFX10-NEXT: v_or_b32_e32 v13, v4, v13 -; GFX10-NEXT: v_ashrrev_i64 v[3:4], s5, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s4 -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, v[10:11], v[24:25] -; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v5, s4 -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, 0, v[14:15] -; GFX10-NEXT: v_ashrrev_i64 v[5:6], s7, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e64 v14, v18, v17, s4 -; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s5, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v12, s4 -; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4 -; GFX10-NEXT: s_and_b32 s5, 1, s6 -; GFX10-NEXT: s_and_b32 s6, 1, s4 -; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5 -; GFX10-NEXT: v_xor_b32_e32 v7, v14, v7 -; GFX10-NEXT: v_ashrrev_i32_e32 v18, 31, v11 -; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v9, s4 -; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v18, v3, s5 -; GFX10-NEXT: 
v_cndmask_b32_e64 v4, v18, v4, s5 -; GFX10-NEXT: v_add_co_u32_e64 v5, s4, v5, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s4, 0, v6, s4 -; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v7 -; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, 0, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v21, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v12, s4, 0x80000000, v4, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v6, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v7, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v11, v12, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s4 +; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v4, s5 +; GFX10-NEXT: v_cmp_eq_u64_e64 s5, 0, v[14:15] +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0x80000000, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v17, v3, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v13, v12, s5 +; GFX10-NEXT: v_xor_b32_e32 v4, v7, v0 +; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v16, v2, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v5, s4 +; GFX10-NEXT: v_and_b32_e32 v3, 1, v4 +; GFX10-NEXT: v_add_co_u32_e64 v4, vcc_lo, v7, 0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v7, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v3 +; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, 0x80000000, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v31, v4, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v32, v5, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v12, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v11, v14, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) ret <2 x i128> %result @@ -6620,74 +6162,53 @@ ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_subb_u32 s19, s3, s11 -; 
GFX6-NEXT: s_movk_i32 s20, 0x7f ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[0:1] ; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[8:9], 0 -; GFX6-NEXT: s_sub_i32 s21, s20, 64 -; GFX6-NEXT: s_sub_i32 s22, 64, s20 -; GFX6-NEXT: s_cmp_lt_u32 s20, 64 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[18:19], v[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX6-NEXT: s_cselect_b32 s23, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s20, 0 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0 -; GFX6-NEXT: s_cselect_b32 s24, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 -; GFX6-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 +; GFX6-NEXT: s_ashr_i32 s3, s19, 31 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX6-NEXT: s_add_u32 s0, s3, 0 +; GFX6-NEXT: s_cselect_b32 s1, 1, 0 +; GFX6-NEXT: s_and_b32 s1, s1, 1 +; GFX6-NEXT: s_cmp_lg_u32 s1, 0 +; GFX6-NEXT: s_addc_u32 s1, s3, 0 +; GFX6-NEXT: s_cselect_b32 s2, 1, 0 +; GFX6-NEXT: s_and_b32 s2, s2, 1 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: s_addc_u32 s2, s3, 0 +; GFX6-NEXT: s_cselect_b32 s9, 1, 0 +; GFX6-NEXT: s_and_b32 s9, s9, 1 ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[10:11], 0 -; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX6-NEXT: s_ashr_i32 s8, s19, 31 -; GFX6-NEXT: s_ashr_i64 s[0:1], s[18:19], s20 -; GFX6-NEXT: s_ashr_i64 s[10:11], s[18:19], s21 -; GFX6-NEXT: s_and_b32 s23, s23, 1 -; GFX6-NEXT: s_cmp_lg_u32 s23, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX6-NEXT: s_and_b32 s10, s24, 1 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] -; GFX6-NEXT: s_cmp_lg_u32 s23, 0 -; GFX6-NEXT: s_mov_b32 s9, s8 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] -; GFX6-NEXT: s_add_u32 s2, s2, 0 -; GFX6-NEXT: s_cselect_b32 s8, 1, 0 -; GFX6-NEXT: s_and_b32 s8, s8, 1 -; GFX6-NEXT: s_cmp_lg_u32 s8, 0 -; GFX6-NEXT: s_addc_u32 s3, s3, 0 -; 
GFX6-NEXT: s_cselect_b32 s8, 1, 0 -; GFX6-NEXT: s_and_b32 s8, s8, 1 -; GFX6-NEXT: s_cmp_lg_u32 s8, 0 -; GFX6-NEXT: s_addc_u32 s0, s0, 0 -; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: s_brev_b32 s8, 1 +; GFX6-NEXT: s_cmp_lg_u32 s9, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX6-NEXT: s_and_b32 s8, s8, 1 +; GFX6-NEXT: s_addc_u32 s3, s3, s8 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_brev_b32 s23, 1 -; GFX6-NEXT: s_cmp_lg_u32 s8, 0 -; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_addc_u32 s1, s1, s23 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v3, s16 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: s_sub_u32 s0, s4, s12 -; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: s_cselect_b32 s1, 1, 0 ; GFX6-NEXT: s_and_b32 s1, s1, 1 ; GFX6-NEXT: s_cmp_lg_u32 s1, 0 +; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_subb_u32 s1, s5, s13 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_cselect_b32 s2, 1, 0 ; GFX6-NEXT: s_and_b32 s2, s2, 1 ; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: v_mov_b32_e32 v2, s3 ; GFX6-NEXT: v_mov_b32_e32 v4, s17 +; GFX6-NEXT: v_mov_b32_e32 v3, s16 +; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_mov_b32_e32 v3, s19 ; GFX6-NEXT: s_subb_u32 s2, s6, s14 ; GFX6-NEXT: s_cselect_b32 s3, 1, 0 @@ -6704,59 +6225,41 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] ; GFX6-NEXT: v_cmp_gt_u64_e64 s[4:5], s[12:13], 0 -; GFX6-NEXT: s_cmp_lt_u32 s20, 64 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: 
s_cmp_eq_u32 s20, 0 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], s[14:15], 0 -; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s20 -; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s22 +; GFX6-NEXT: s_ashr_i32 s7, s3, 31 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] -; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX6-NEXT: s_ashr_i32 s8, s3, 31 -; GFX6-NEXT: s_ashr_i64 s[4:5], s[2:3], s20 -; GFX6-NEXT: s_ashr_i64 s[10:11], s[2:3], s21 -; GFX6-NEXT: s_and_b32 s12, s12, 1 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX6-NEXT: s_and_b32 s10, s13, 1 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 -; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] -; GFX6-NEXT: s_mov_b32 s9, s8 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] -; GFX6-NEXT: s_add_u32 s6, s6, 0 -; GFX6-NEXT: s_cselect_b32 s8, 1, 0 -; GFX6-NEXT: s_and_b32 s8, s8, 1 -; GFX6-NEXT: s_cmp_lg_u32 s8, 0 -; GFX6-NEXT: s_addc_u32 s7, s7, 0 -; GFX6-NEXT: s_cselect_b32 s8, 1, 0 -; GFX6-NEXT: s_and_b32 s8, s8, 1 -; GFX6-NEXT: s_cmp_lg_u32 s8, 0 +; GFX6-NEXT: s_add_u32 s4, s7, 0 +; GFX6-NEXT: s_cselect_b32 s5, 1, 0 +; GFX6-NEXT: s_and_b32 s5, s5, 1 +; GFX6-NEXT: s_cmp_lg_u32 s5, 0 +; GFX6-NEXT: s_addc_u32 s5, s7, 0 +; GFX6-NEXT: s_cselect_b32 s6, 1, 0 +; GFX6-NEXT: s_and_b32 s6, s6, 1 +; GFX6-NEXT: s_cmp_lg_u32 s6, 0 ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[14:15], 0 -; GFX6-NEXT: s_addc_u32 s4, s4, 0 -; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: s_addc_u32 s6, s7, 0 +; GFX6-NEXT: s_cselect_b32 s9, 1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_and_b32 s8, s8, 1 -; GFX6-NEXT: s_cmp_lg_u32 s8, 0 +; GFX6-NEXT: s_and_b32 s9, s9, 1 +; GFX6-NEXT: s_cmp_lg_u32 s9, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, s0 ; GFX6-NEXT: v_mov_b32_e32 v8, s1 -; GFX6-NEXT: s_addc_u32 s5, s5, s23 -; GFX6-NEXT: v_mov_b32_e32 
v1, s6 +; GFX6-NEXT: s_addc_u32 s7, s7, s8 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, s7 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v8, s2 -; GFX6-NEXT: v_mov_b32_e32 v3, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: v_mov_b32_e32 v9, s3 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; GFX6-NEXT: v_readfirstlane_b32 s0, v5 @@ -6801,68 +6304,47 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: s_movk_i32 s20, 0x7f ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 -; GFX8-NEXT: s_sub_i32 s21, s20, 64 -; GFX8-NEXT: s_sub_i32 s22, 64, s20 -; GFX8-NEXT: s_cmp_lt_u32 s20, 64 -; GFX8-NEXT: s_cselect_b32 s23, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s20, 0 -; GFX8-NEXT: s_cselect_b32 s24, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 -; GFX8-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 +; GFX8-NEXT: s_ashr_i32 s3, s19, 31 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX8-NEXT: s_ashr_i32 s8, s19, 31 -; GFX8-NEXT: s_ashr_i64 s[0:1], s[18:19], s20 -; GFX8-NEXT: s_ashr_i64 s[10:11], s[18:19], s21 -; GFX8-NEXT: s_and_b32 s23, s23, 1 -; GFX8-NEXT: s_cmp_lg_u32 s23, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX8-NEXT: s_and_b32 s10, s24, 1 -; GFX8-NEXT: s_cmp_lg_u32 s10, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] -; GFX8-NEXT: s_cmp_lg_u32 s23, 0 -; GFX8-NEXT: s_mov_b32 s9, s8 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] -; GFX8-NEXT: s_add_u32 s2, s2, 0 -; GFX8-NEXT: s_cselect_b32 s8, 1, 0 -; GFX8-NEXT: s_and_b32 s8, s8, 1 -; GFX8-NEXT: 
s_cmp_lg_u32 s8, 0 -; GFX8-NEXT: s_addc_u32 s3, s3, 0 -; GFX8-NEXT: s_cselect_b32 s8, 1, 0 -; GFX8-NEXT: s_and_b32 s8, s8, 1 -; GFX8-NEXT: s_cmp_lg_u32 s8, 0 -; GFX8-NEXT: s_addc_u32 s0, s0, 0 -; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: s_add_u32 s0, s3, 0 +; GFX8-NEXT: s_cselect_b32 s1, 1, 0 +; GFX8-NEXT: s_and_b32 s1, s1, 1 +; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_and_b32 s2, s2, 1 +; GFX8-NEXT: s_cmp_lg_u32 s2, 0 +; GFX8-NEXT: s_addc_u32 s2, s3, 0 +; GFX8-NEXT: s_cselect_b32 s9, 1, 0 +; GFX8-NEXT: s_and_b32 s9, s9, 1 +; GFX8-NEXT: s_brev_b32 s8, 1 +; GFX8-NEXT: s_cmp_lg_u32 s9, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: s_and_b32 s8, s8, 1 +; GFX8-NEXT: s_addc_u32 s3, s3, s8 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: s_brev_b32 s23, 1 -; GFX8-NEXT: s_cmp_lg_u32 s8, 0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_addc_u32 s1, s1, s23 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s16 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: s_sub_u32 s0, s4, s12 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: s_cselect_b32 s1, 1, 0 ; GFX8-NEXT: s_and_b32 s1, s1, 1 ; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_subb_u32 s1, s5, s13 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 ; GFX8-NEXT: s_and_b32 s2, s2, 1 ; GFX8-NEXT: s_cmp_lg_u32 s2, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s17 +; GFX8-NEXT: v_mov_b32_e32 v3, s16 ; GFX8-NEXT: s_subb_u32 s2, s6, s14 -; GFX8-NEXT: s_cselect_b32 s3, 1, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_cselect_b32 
s3, 1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: v_mov_b32_e32 v3, s19 ; GFX8-NEXT: s_and_b32 s3, s3, 1 @@ -6890,53 +6372,35 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX8-NEXT: s_and_b32 s4, 1, s6 -; GFX8-NEXT: s_cmp_lt_u32 s20, 64 -; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s20, 0 -; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s20 -; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s22 +; GFX8-NEXT: s_ashr_i32 s7, s3, 31 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX8-NEXT: s_ashr_i32 s8, s3, 31 -; GFX8-NEXT: s_ashr_i64 s[4:5], s[2:3], s20 -; GFX8-NEXT: s_ashr_i64 s[10:11], s[2:3], s21 -; GFX8-NEXT: s_and_b32 s12, s12, 1 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 -; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX8-NEXT: s_and_b32 s10, s13, 1 -; GFX8-NEXT: s_cmp_lg_u32 s10, 0 -; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] -; GFX8-NEXT: s_mov_b32 s9, s8 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 -; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] -; GFX8-NEXT: s_add_u32 s6, s6, 0 -; GFX8-NEXT: s_cselect_b32 s8, 1, 0 -; GFX8-NEXT: s_and_b32 s8, s8, 1 -; GFX8-NEXT: s_cmp_lg_u32 s8, 0 -; GFX8-NEXT: s_addc_u32 s7, s7, 0 -; GFX8-NEXT: s_cselect_b32 s8, 1, 0 -; GFX8-NEXT: s_and_b32 s8, s8, 1 -; GFX8-NEXT: s_cmp_lg_u32 s8, 0 -; GFX8-NEXT: s_addc_u32 s4, s4, 0 -; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: s_add_u32 s4, s7, 0 +; GFX8-NEXT: s_cselect_b32 s5, 1, 0 +; GFX8-NEXT: s_and_b32 s5, s5, 1 +; GFX8-NEXT: s_cmp_lg_u32 s5, 0 +; GFX8-NEXT: s_addc_u32 s5, s7, 0 +; GFX8-NEXT: s_cselect_b32 s6, 1, 0 +; GFX8-NEXT: s_and_b32 s6, s6, 1 +; GFX8-NEXT: s_cmp_lg_u32 s6, 0 +; GFX8-NEXT: s_addc_u32 s6, s7, 0 +; GFX8-NEXT: s_cselect_b32 s9, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: s_and_b32 s8, s8, 1 -; GFX8-NEXT: s_cmp_lg_u32 s8, 0 +; GFX8-NEXT: s_and_b32 s9, s9, 1 +; 
GFX8-NEXT: s_cmp_lg_u32 s9, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: v_mov_b32_e32 v8, s1 -; GFX8-NEXT: s_addc_u32 s5, s5, s23 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: s_addc_u32 s7, s7, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v8, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v9, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; GFX8-NEXT: v_readfirstlane_b32 s0, v5 @@ -6981,68 +6445,47 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: s_movk_i32 s20, 0x7f ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 -; GFX9-NEXT: s_sub_i32 s21, s20, 64 -; GFX9-NEXT: s_sub_i32 s22, 64, s20 -; GFX9-NEXT: s_cmp_lt_u32 s20, 64 -; GFX9-NEXT: s_cselect_b32 s23, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s20, 0 -; GFX9-NEXT: s_cselect_b32 s24, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 +; GFX9-NEXT: s_ashr_i32 s3, s19, 31 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX9-NEXT: s_ashr_i32 s8, s19, 31 -; GFX9-NEXT: s_ashr_i64 s[0:1], s[18:19], s20 -; GFX9-NEXT: s_ashr_i64 s[10:11], s[18:19], s21 -; GFX9-NEXT: s_and_b32 s23, s23, 1 -; GFX9-NEXT: s_cmp_lg_u32 s23, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX9-NEXT: s_and_b32 s10, s24, 1 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] -; GFX9-NEXT: s_cmp_lg_u32 s23, 0 -; 
GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] -; GFX9-NEXT: s_add_u32 s2, s2, 0 -; GFX9-NEXT: s_cselect_b32 s8, 1, 0 -; GFX9-NEXT: s_and_b32 s8, s8, 1 -; GFX9-NEXT: s_cmp_lg_u32 s8, 0 -; GFX9-NEXT: s_addc_u32 s3, s3, 0 -; GFX9-NEXT: s_cselect_b32 s8, 1, 0 -; GFX9-NEXT: s_and_b32 s8, s8, 1 -; GFX9-NEXT: s_cmp_lg_u32 s8, 0 -; GFX9-NEXT: s_addc_u32 s0, s0, 0 -; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: s_add_u32 s0, s3, 0 +; GFX9-NEXT: s_cselect_b32 s1, 1, 0 +; GFX9-NEXT: s_and_b32 s1, s1, 1 +; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: s_addc_u32 s1, s3, 0 +; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_and_b32 s2, s2, 1 +; GFX9-NEXT: s_cmp_lg_u32 s2, 0 +; GFX9-NEXT: s_addc_u32 s2, s3, 0 +; GFX9-NEXT: s_cselect_b32 s9, 1, 0 +; GFX9-NEXT: s_and_b32 s9, s9, 1 +; GFX9-NEXT: s_brev_b32 s8, 1 +; GFX9-NEXT: s_cmp_lg_u32 s9, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: s_and_b32 s8, s8, 1 +; GFX9-NEXT: s_addc_u32 s3, s3, s8 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: s_brev_b32 s23, 1 -; GFX9-NEXT: s_cmp_lg_u32 s8, 0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_addc_u32 s1, s1, s23 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_sub_u32 s0, s4, s12 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: s_cselect_b32 s1, 1, 0 ; GFX9-NEXT: s_and_b32 s1, s1, 1 ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_subb_u32 s1, s5, s13 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 ; GFX9-NEXT: s_and_b32 s2, s2, 1 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s16 ; GFX9-NEXT: s_subb_u32 s2, s6, 
s14 -; GFX9-NEXT: s_cselect_b32 s3, 1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_cselect_b32 s3, 1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: s_and_b32 s3, s3, 1 @@ -7070,53 +6513,35 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX9-NEXT: s_and_b32 s4, 1, s6 -; GFX9-NEXT: s_cmp_lt_u32 s20, 64 -; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s20, 0 -; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[0:1], s20 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s22 +; GFX9-NEXT: s_ashr_i32 s7, s3, 31 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX9-NEXT: s_ashr_i32 s8, s3, 31 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[2:3], s20 -; GFX9-NEXT: s_ashr_i64 s[10:11], s[2:3], s21 -; GFX9-NEXT: s_and_b32 s12, s12, 1 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 -; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX9-NEXT: s_and_b32 s10, s13, 1 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 -; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 -; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] -; GFX9-NEXT: s_add_u32 s6, s6, 0 -; GFX9-NEXT: s_cselect_b32 s8, 1, 0 -; GFX9-NEXT: s_and_b32 s8, s8, 1 -; GFX9-NEXT: s_cmp_lg_u32 s8, 0 -; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: s_cselect_b32 s8, 1, 0 -; GFX9-NEXT: s_and_b32 s8, s8, 1 -; GFX9-NEXT: s_cmp_lg_u32 s8, 0 -; GFX9-NEXT: s_addc_u32 s4, s4, 0 -; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: s_add_u32 s4, s7, 0 +; GFX9-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-NEXT: s_and_b32 s5, s5, 1 +; GFX9-NEXT: s_cmp_lg_u32 s5, 0 +; GFX9-NEXT: s_addc_u32 s5, s7, 0 +; GFX9-NEXT: s_cselect_b32 s6, 1, 0 +; GFX9-NEXT: s_and_b32 s6, s6, 1 +; GFX9-NEXT: s_cmp_lg_u32 s6, 0 +; GFX9-NEXT: s_addc_u32 s6, s7, 0 +; GFX9-NEXT: s_cselect_b32 
s9, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: s_and_b32 s8, s8, 1 -; GFX9-NEXT: s_cmp_lg_u32 s8, 0 +; GFX9-NEXT: s_and_b32 s9, s9, 1 +; GFX9-NEXT: s_cmp_lg_u32 s9, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_mov_b32_e32 v8, s1 -; GFX9-NEXT: s_addc_u32 s5, s5, s23 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_addc_u32 s7, s7, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v9, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; GFX9-NEXT: v_readfirstlane_b32 s0, v5 @@ -7131,17 +6556,18 @@ ; ; GFX10-LABEL: s_ssubsat_v2i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_sub_u32 s28, s0, s8 +; GFX10-NEXT: s_sub_u32 s16, s0, s8 ; GFX10-NEXT: s_cselect_b32 s17, 1, 0 ; GFX10-NEXT: s_mov_b32 s46, s0 ; GFX10-NEXT: s_and_b32 s17, s17, 1 ; GFX10-NEXT: s_mov_b32 s47, s1 ; GFX10-NEXT: s_cmp_lg_u32 s17, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_subb_u32 s29, s1, s9 +; GFX10-NEXT: s_subb_u32 s17, s1, s9 ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[28:29], s[46:47] +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[46:47] ; GFX10-NEXT: s_and_b32 s18, s18, 1 +; GFX10-NEXT: v_cmp_gt_u64_e64 s1, s[8:9], 0 ; GFX10-NEXT: s_cmp_lg_u32 s18, 0 ; GFX10-NEXT: s_subb_u32 s30, s2, s10 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 @@ -7151,144 +6577,104 @@ ; GFX10-NEXT: s_subb_u32 s31, s3, s11 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[30:31], s[2:3] ; GFX10-NEXT: s_cmp_eq_u64 s[30:31], s[2:3] -; GFX10-NEXT: v_cmp_gt_u64_e64 s2, 
s[8:9], 0 +; GFX10-NEXT: v_mov_b32_e32 v3, s31 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s0, 1, s20 ; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 -; GFX10-NEXT: s_movk_i32 s20, 0x7f -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 -; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: s_and_b32 s1, 1, s1 -; GFX10-NEXT: s_sub_i32 s21, s20, 64 -; GFX10-NEXT: s_sub_i32 s22, 64, s20 -; GFX10-NEXT: s_cmp_lt_u32 s20, 64 +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10-NEXT: s_ashr_i32 s3, s31, 31 +; GFX10-NEXT: s_and_b32 s0, 1, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: s_cselect_b32 s10, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s20, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 -; GFX10-NEXT: s_cselect_b32 s23, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[28:29], s20 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[30:31], s22 -; GFX10-NEXT: s_and_b32 s24, s10, 1 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: s_ashr_i32 s2, s31, 31 -; GFX10-NEXT: s_ashr_i64 s[8:9], s[30:31], s20 -; GFX10-NEXT: s_ashr_i64 s[10:11], s[30:31], s21 -; GFX10-NEXT: s_cmp_lg_u32 s24, 0 -; GFX10-NEXT: s_mov_b32 s3, s2 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] -; GFX10-NEXT: s_and_b32 s10, s23, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s29 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[28:29], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s24, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 +; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[10:11], 0 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: s_add_u32 s0, s3, 0 +; GFX10-NEXT: s_brev_b32 s10, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 +; GFX10-NEXT: s_cselect_b32 s1, 1, 0 +; GFX10-NEXT: s_and_b32 s1, s1, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; 
GFX10-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s17 +; GFX10-NEXT: s_addc_u32 s1, s3, 0 +; GFX10-NEXT: s_cselect_b32 s2, 1, 0 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] -; GFX10-NEXT: s_add_u32 s0, s0, 0 -; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s28 -; GFX10-NEXT: s_and_b32 s8, s8, 1 +; GFX10-NEXT: s_and_b32 s2, s2, 1 +; GFX10-NEXT: v_mov_b32_e32 v1, s16 +; GFX10-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: s_cmp_lg_u32 s8, 0 -; GFX10-NEXT: s_brev_b32 s23, 1 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_addc_u32 s2, s3, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_mov_b32_e32 v3, s31 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 -; GFX10-NEXT: s_addc_u32 s2, s2, 0 -; GFX10-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10-NEXT: s_addc_u32 s3, s3, s10 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo -; GFX10-NEXT: s_and_b32 s8, s8, 1 +; GFX10-NEXT: s_sub_u32 s0, s4, s12 +; GFX10-NEXT: s_cselect_b32 s8, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s8, 0 +; GFX10-NEXT: s_and_b32 s8, s8, 1 ; GFX10-NEXT: v_mov_b32_e32 v2, s30 -; GFX10-NEXT: s_addc_u32 s3, s3, s23 -; GFX10-NEXT: s_sub_u32 s0, s4, s12 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 +; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo -; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_subb_u32 s1, s5, s13 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s3, s[0:1], s[4:5] +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] ; GFX10-NEXT: s_and_b32 s8, s8, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 +; GFX10-NEXT: v_cmp_gt_u64_e64 s3, s[12:13], 0 ; GFX10-NEXT: 
s_subb_u32 s8, s6, s14 ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 ; GFX10-NEXT: s_and_b32 s9, s9, 1 ; GFX10-NEXT: v_mov_b32_e32 v7, s8 ; GFX10-NEXT: s_cmp_lg_u32 s9, 0 ; GFX10-NEXT: s_subb_u32 s9, s7, s15 ; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[6:7] -; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[8:9], s[6:7] +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[6:7] ; GFX10-NEXT: s_cselect_b32 s2, 1, 0 ; GFX10-NEXT: v_mov_b32_e32 v8, s9 ; GFX10-NEXT: s_and_b32 s2, 1, s2 ; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 -; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[12:13], 0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3 -; GFX10-NEXT: s_cselect_b32 s3, 1, 0 -; GFX10-NEXT: s_and_b32 s16, 1, s3 -; GFX10-NEXT: s_cmp_lt_u32 s20, 64 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 +; GFX10-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10-NEXT: s_ashr_i32 s5, s9, 31 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s2 -; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[14:15], 0 -; GFX10-NEXT: s_cselect_b32 s10, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s20, 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s16 -; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_lshl_b64 s[4:5], s[8:9], s22 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 -; GFX10-NEXT: s_lshr_b64 s[2:3], s[0:1], s20 -; GFX10-NEXT: s_and_b32 s13, s10, 1 -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX10-NEXT: s_ashr_i32 s4, s9, 31 -; GFX10-NEXT: s_ashr_i64 s[6:7], s[8:9], s20 -; GFX10-NEXT: s_ashr_i64 s[10:11], s[8:9], s21 -; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: s_mov_b32 s5, s4 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX10-NEXT: s_and_b32 s10, s12, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3 +; GFX10-NEXT: v_cmp_gt_i64_e64 s3, s[14:15], 0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s3 +; GFX10-NEXT: s_and_b32 s3, 1, s2 +; GFX10-NEXT: s_add_u32 s2, s5, 0 +; GFX10-NEXT: 
v_cmp_ne_u32_e64 vcc_lo, 0, s3 +; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: s_and_b32 s4, s4, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 +; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: v_mov_b32_e32 v6, s1 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[0:1], s[2:3] -; GFX10-NEXT: s_cmp_lg_u32 s13, 0 +; GFX10-NEXT: s_addc_u32 s3, s5, 0 +; GFX10-NEXT: s_cselect_b32 s4, 1, 0 ; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] -; GFX10-NEXT: s_add_u32 s2, s2, 0 -; GFX10-NEXT: s_cselect_b32 s6, 1, 0 +; GFX10-NEXT: s_and_b32 s4, s4, 1 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: s_and_b32 s6, s6, 1 -; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX10-NEXT: s_cmp_lg_u32 s6, 0 +; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX10-NEXT: s_addc_u32 s4, s5, 0 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX10-NEXT: s_and_b32 s6, s6, 1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: s_addc_u32 s4, s4, 0 -; GFX10-NEXT: s_cselect_b32 s6, 1, 0 +; GFX10-NEXT: s_addc_u32 s1, s5, s10 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo -; GFX10-NEXT: s_and_b32 s6, s6, 1 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s6, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo -; GFX10-NEXT: s_addc_u32 s1, s5, s23 -; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v8, s1, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: v_readfirstlane_b32 s4, v4 ; GFX10-NEXT: v_readfirstlane_b32 s5, v5 diff --git a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp --- 
a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp @@ -3179,4 +3179,47 @@ EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } +// Test narrow scalar of G_SHL with constant shift amount +TEST_F(AArch64GISelMITest, narrowScalarShiftByConstant) { + setUp(); + if (!TM) + return; + + DefineLegalizerInfo(A, {}); + + LLT S64{LLT::scalar(64)}; + LLT S32{LLT::scalar(32)}; + + auto Constant = B.buildConstant(S64, 33); + auto Trunc = B.buildTrunc(S32, Constant); + auto Shift = B.buildShl(S64, Copies[0], Trunc); + + AInfo Info(MF->getSubtarget()); + DummyGISelObserver Observer; + LegalizerHelper Helper(*MF, Info, Observer, B); + + // Perform Legalization + B.setInsertPt(*EntryMBB, Shift->getIterator()); + + // This should detect the G_CONSTANT feeding the G_SHL through a G_TRUNC + EXPECT_EQ(LegalizerHelper::LegalizeResult::Legalized, + Helper.narrowScalarShift(*Shift, 0, S32)); + + const auto *CheckStr = R"( + CHECK: [[COPY0:%[0-9]+]]:_(s64) = COPY + CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY + CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY + CHECK: [[THIRTY3:%[0-9]+]]:_(s64) = G_CONSTANT i64 33 + CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %4:_(s64) + CHECK: [[UNMERGE:%[0-9]+]]:_(s32), [[UNMERGE2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY0]] + CHECK: [[ZERO:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + CHECK: [[ONE:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + CHECK: [[SHIFT:%[0-9]+]]:_(s32) = G_SHL [[UNMERGE]]:_, [[ONE]]:_(s64) + CHECK: [[MERGE:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[ZERO]]:_(s32), [[SHIFT]]:_(s32) + )"; + + // Check + EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; +} + } // namespace