Index: llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp =================================================================== --- llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp +++ llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp @@ -567,6 +567,26 @@ Known = KnownBits::ashr(KnownBits::shl(Known, ShiftKnown), ShiftKnown); break; } + case TargetOpcode::G_UADDO: + case TargetOpcode::G_UADDE: + case TargetOpcode::G_SADDO: + case TargetOpcode::G_SADDE: + case TargetOpcode::G_USUBO: + case TargetOpcode::G_USUBE: + case TargetOpcode::G_SSUBO: + case TargetOpcode::G_SSUBE: + case TargetOpcode::G_UMULO: + case TargetOpcode::G_SMULO: { + if (MI.getOperand(1).getReg() == R) { + // If we know the overflow/carry-out result has the top bits zero, use + // this info. + if (TL.getBooleanContents(DstTy.isVector(), false) == + TargetLowering::ZeroOrOneBooleanContent && + BitWidth > 1) + Known.Zero.setBitsFrom(1); + } + break; + } } assert(!Known.hasConflict() && "Bits known to be one AND zero?"); @@ -673,6 +693,27 @@ MI.getOperand(3).getReg(), DemandedElts, Depth + 1); } + case TargetOpcode::G_SADDO: + case TargetOpcode::G_SADDE: + case TargetOpcode::G_UADDO: + case TargetOpcode::G_UADDE: + case TargetOpcode::G_SSUBO: + case TargetOpcode::G_SSUBE: + case TargetOpcode::G_USUBO: + case TargetOpcode::G_USUBE: + case TargetOpcode::G_SMULO: + case TargetOpcode::G_UMULO: { + // If the overflow/carry-out result is 0/-1, all bits are sign bits. + // We know that we have an integer-based boolean since these operations + // are only available for integers. 
+ if (MI.getOperand(1).getReg() == R) { + if (TL.getBooleanContents(DstTy.isVector(), false) == + TargetLowering::ZeroOrNegativeOneBooleanContent) + return TyBits; + } + + break; + } case TargetOpcode::G_INTRINSIC: case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: default: { Index: llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll @@ -457,7 +457,6 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_add_u32 s0, s0, s1 ; GFX7-NEXT: s_cselect_b32 s1, 1, 0 -; GFX7-NEXT: s_and_b32 s1, s1, 1 ; GFX7-NEXT: s_add_i32 s0, s0, s1 ; GFX7-NEXT: ; return to shader part epilog ; @@ -465,7 +464,6 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s0, s0, s1 ; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_and_b32 s1, s1, 1 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; @@ -473,7 +471,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s1 ; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: s_and_b32 s1, s1, 1 ; GFX9-NEXT: s_add_i32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog %uaddo = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) @@ -488,9 +485,6 @@ ; GFX7-LABEL: s_uaddo_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_add_u32 s0, s0, s2 -; GFX7-NEXT: s_cselect_b32 s4, 1, 0 -; GFX7-NEXT: s_and_b32 s4, s4, 1 -; GFX7-NEXT: s_cmp_lg_u32 s4, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 @@ -506,9 +500,6 @@ ; GFX8-LABEL: s_uaddo_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s0, s0, s2 -; GFX8-NEXT: s_cselect_b32 s4, 1, 0 -; GFX8-NEXT: s_and_b32 s4, s4, 1 -; GFX8-NEXT: s_cmp_lg_u32 s4, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -524,9 +515,6 @@ ; GFX9-LABEL: s_uaddo_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s2 -; GFX9-NEXT: s_cselect_b32 s4, 1, 0 -; GFX9-NEXT: s_and_b32 s4, s4, 1 -; 
GFX9-NEXT: s_cmp_lg_u32 s4, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -553,8 +541,6 @@ ; GFX7-NEXT: s_cselect_b32 s2, 1, 0 ; GFX7-NEXT: s_add_u32 s1, s1, s3 ; GFX7-NEXT: s_cselect_b32 s3, 1, 0 -; GFX7-NEXT: s_and_b32 s2, s2, 1 -; GFX7-NEXT: s_and_b32 s3, s3, 1 ; GFX7-NEXT: s_add_i32 s0, s0, s2 ; GFX7-NEXT: s_add_i32 s1, s1, s3 ; GFX7-NEXT: ; return to shader part epilog @@ -565,8 +551,6 @@ ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 ; GFX8-NEXT: s_add_u32 s1, s1, s3 ; GFX8-NEXT: s_cselect_b32 s3, 1, 0 -; GFX8-NEXT: s_and_b32 s2, s2, 1 -; GFX8-NEXT: s_and_b32 s3, s3, 1 ; GFX8-NEXT: s_add_i32 s0, s0, s2 ; GFX8-NEXT: s_add_i32 s1, s1, s3 ; GFX8-NEXT: ; return to shader part epilog @@ -577,8 +561,6 @@ ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 ; GFX9-NEXT: s_add_u32 s1, s1, s3 ; GFX9-NEXT: s_cselect_b32 s3, 1, 0 -; GFX9-NEXT: s_and_b32 s2, s2, 1 -; GFX9-NEXT: s_and_b32 s3, s3, 1 ; GFX9-NEXT: s_add_i32 s0, s0, s2 ; GFX9-NEXT: s_add_i32 s1, s1, s3 ; GFX9-NEXT: ; return to shader part epilog @@ -728,9 +710,6 @@ ; GFX7-LABEL: s_saddo_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_add_u32 s4, s0, s2 -; GFX7-NEXT: s_cselect_b32 s5, 1, 0 -; GFX7-NEXT: s_and_b32 s5, s5, 1 -; GFX7-NEXT: s_cmp_lg_u32 s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_addc_u32 s5, s1, s3 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -748,9 +727,6 @@ ; GFX8-LABEL: s_saddo_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s4, s0, s2 -; GFX8-NEXT: s_cselect_b32 s5, 1, 0 -; GFX8-NEXT: s_and_b32 s5, s5, 1 -; GFX8-NEXT: s_cmp_lg_u32 s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_addc_u32 s5, s1, s3 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -768,9 +744,6 @@ ; GFX9-LABEL: s_saddo_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s4, s0, s2 -; GFX9-NEXT: s_cselect_b32 s5, 1, 0 -; GFX9-NEXT: s_and_b32 s5, s5, 1 -; GFX9-NEXT: s_cmp_lg_u32 s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 s5, s1, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 Index: 
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll @@ -31,9 +31,6 @@ ; GFX: ; %bb.0: ; GFX-NEXT: s_ashr_i32 s2, s1, 31 ; GFX-NEXT: s_add_u32 s0, s0, s2 -; GFX-NEXT: s_cselect_b32 s4, 1, 0 -; GFX-NEXT: s_and_b32 s4, s4, 1 -; GFX-NEXT: s_cmp_lg_u32 s4, 0 ; GFX-NEXT: s_mov_b32 s3, s2 ; GFX-NEXT: s_addc_u32 s1, s1, s2 ; GFX-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -447,7 +447,6 @@ ; GFX7-NEXT: s_mul_i32 s5, s0, s5 ; GFX7-NEXT: s_add_i32 s0, s2, s7 ; GFX7-NEXT: s_add_i32 s0, s0, s5 -; GFX7-NEXT: s_and_b32 s8, s8, 1 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, s8, v1 @@ -477,7 +476,6 @@ ; GFX8-NEXT: s_mul_i32 s5, s0, s5 ; GFX8-NEXT: s_add_i32 s0, s2, s7 ; GFX8-NEXT: s_add_i32 s0, s0, s5 -; GFX8-NEXT: s_and_b32 s8, s8, 1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s8, v1 @@ -492,13 +490,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mul_i32 s7, s1, s3 ; GFX9-NEXT: s_mul_i32 s8, s0, s4 +; GFX9-NEXT: s_mul_hi_u32 s9, s0, s3 ; GFX9-NEXT: s_add_u32 s7, s7, s8 ; GFX9-NEXT: s_cselect_b32 s8, 1, 0 -; GFX9-NEXT: s_mul_hi_u32 s9, s0, s3 -; GFX9-NEXT: s_and_b32 s8, s8, 1 ; GFX9-NEXT: s_add_u32 s7, s7, s9 ; GFX9-NEXT: s_cselect_b32 s9, 1, 0 -; GFX9-NEXT: s_and_b32 s9, s9, 1 ; GFX9-NEXT: s_add_i32 s8, s8, s9 ; GFX9-NEXT: s_mul_i32 s2, s2, s3 ; GFX9-NEXT: s_mul_i32 s9, s1, s4 @@ -521,17 +517,15 @@ ; GFX10-NEXT: s_mul_i32 s7, s0, s4 ; GFX10-NEXT: s_mul_hi_u32 s8, s0, s3 ; GFX10-NEXT: s_add_u32 s6, s6, s7 -; GFX10-NEXT: s_cselect_b32 s7, 1, 0 ; GFX10-NEXT: 
s_mul_i32 s2, s2, s3 -; GFX10-NEXT: s_and_b32 s7, s7, 1 ; GFX10-NEXT: s_mul_i32 s9, s1, s4 +; GFX10-NEXT: s_cselect_b32 s7, 1, 0 ; GFX10-NEXT: s_add_u32 s6, s6, s8 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 ; GFX10-NEXT: s_mul_i32 s5, s0, s5 ; GFX10-NEXT: s_add_i32 s2, s2, s9 ; GFX10-NEXT: s_mul_hi_u32 s1, s1, s3 ; GFX10-NEXT: s_add_i32 s2, s2, s5 -; GFX10-NEXT: s_and_b32 s8, s8, 1 ; GFX10-NEXT: s_mul_hi_u32 s4, s0, s4 ; GFX10-NEXT: s_add_i32 s1, s2, s1 ; GFX10-NEXT: s_add_i32 s7, s7, s8 @@ -656,24 +650,21 @@ ; GFX7-NEXT: s_mul_i32 s9, s1, s4 ; GFX7-NEXT: s_mul_i32 s10, s0, s5 ; GFX7-NEXT: s_add_u32 s9, s9, s10 -; GFX7-NEXT: s_cselect_b32 s10, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s9, v0 -; GFX7-NEXT: s_and_b32 s10, s10, 1 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_cselect_b32 s10, 1, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7-NEXT: v_mul_hi_u32 v2, v2, s4 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, s10, v1 ; GFX7-NEXT: s_mul_i32 s9, s2, s4 ; GFX7-NEXT: s_mul_i32 s10, s1, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_add_u32 s9, s9, s10 -; GFX7-NEXT: v_mul_hi_u32 v2, v2, s4 -; GFX7-NEXT: s_cselect_b32 s10, 1, 0 ; GFX7-NEXT: s_mul_i32 s11, s0, s6 -; GFX7-NEXT: s_and_b32 s10, s10, 1 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: s_add_u32 s9, s9, s11 +; GFX7-NEXT: s_add_u32 s9, s9, s10 ; GFX7-NEXT: v_mul_hi_u32 v4, s0, v3 +; GFX7-NEXT: s_cselect_b32 s10, 1, 0 +; GFX7-NEXT: s_add_u32 s9, s9, s11 ; GFX7-NEXT: s_cselect_b32 s11, 1, 0 -; GFX7-NEXT: s_and_b32 s11, s11, 1 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, s9, v2 ; GFX7-NEXT: s_add_i32 s10, s10, s11 ; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc @@ -714,24 +705,21 @@ ; GFX8-NEXT: s_mul_i32 s9, s1, s4 ; GFX8-NEXT: s_mul_i32 s10, s0, s5 ; GFX8-NEXT: s_add_u32 s9, s9, s10 -; GFX8-NEXT: s_cselect_b32 s10, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s9, v0 -; GFX8-NEXT: s_and_b32 s10, s10, 1 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: s_cselect_b32 s10, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; 
GFX8-NEXT: v_mul_hi_u32 v2, v2, s4 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s10, v1 ; GFX8-NEXT: s_mul_i32 s9, s2, s4 ; GFX8-NEXT: s_mul_i32 s10, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: s_add_u32 s9, s9, s10 -; GFX8-NEXT: v_mul_hi_u32 v2, v2, s4 -; GFX8-NEXT: s_cselect_b32 s10, 1, 0 ; GFX8-NEXT: s_mul_i32 s11, s0, s6 -; GFX8-NEXT: s_and_b32 s10, s10, 1 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: s_add_u32 s9, s9, s11 +; GFX8-NEXT: s_add_u32 s9, s9, s10 ; GFX8-NEXT: v_mul_hi_u32 v4, s0, v3 +; GFX8-NEXT: s_cselect_b32 s10, 1, 0 +; GFX8-NEXT: s_add_u32 s9, s9, s11 ; GFX8-NEXT: s_cselect_b32 s11, 1, 0 -; GFX8-NEXT: s_and_b32 s11, s11, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s9, v2 ; GFX8-NEXT: s_add_i32 s10, s10, s11 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc @@ -769,37 +757,30 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mul_i32 s9, s1, s4 ; GFX9-NEXT: s_mul_i32 s10, s0, s5 +; GFX9-NEXT: s_mul_hi_u32 s11, s0, s4 ; GFX9-NEXT: s_add_u32 s9, s9, s10 ; GFX9-NEXT: s_cselect_b32 s10, 1, 0 -; GFX9-NEXT: s_mul_hi_u32 s11, s0, s4 -; GFX9-NEXT: s_and_b32 s10, s10, 1 ; GFX9-NEXT: s_add_u32 s9, s9, s11 ; GFX9-NEXT: s_cselect_b32 s11, 1, 0 -; GFX9-NEXT: s_and_b32 s11, s11, 1 ; GFX9-NEXT: s_add_i32 s10, s10, s11 ; GFX9-NEXT: s_mul_i32 s11, s2, s4 ; GFX9-NEXT: s_mul_i32 s12, s1, s5 +; GFX9-NEXT: s_mul_i32 s13, s0, s6 ; GFX9-NEXT: s_add_u32 s11, s11, s12 ; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_mul_i32 s13, s0, s6 -; GFX9-NEXT: s_and_b32 s12, s12, 1 ; GFX9-NEXT: s_add_u32 s11, s11, s13 ; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: s_and_b32 s13, s13, 1 ; GFX9-NEXT: s_mul_hi_u32 s14, s1, s4 ; GFX9-NEXT: s_add_i32 s12, s12, s13 ; GFX9-NEXT: s_add_u32 s11, s11, s14 ; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: s_and_b32 s13, s13, 1 ; GFX9-NEXT: s_mul_hi_u32 s15, s0, s5 ; GFX9-NEXT: s_add_i32 s12, s12, s13 ; GFX9-NEXT: s_add_u32 s11, s11, s15 ; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: s_and_b32 s13, s13, 1 ; GFX9-NEXT: s_add_i32 s12, s12, s13 ; 
GFX9-NEXT: s_add_u32 s10, s11, s10 ; GFX9-NEXT: s_cselect_b32 s11, 1, 0 -; GFX9-NEXT: s_and_b32 s11, s11, 1 ; GFX9-NEXT: s_add_i32 s12, s12, s11 ; GFX9-NEXT: s_mul_i32 s3, s3, s4 ; GFX9-NEXT: s_mul_i32 s11, s2, s5 @@ -828,52 +809,45 @@ ; GFX10-NEXT: s_mul_hi_u32 s10, s0, s4 ; GFX10-NEXT: s_add_u32 s8, s8, s9 ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: s_mul_i32 s11, s1, s5 -; GFX10-NEXT: s_and_b32 s9, s9, 1 ; GFX10-NEXT: s_add_u32 s8, s8, s10 ; GFX10-NEXT: s_cselect_b32 s10, 1, 0 -; GFX10-NEXT: s_mul_i32 s12, s0, s6 -; GFX10-NEXT: s_and_b32 s10, s10, 1 -; GFX10-NEXT: s_mul_hi_u32 s13, s1, s4 +; GFX10-NEXT: s_mul_i32 s11, s1, s5 ; GFX10-NEXT: s_add_i32 s9, s9, s10 ; GFX10-NEXT: s_mul_i32 s10, s2, s4 -; GFX10-NEXT: s_mul_i32 s3, s3, s4 +; GFX10-NEXT: s_mul_i32 s12, s0, s6 ; GFX10-NEXT: s_add_u32 s10, s10, s11 ; GFX10-NEXT: s_cselect_b32 s11, 1, 0 -; GFX10-NEXT: s_mul_i32 s7, s0, s7 -; GFX10-NEXT: s_and_b32 s11, s11, 1 ; GFX10-NEXT: s_add_u32 s10, s10, s12 ; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_and_b32 s12, s12, 1 +; GFX10-NEXT: s_mul_hi_u32 s13, s1, s4 ; GFX10-NEXT: s_add_i32 s11, s11, s12 ; GFX10-NEXT: s_add_u32 s10, s10, s13 ; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_mul_hi_u32 s13, s0, s5 -; GFX10-NEXT: s_and_b32 s12, s12, 1 +; GFX10-NEXT: s_mul_hi_u32 s14, s0, s5 ; GFX10-NEXT: s_add_i32 s11, s11, s12 -; GFX10-NEXT: s_add_u32 s10, s10, s13 +; GFX10-NEXT: s_add_u32 s10, s10, s14 ; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_mul_i32 s13, s1, s6 -; GFX10-NEXT: s_and_b32 s12, s12, 1 -; GFX10-NEXT: s_mul_hi_u32 s1, s1, s5 +; GFX10-NEXT: s_mul_i32 s3, s3, s4 ; GFX10-NEXT: s_add_i32 s11, s11, s12 ; GFX10-NEXT: s_mul_i32 s12, s2, s5 ; GFX10-NEXT: s_add_u32 s9, s10, s9 ; GFX10-NEXT: s_cselect_b32 s10, 1, 0 +; GFX10-NEXT: s_mul_i32 s13, s1, s6 ; GFX10-NEXT: s_add_i32 s3, s3, s12 -; GFX10-NEXT: s_mul_hi_u32 s2, s2, s4 +; GFX10-NEXT: s_mul_i32 s7, s0, s7 ; GFX10-NEXT: s_add_i32 s3, s3, s13 -; GFX10-NEXT: s_and_b32 s10, s10, 1 +; 
GFX10-NEXT: s_mul_hi_u32 s2, s2, s4 ; GFX10-NEXT: s_add_i32 s3, s3, s7 -; GFX10-NEXT: s_add_i32 s11, s11, s10 +; GFX10-NEXT: s_mul_hi_u32 s1, s1, s5 ; GFX10-NEXT: s_add_i32 s2, s3, s2 ; GFX10-NEXT: s_mul_hi_u32 s3, s0, s6 ; GFX10-NEXT: s_add_i32 s1, s2, s1 -; GFX10-NEXT: s_mul_i32 s0, s0, s4 +; GFX10-NEXT: s_add_i32 s11, s11, s10 ; GFX10-NEXT: s_add_i32 s1, s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s9 +; GFX10-NEXT: s_mul_i32 s0, s0, s4 ; GFX10-NEXT: s_add_i32 s3, s1, s11 ; GFX10-NEXT: s_mov_b32 s1, s8 +; GFX10-NEXT: s_mov_b32 s2, s9 ; GFX10-NEXT: ; return to shader part epilog %result = mul i128 %num, %den %cast = bitcast i128 %result to <4 x i32> @@ -1082,189 +1056,168 @@ ; GFX7-NEXT: s_mul_i32 s17, s1, s8 ; GFX7-NEXT: s_mul_i32 s18, s16, s9 ; GFX7-NEXT: s_add_u32 s17, s17, s18 -; GFX7-NEXT: s_cselect_b32 s18, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s17, v0 -; GFX7-NEXT: s_and_b32 s18, s18, 1 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_cselect_b32 s18, 1, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7-NEXT: v_mul_hi_u32 v2, v2, s8 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, s18, v1 ; GFX7-NEXT: s_mul_i32 s17, s2, s8 ; GFX7-NEXT: s_mul_i32 s18, s1, s9 -; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_add_u32 s17, s17, s18 -; GFX7-NEXT: v_mul_hi_u32 v2, v2, s8 -; GFX7-NEXT: s_cselect_b32 s18, 1, 0 ; GFX7-NEXT: s_mul_i32 s19, s16, s10 -; GFX7-NEXT: s_and_b32 s18, s18, 1 ; GFX7-NEXT: v_mov_b32_e32 v3, s9 -; GFX7-NEXT: s_add_u32 s17, s17, s19 +; GFX7-NEXT: s_add_u32 s17, s17, s18 ; GFX7-NEXT: v_mul_hi_u32 v4, s16, v3 +; GFX7-NEXT: s_cselect_b32 s18, 1, 0 +; GFX7-NEXT: s_add_u32 s17, s17, s19 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 -; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, s17, v2 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v5, vcc, s18, v5 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX7-NEXT: s_mul_i32 s17, s3, s8 -; GFX7-NEXT: s_mul_i32 s18, s2, s9 ; GFX7-NEXT: 
v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX7-NEXT: s_add_u32 s17, s17, s18 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX7-NEXT: s_cselect_b32 s18, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX7-NEXT: s_mul_i32 s19, s1, s10 -; GFX7-NEXT: s_and_b32 s18, s18, 1 ; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX7-NEXT: s_add_u32 s17, s17, s19 +; GFX7-NEXT: s_mul_i32 s17, s3, s8 +; GFX7-NEXT: s_mul_i32 s18, s2, s9 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX7-NEXT: s_mul_i32 s19, s1, s10 ; GFX7-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-NEXT: s_cselect_b32 s19, 1, 0 +; GFX7-NEXT: s_add_u32 s17, s17, s18 ; GFX7-NEXT: v_mul_hi_u32 v5, v4, s8 -; GFX7-NEXT: s_and_b32 s19, s19, 1 +; GFX7-NEXT: s_cselect_b32 s18, 1, 0 +; GFX7-NEXT: s_add_u32 s17, s17, s19 +; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: s_mul_i32 s20, s16, s11 ; GFX7-NEXT: s_add_i32 s18, s18, s19 -; GFX7-NEXT: s_add_u32 s17, s17, s20 ; GFX7-NEXT: v_mul_hi_u32 v3, s1, v3 +; GFX7-NEXT: s_add_u32 s17, s17, s20 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 -; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, s17, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, s10 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX7-NEXT: v_mul_hi_u32 v7, s16, v6 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, s18, v8 -; GFX7-NEXT: s_mul_i32 s17, s4, s8 -; GFX7-NEXT: s_mul_i32 s18, s3, s9 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GFX7-NEXT: s_add_u32 s17, s17, s18 ; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX7-NEXT: s_cselect_b32 s18, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GFX7-NEXT: s_mul_i32 s19, s2, s10 -; GFX7-NEXT: s_and_b32 s18, s18, 1 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GFX7-NEXT: s_add_u32 s17, s17, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX7-NEXT: s_cselect_b32 s19, 1, 0 +; GFX7-NEXT: s_mul_i32 s17, s4, s8 +; GFX7-NEXT: s_mul_i32 s18, s3, s9 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GFX7-NEXT: s_and_b32 s19, s19, 1 +; GFX7-NEXT: s_mul_i32 s19, 
s2, s10 +; GFX7-NEXT: s_add_u32 s17, s17, s18 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX7-NEXT: s_mul_i32 s20, s1, s11 -; GFX7-NEXT: s_add_i32 s18, s18, s19 +; GFX7-NEXT: s_cselect_b32 s18, 1, 0 +; GFX7-NEXT: s_add_u32 s17, s17, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX7-NEXT: s_add_u32 s17, s17, s20 +; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GFX7-NEXT: s_mul_i32 s20, s1, s11 ; GFX7-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-NEXT: s_cselect_b32 s19, 1, 0 +; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_mul_hi_u32 v7, v5, s8 -; GFX7-NEXT: s_and_b32 s19, s19, 1 +; GFX7-NEXT: s_add_u32 s17, s17, s20 +; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: s_mul_i32 s21, s16, s12 ; GFX7-NEXT: s_add_i32 s18, s18, s19 +; GFX7-NEXT: v_mul_hi_u32 v4, v4, s9 ; GFX7-NEXT: s_add_u32 s17, s17, s21 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 -; GFX7-NEXT: v_mul_hi_u32 v4, v4, s9 -; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, s17, v7 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX7-NEXT: v_add_i32_e32 v11, vcc, s18, v11 -; GFX7-NEXT: s_mul_i32 s17, s5, s8 -; GFX7-NEXT: s_mul_i32 s18, s4, s9 ; GFX7-NEXT: v_mul_hi_u32 v8, s1, v6 -; GFX7-NEXT: s_add_u32 s17, s17, s18 +; GFX7-NEXT: v_add_i32_e32 v11, vcc, s18, v11 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GFX7-NEXT: s_cselect_b32 s18, 1, 0 ; GFX7-NEXT: v_mov_b32_e32 v9, s11 ; GFX7-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX7-NEXT: s_mul_i32 s19, s3, s10 -; GFX7-NEXT: s_and_b32 s18, s18, 1 ; GFX7-NEXT: v_mul_hi_u32 v10, s16, v9 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, v11, v7 -; GFX7-NEXT: s_add_u32 s17, s17, s19 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX7-NEXT: s_and_b32 s19, s19, 1 +; GFX7-NEXT: s_mul_i32 s17, s5, s8 +; GFX7-NEXT: s_mul_i32 s18, s4, s9 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; GFX7-NEXT: s_mul_i32 s20, s2, s11 -; 
GFX7-NEXT: s_add_i32 s18, s18, s19 +; GFX7-NEXT: s_mul_i32 s19, s3, s10 +; GFX7-NEXT: s_add_u32 s17, s17, s18 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GFX7-NEXT: s_add_u32 s17, s17, s20 +; GFX7-NEXT: s_cselect_b32 s18, 1, 0 +; GFX7-NEXT: s_add_u32 s17, s17, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; GFX7-NEXT: s_and_b32 s19, s19, 1 -; GFX7-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX7-NEXT: s_mul_i32 s21, s1, s12 +; GFX7-NEXT: s_mul_i32 s20, s2, s11 ; GFX7-NEXT: s_add_i32 s18, s18, s19 +; GFX7-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GFX7-NEXT: s_add_u32 s17, s17, s20 ; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX7-NEXT: s_add_u32 s17, s17, s21 +; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GFX7-NEXT: s_mul_i32 s21, s1, s12 ; GFX7-NEXT: v_mov_b32_e32 v7, s4 -; GFX7-NEXT: s_cselect_b32 s19, 1, 0 +; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_mul_hi_u32 v8, v7, s8 -; GFX7-NEXT: s_and_b32 s19, s19, 1 +; GFX7-NEXT: s_add_u32 s17, s17, s21 +; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: s_mul_i32 s22, s16, s13 ; GFX7-NEXT: s_add_i32 s18, s18, s19 +; GFX7-NEXT: v_mul_hi_u32 v10, v5, s9 ; GFX7-NEXT: s_add_u32 s17, s17, s22 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 -; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, s17, v8 -; GFX7-NEXT: v_mul_hi_u32 v10, v5, s9 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GFX7-NEXT: v_add_i32_e32 v14, vcc, s18, v14 -; GFX7-NEXT: s_mul_i32 s17, s6, s8 -; GFX7-NEXT: s_mul_i32 s18, s5, s9 -; GFX7-NEXT: s_add_u32 s17, s17, s18 ; GFX7-NEXT: v_mul_hi_u32 v6, s2, v6 -; GFX7-NEXT: s_cselect_b32 s18, 1, 0 +; GFX7-NEXT: v_add_i32_e32 v14, vcc, s18, v14 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GFX7-NEXT: s_mul_i32 s19, s4, s10 -; GFX7-NEXT: s_and_b32 s18, s18, 1 ; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GFX7-NEXT: s_add_u32 s17, s17, s19 ; 
GFX7-NEXT: v_mul_hi_u32 v11, s1, v9 ; GFX7-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: v_mov_b32_e32 v12, s12 ; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX7-NEXT: s_mul_i32 s20, s3, s11 -; GFX7-NEXT: s_add_i32 s18, s18, s19 +; GFX7-NEXT: s_mul_i32 s17, s6, s8 +; GFX7-NEXT: s_mul_i32 s18, s5, s9 ; GFX7-NEXT: v_mul_hi_u32 v13, s16, v12 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GFX7-NEXT: s_add_u32 s17, s17, s20 +; GFX7-NEXT: s_mul_i32 s19, s4, s10 +; GFX7-NEXT: s_add_u32 s17, s17, s18 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, v6, v11 -; GFX7-NEXT: s_cselect_b32 s19, 1, 0 +; GFX7-NEXT: s_cselect_b32 s18, 1, 0 +; GFX7-NEXT: s_add_u32 s17, s17, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GFX7-NEXT: s_and_b32 s19, s19, 1 +; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GFX7-NEXT: s_mul_i32 s21, s2, s12 +; GFX7-NEXT: s_mul_i32 s20, s3, s11 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, v6, v13 -; GFX7-NEXT: s_add_u32 s17, s17, s21 +; GFX7-NEXT: s_add_u32 s17, s17, s20 ; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GFX7-NEXT: s_and_b32 s19, s19, 1 -; GFX7-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GFX7-NEXT: s_mul_i32 s22, s1, s13 +; GFX7-NEXT: s_mul_i32 s21, s2, s12 ; GFX7-NEXT: s_add_i32 s18, s18, s19 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GFX7-NEXT: s_add_u32 s17, s17, s21 ; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX7-NEXT: s_add_u32 s17, s17, s22 +; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GFX7-NEXT: s_mul_i32 s22, s1, s13 ; GFX7-NEXT: v_mov_b32_e32 v8, s5 -; GFX7-NEXT: s_cselect_b32 s19, 1, 0 +; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_mul_hi_u32 v10, v8, s8 -; GFX7-NEXT: s_and_b32 s19, s19, 1 +; GFX7-NEXT: s_add_u32 s17, s17, s22 
+; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: s_mul_i32 s23, s16, s14 ; GFX7-NEXT: s_add_i32 s18, s18, s19 -; GFX7-NEXT: s_add_u32 s17, s17, s23 ; GFX7-NEXT: v_mul_hi_u32 v11, v7, s9 +; GFX7-NEXT: s_add_u32 s17, s17, s23 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 -; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: v_add_i32_e32 v10, vcc, s17, v10 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc @@ -1342,189 +1295,168 @@ ; GFX8-NEXT: s_mul_i32 s17, s1, s8 ; GFX8-NEXT: s_mul_i32 s18, s16, s9 ; GFX8-NEXT: s_add_u32 s17, s17, s18 -; GFX8-NEXT: s_cselect_b32 s18, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s17, v0 -; GFX8-NEXT: s_and_b32 s18, s18, 1 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: s_cselect_b32 s18, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_hi_u32 v2, v2, s8 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s18, v1 ; GFX8-NEXT: s_mul_i32 s17, s2, s8 ; GFX8-NEXT: s_mul_i32 s18, s1, s9 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: s_add_u32 s17, s17, s18 -; GFX8-NEXT: v_mul_hi_u32 v2, v2, s8 -; GFX8-NEXT: s_cselect_b32 s18, 1, 0 ; GFX8-NEXT: s_mul_i32 s19, s16, s10 -; GFX8-NEXT: s_and_b32 s18, s18, 1 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 -; GFX8-NEXT: s_add_u32 s17, s17, s19 +; GFX8-NEXT: s_add_u32 s17, s17, s18 ; GFX8-NEXT: v_mul_hi_u32 v4, s16, v3 +; GFX8-NEXT: s_cselect_b32 s18, 1, 0 +; GFX8-NEXT: s_add_u32 s17, s17, s19 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 -; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s17, v2 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s18, v5 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 -; GFX8-NEXT: s_mul_i32 s17, s3, s8 -; GFX8-NEXT: s_mul_i32 s18, s2, s9 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: s_add_u32 s17, s17, s18 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 -; GFX8-NEXT: s_cselect_b32 s18, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 -; GFX8-NEXT: s_mul_i32 s19, s1, s10 
-; GFX8-NEXT: s_and_b32 s18, s18, 1 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: s_add_u32 s17, s17, s19 +; GFX8-NEXT: s_mul_i32 s17, s3, s8 +; GFX8-NEXT: s_mul_i32 s18, s2, s9 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2 +; GFX8-NEXT: s_mul_i32 s19, s1, s10 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 +; GFX8-NEXT: s_add_u32 s17, s17, s18 ; GFX8-NEXT: v_mul_hi_u32 v5, v4, s8 -; GFX8-NEXT: s_and_b32 s19, s19, 1 +; GFX8-NEXT: s_cselect_b32 s18, 1, 0 +; GFX8-NEXT: s_add_u32 s17, s17, s19 +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: s_mul_i32 s20, s16, s11 ; GFX8-NEXT: s_add_i32 s18, s18, s19 -; GFX8-NEXT: s_add_u32 s17, s17, s20 ; GFX8-NEXT: v_mul_hi_u32 v3, s1, v3 +; GFX8-NEXT: s_add_u32 s17, s17, s20 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 -; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s17, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, s10 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_mul_hi_u32 v7, s16, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, s18, v8 -; GFX8-NEXT: s_mul_i32 s17, s4, s8 -; GFX8-NEXT: s_mul_i32 s18, s3, s9 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 -; GFX8-NEXT: s_add_u32 s17, s17, s18 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: s_cselect_b32 s18, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v8, v5 -; GFX8-NEXT: s_mul_i32 s19, s2, s10 -; GFX8-NEXT: s_and_b32 s18, s18, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7 -; GFX8-NEXT: s_add_u32 s17, s17, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 +; GFX8-NEXT: s_mul_i32 s17, s4, s8 +; GFX8-NEXT: s_mul_i32 s18, s3, s9 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v7 -; GFX8-NEXT: s_and_b32 s19, s19, 1 +; GFX8-NEXT: s_mul_i32 s19, s2, s10 +; GFX8-NEXT: s_add_u32 s17, s17, s18 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: s_mul_i32 s20, s1, s11 -; GFX8-NEXT: s_add_i32 s18, s18, s19 +; GFX8-NEXT: s_cselect_b32 s18, 1, 0 +; GFX8-NEXT: s_add_u32 s17, 
s17, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: s_add_u32 s17, s17, s20 +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 +; GFX8-NEXT: s_mul_i32 s20, s1, s11 ; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 +; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_mul_hi_u32 v7, v5, s8 -; GFX8-NEXT: s_and_b32 s19, s19, 1 +; GFX8-NEXT: s_add_u32 s17, s17, s20 +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: s_mul_i32 s21, s16, s12 ; GFX8-NEXT: s_add_i32 s18, s18, s19 +; GFX8-NEXT: v_mul_hi_u32 v4, v4, s9 ; GFX8-NEXT: s_add_u32 s17, s17, s21 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 -; GFX8-NEXT: v_mul_hi_u32 v4, v4, s9 -; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s17, v7 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v11, vcc, s18, v11 -; GFX8-NEXT: s_mul_i32 s17, s5, s8 -; GFX8-NEXT: s_mul_i32 s18, s4, s9 ; GFX8-NEXT: v_mul_hi_u32 v8, s1, v6 -; GFX8-NEXT: s_add_u32 s17, s17, s18 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, s18, v11 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 -; GFX8-NEXT: s_cselect_b32 s18, 1, 0 ; GFX8-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: s_mul_i32 s19, s3, s10 -; GFX8-NEXT: s_and_b32 s18, s18, 1 ; GFX8-NEXT: v_mul_hi_u32 v10, s16, v9 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v11, v7 -; GFX8-NEXT: s_add_u32 s17, s17, s19 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: s_and_b32 s19, s19, 1 +; GFX8-NEXT: s_mul_i32 s17, s5, s8 +; GFX8-NEXT: s_mul_i32 s18, s4, s9 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 -; GFX8-NEXT: s_mul_i32 s20, s2, s11 -; GFX8-NEXT: s_add_i32 s18, s18, s19 +; GFX8-NEXT: s_mul_i32 s19, s3, s10 +; GFX8-NEXT: s_add_u32 s17, s17, s18 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v10 -; GFX8-NEXT: s_add_u32 s17, s17, s20 +; GFX8-NEXT: s_cselect_b32 s18, 1, 0 
+; GFX8-NEXT: s_add_u32 s17, s17, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 -; GFX8-NEXT: s_and_b32 s19, s19, 1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: s_mul_i32 s21, s1, s12 +; GFX8-NEXT: s_mul_i32 s20, s2, s11 ; GFX8-NEXT: s_add_i32 s18, s18, s19 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 +; GFX8-NEXT: s_add_u32 s17, s17, s20 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: s_add_u32 s17, s17, s21 +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 +; GFX8-NEXT: s_mul_i32 s21, s1, s12 ; GFX8-NEXT: v_mov_b32_e32 v7, s4 -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 +; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_mul_hi_u32 v8, v7, s8 -; GFX8-NEXT: s_and_b32 s19, s19, 1 +; GFX8-NEXT: s_add_u32 s17, s17, s21 +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: s_mul_i32 s22, s16, s13 ; GFX8-NEXT: s_add_i32 s18, s18, s19 +; GFX8-NEXT: v_mul_hi_u32 v10, v5, s9 ; GFX8-NEXT: s_add_u32 s17, s17, s22 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 -; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, s17, v8 -; GFX8-NEXT: v_mul_hi_u32 v10, v5, s9 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v14, vcc, s18, v14 -; GFX8-NEXT: s_mul_i32 s17, s6, s8 -; GFX8-NEXT: s_mul_i32 s18, s5, s9 -; GFX8-NEXT: s_add_u32 s17, s17, s18 ; GFX8-NEXT: v_mul_hi_u32 v6, s2, v6 -; GFX8-NEXT: s_cselect_b32 s18, 1, 0 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, s18, v14 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10 -; GFX8-NEXT: s_mul_i32 s19, s4, s10 -; GFX8-NEXT: s_and_b32 s18, s18, 1 ; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GFX8-NEXT: s_add_u32 s17, s17, s19 ; GFX8-NEXT: v_mul_hi_u32 v11, s1, v9 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v14, v10 -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 -; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_mov_b32_e32 v12, 
s12 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: s_mul_i32 s20, s3, s11 -; GFX8-NEXT: s_add_i32 s18, s18, s19 +; GFX8-NEXT: s_mul_i32 s17, s6, s8 +; GFX8-NEXT: s_mul_i32 s18, s5, s9 ; GFX8-NEXT: v_mul_hi_u32 v13, s16, v12 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v10, v8 -; GFX8-NEXT: s_add_u32 s17, s17, s20 +; GFX8-NEXT: s_mul_i32 s19, s4, s10 +; GFX8-NEXT: s_add_u32 s17, s17, s18 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v11 -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 +; GFX8-NEXT: s_cselect_b32 s18, 1, 0 +; GFX8-NEXT: s_add_u32 s17, s17, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GFX8-NEXT: s_and_b32 s19, s19, 1 +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10 -; GFX8-NEXT: s_mul_i32 s21, s2, s12 +; GFX8-NEXT: s_mul_i32 s20, s3, s11 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v13 -; GFX8-NEXT: s_add_u32 s17, s17, s21 +; GFX8-NEXT: s_add_u32 s17, s17, s20 ; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10 -; GFX8-NEXT: s_and_b32 s19, s19, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 -; GFX8-NEXT: s_mul_i32 s22, s1, s13 +; GFX8-NEXT: s_mul_i32 s21, s2, s12 ; GFX8-NEXT: s_add_i32 s18, s18, s19 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 +; GFX8-NEXT: s_add_u32 s17, s17, s21 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: s_add_u32 s17, s17, s22 +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 +; GFX8-NEXT: s_mul_i32 s22, s1, s13 ; GFX8-NEXT: v_mov_b32_e32 v8, s5 -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 +; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_mul_hi_u32 v10, v8, s8 -; GFX8-NEXT: s_and_b32 s19, s19, 1 +; GFX8-NEXT: s_add_u32 s17, s17, s22 +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: s_mul_i32 s23, s16, s14 ; GFX8-NEXT: s_add_i32 s18, s18, s19 -; GFX8-NEXT: s_add_u32 s17, s17, s23 ; GFX8-NEXT: v_mul_hi_u32 v11, v7, s9 +; GFX8-NEXT: s_add_u32 s17, s17, s23 ; 
GFX8-NEXT: s_cselect_b32 s19, 1, 0 -; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, s17, v10 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc @@ -1599,233 +1531,186 @@ ; GFX9-NEXT: s_mov_b32 s16, s0 ; GFX9-NEXT: s_mul_i32 s17, s1, s8 ; GFX9-NEXT: s_mul_i32 s18, s16, s9 +; GFX9-NEXT: s_mul_hi_u32 s19, s16, s8 ; GFX9-NEXT: s_add_u32 s17, s17, s18 ; GFX9-NEXT: s_cselect_b32 s18, 1, 0 -; GFX9-NEXT: s_mul_hi_u32 s19, s16, s8 -; GFX9-NEXT: s_and_b32 s18, s18, 1 ; GFX9-NEXT: s_add_u32 s17, s17, s19 ; GFX9-NEXT: s_cselect_b32 s19, 1, 0 -; GFX9-NEXT: s_and_b32 s19, s19, 1 ; GFX9-NEXT: s_add_i32 s18, s18, s19 ; GFX9-NEXT: s_mul_i32 s19, s2, s8 ; GFX9-NEXT: s_mul_i32 s20, s1, s9 +; GFX9-NEXT: s_mul_i32 s21, s16, s10 ; GFX9-NEXT: s_add_u32 s19, s19, s20 ; GFX9-NEXT: s_cselect_b32 s20, 1, 0 -; GFX9-NEXT: s_mul_i32 s21, s16, s10 -; GFX9-NEXT: s_and_b32 s20, s20, 1 ; GFX9-NEXT: s_add_u32 s19, s19, s21 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 -; GFX9-NEXT: s_and_b32 s21, s21, 1 ; GFX9-NEXT: s_mul_hi_u32 s22, s1, s8 ; GFX9-NEXT: s_add_i32 s20, s20, s21 ; GFX9-NEXT: s_add_u32 s19, s19, s22 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 -; GFX9-NEXT: s_and_b32 s21, s21, 1 ; GFX9-NEXT: s_mul_hi_u32 s23, s16, s9 ; GFX9-NEXT: s_add_i32 s20, s20, s21 ; GFX9-NEXT: s_add_u32 s19, s19, s23 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 -; GFX9-NEXT: s_and_b32 s21, s21, 1 ; GFX9-NEXT: s_add_i32 s20, s20, s21 ; GFX9-NEXT: s_add_u32 s18, s19, s18 ; GFX9-NEXT: s_cselect_b32 s19, 1, 0 -; GFX9-NEXT: s_and_b32 s19, s19, 1 ; GFX9-NEXT: s_add_i32 s20, s20, s19 ; GFX9-NEXT: s_mul_i32 s19, s3, s8 ; GFX9-NEXT: s_mul_i32 s21, s2, s9 +; GFX9-NEXT: s_mul_i32 s22, s1, s10 ; GFX9-NEXT: s_add_u32 s19, s19, s21 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 -; GFX9-NEXT: s_mul_i32 s22, s1, s10 -; GFX9-NEXT: s_and_b32 s21, s21, 1 ; GFX9-NEXT: s_add_u32 s19, s19, s22 ; GFX9-NEXT: s_cselect_b32 s22, 1, 0 -; GFX9-NEXT: s_and_b32 s22, s22, 1 ; GFX9-NEXT: s_mul_i32 s23, s16, s11 ; GFX9-NEXT: 
s_add_i32 s21, s21, s22 ; GFX9-NEXT: s_add_u32 s19, s19, s23 ; GFX9-NEXT: s_cselect_b32 s22, 1, 0 -; GFX9-NEXT: s_and_b32 s22, s22, 1 ; GFX9-NEXT: s_mul_hi_u32 s24, s2, s8 ; GFX9-NEXT: s_add_i32 s21, s21, s22 ; GFX9-NEXT: s_add_u32 s19, s19, s24 ; GFX9-NEXT: s_cselect_b32 s22, 1, 0 -; GFX9-NEXT: s_and_b32 s22, s22, 1 ; GFX9-NEXT: s_mul_hi_u32 s25, s1, s9 ; GFX9-NEXT: s_add_i32 s21, s21, s22 ; GFX9-NEXT: s_add_u32 s19, s19, s25 ; GFX9-NEXT: s_cselect_b32 s22, 1, 0 -; GFX9-NEXT: s_and_b32 s22, s22, 1 ; GFX9-NEXT: s_mul_hi_u32 s26, s16, s10 ; GFX9-NEXT: s_add_i32 s21, s21, s22 ; GFX9-NEXT: s_add_u32 s19, s19, s26 ; GFX9-NEXT: s_cselect_b32 s22, 1, 0 -; GFX9-NEXT: s_and_b32 s22, s22, 1 ; GFX9-NEXT: s_add_i32 s21, s21, s22 ; GFX9-NEXT: s_add_u32 s19, s19, s20 ; GFX9-NEXT: s_cselect_b32 s20, 1, 0 -; GFX9-NEXT: s_and_b32 s20, s20, 1 ; GFX9-NEXT: s_add_i32 s21, s21, s20 ; GFX9-NEXT: s_mul_i32 s20, s4, s8 ; GFX9-NEXT: s_mul_i32 s22, s3, s9 +; GFX9-NEXT: s_mul_i32 s23, s2, s10 ; GFX9-NEXT: s_add_u32 s20, s20, s22 ; GFX9-NEXT: s_cselect_b32 s22, 1, 0 -; GFX9-NEXT: s_mul_i32 s23, s2, s10 -; GFX9-NEXT: s_and_b32 s22, s22, 1 ; GFX9-NEXT: s_add_u32 s20, s20, s23 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 -; GFX9-NEXT: s_and_b32 s23, s23, 1 ; GFX9-NEXT: s_mul_i32 s24, s1, s11 ; GFX9-NEXT: s_add_i32 s22, s22, s23 ; GFX9-NEXT: s_add_u32 s20, s20, s24 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 -; GFX9-NEXT: s_and_b32 s23, s23, 1 ; GFX9-NEXT: s_mul_i32 s25, s16, s12 ; GFX9-NEXT: s_add_i32 s22, s22, s23 ; GFX9-NEXT: s_add_u32 s20, s20, s25 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 -; GFX9-NEXT: s_and_b32 s23, s23, 1 ; GFX9-NEXT: s_mul_hi_u32 s26, s3, s8 ; GFX9-NEXT: s_add_i32 s22, s22, s23 ; GFX9-NEXT: s_add_u32 s20, s20, s26 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 -; GFX9-NEXT: s_and_b32 s23, s23, 1 ; GFX9-NEXT: s_mul_hi_u32 s27, s2, s9 ; GFX9-NEXT: s_add_i32 s22, s22, s23 ; GFX9-NEXT: s_add_u32 s20, s20, s27 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 -; GFX9-NEXT: s_and_b32 s23, s23, 1 ; GFX9-NEXT: 
s_mul_hi_u32 s28, s1, s10 ; GFX9-NEXT: s_add_i32 s22, s22, s23 ; GFX9-NEXT: s_add_u32 s20, s20, s28 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 -; GFX9-NEXT: s_and_b32 s23, s23, 1 ; GFX9-NEXT: s_mul_hi_u32 s29, s16, s11 ; GFX9-NEXT: s_add_i32 s22, s22, s23 ; GFX9-NEXT: s_add_u32 s20, s20, s29 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 -; GFX9-NEXT: s_and_b32 s23, s23, 1 ; GFX9-NEXT: s_add_i32 s22, s22, s23 ; GFX9-NEXT: s_add_u32 s20, s20, s21 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 -; GFX9-NEXT: s_and_b32 s21, s21, 1 ; GFX9-NEXT: s_add_i32 s22, s22, s21 ; GFX9-NEXT: s_mul_i32 s21, s5, s8 ; GFX9-NEXT: s_mul_i32 s23, s4, s9 +; GFX9-NEXT: s_mul_i32 s24, s3, s10 ; GFX9-NEXT: s_add_u32 s21, s21, s23 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 -; GFX9-NEXT: s_mul_i32 s24, s3, s10 -; GFX9-NEXT: s_and_b32 s23, s23, 1 ; GFX9-NEXT: s_add_u32 s21, s21, s24 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 -; GFX9-NEXT: s_and_b32 s24, s24, 1 ; GFX9-NEXT: s_mul_i32 s25, s2, s11 ; GFX9-NEXT: s_add_i32 s23, s23, s24 ; GFX9-NEXT: s_add_u32 s21, s21, s25 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 -; GFX9-NEXT: s_and_b32 s24, s24, 1 ; GFX9-NEXT: s_mul_i32 s26, s1, s12 ; GFX9-NEXT: s_add_i32 s23, s23, s24 ; GFX9-NEXT: s_add_u32 s21, s21, s26 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 -; GFX9-NEXT: s_and_b32 s24, s24, 1 ; GFX9-NEXT: s_mul_i32 s27, s16, s13 ; GFX9-NEXT: s_add_i32 s23, s23, s24 ; GFX9-NEXT: s_add_u32 s21, s21, s27 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 -; GFX9-NEXT: s_and_b32 s24, s24, 1 ; GFX9-NEXT: s_mul_hi_u32 s28, s4, s8 ; GFX9-NEXT: s_add_i32 s23, s23, s24 ; GFX9-NEXT: s_add_u32 s21, s21, s28 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 -; GFX9-NEXT: s_and_b32 s24, s24, 1 ; GFX9-NEXT: s_mul_hi_u32 s29, s3, s9 ; GFX9-NEXT: s_add_i32 s23, s23, s24 ; GFX9-NEXT: s_add_u32 s21, s21, s29 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 -; GFX9-NEXT: s_and_b32 s24, s24, 1 ; GFX9-NEXT: s_mul_hi_u32 s30, s2, s10 ; GFX9-NEXT: s_add_i32 s23, s23, s24 ; GFX9-NEXT: s_add_u32 s21, s21, s30 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 -; GFX9-NEXT: 
s_and_b32 s24, s24, 1 ; GFX9-NEXT: s_mul_hi_u32 s31, s1, s11 ; GFX9-NEXT: s_add_i32 s23, s23, s24 ; GFX9-NEXT: s_add_u32 s21, s21, s31 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 -; GFX9-NEXT: s_and_b32 s24, s24, 1 ; GFX9-NEXT: s_mul_hi_u32 s33, s16, s12 ; GFX9-NEXT: s_add_i32 s23, s23, s24 ; GFX9-NEXT: s_add_u32 s21, s21, s33 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 -; GFX9-NEXT: s_and_b32 s24, s24, 1 ; GFX9-NEXT: s_add_i32 s23, s23, s24 ; GFX9-NEXT: s_add_u32 s21, s21, s22 ; GFX9-NEXT: s_cselect_b32 s22, 1, 0 -; GFX9-NEXT: s_and_b32 s22, s22, 1 ; GFX9-NEXT: s_add_i32 s23, s23, s22 ; GFX9-NEXT: s_mul_i32 s22, s6, s8 ; GFX9-NEXT: s_mul_i32 s24, s5, s9 +; GFX9-NEXT: s_mul_i32 s25, s4, s10 ; GFX9-NEXT: s_add_u32 s22, s22, s24 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 -; GFX9-NEXT: s_mul_i32 s25, s4, s10 -; GFX9-NEXT: s_and_b32 s24, s24, 1 ; GFX9-NEXT: s_add_u32 s22, s22, s25 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 -; GFX9-NEXT: s_and_b32 s25, s25, 1 ; GFX9-NEXT: s_mul_i32 s26, s3, s11 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s26 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 -; GFX9-NEXT: s_and_b32 s25, s25, 1 ; GFX9-NEXT: s_mul_i32 s27, s2, s12 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s27 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 -; GFX9-NEXT: s_and_b32 s25, s25, 1 ; GFX9-NEXT: s_mul_i32 s28, s1, s13 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s28 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 -; GFX9-NEXT: s_and_b32 s25, s25, 1 ; GFX9-NEXT: s_mul_i32 s29, s16, s14 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s29 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 -; GFX9-NEXT: s_and_b32 s25, s25, 1 ; GFX9-NEXT: s_mul_hi_u32 s30, s5, s8 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s30 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 -; GFX9-NEXT: s_and_b32 s25, s25, 1 ; GFX9-NEXT: s_mul_hi_u32 s31, s4, s9 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s31 ; GFX9-NEXT: s_cselect_b32 
s25, 1, 0 -; GFX9-NEXT: s_and_b32 s25, s25, 1 ; GFX9-NEXT: s_mul_hi_u32 s33, s3, s10 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s33 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 -; GFX9-NEXT: s_and_b32 s25, s25, 1 ; GFX9-NEXT: s_mul_hi_u32 s34, s2, s11 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s34 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 -; GFX9-NEXT: s_and_b32 s25, s25, 1 ; GFX9-NEXT: s_mul_hi_u32 s35, s1, s12 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s35 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 -; GFX9-NEXT: s_and_b32 s25, s25, 1 ; GFX9-NEXT: s_mul_hi_u32 s36, s16, s13 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s36 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 -; GFX9-NEXT: s_and_b32 s25, s25, 1 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s23 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 -; GFX9-NEXT: s_and_b32 s23, s23, 1 ; GFX9-NEXT: s_add_i32 s24, s24, s23 ; GFX9-NEXT: s_mul_i32 s7, s7, s8 ; GFX9-NEXT: s_mul_i32 s23, s6, s9 @@ -1873,268 +1758,221 @@ ; GFX10-NEXT: s_mul_hi_u32 s18, s0, s8 ; GFX10-NEXT: s_add_u32 s16, s16, s17 ; GFX10-NEXT: s_cselect_b32 s17, 1, 0 -; GFX10-NEXT: s_mul_i32 s19, s1, s9 -; GFX10-NEXT: s_and_b32 s17, s17, 1 ; GFX10-NEXT: s_add_u32 s16, s16, s18 ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 -; GFX10-NEXT: s_mul_i32 s20, s0, s10 -; GFX10-NEXT: s_and_b32 s18, s18, 1 -; GFX10-NEXT: s_mul_hi_u32 s21, s1, s8 +; GFX10-NEXT: s_mul_i32 s19, s1, s9 ; GFX10-NEXT: s_add_i32 s17, s17, s18 ; GFX10-NEXT: s_mul_i32 s18, s2, s8 -; GFX10-NEXT: s_mul_i32 s22, s0, s11 +; GFX10-NEXT: s_mul_i32 s20, s0, s10 ; GFX10-NEXT: s_add_u32 s18, s18, s19 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 -; GFX10-NEXT: s_mul_i32 s23, s1, s11 -; GFX10-NEXT: s_and_b32 s19, s19, 1 ; GFX10-NEXT: s_add_u32 s18, s18, s20 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10-NEXT: s_mul_i32 s24, s0, s12 -; GFX10-NEXT: s_and_b32 s20, s20, 1 -; GFX10-NEXT: s_mul_i32 s25, s4, s9 +; GFX10-NEXT: 
s_mul_hi_u32 s21, s1, s8 ; GFX10-NEXT: s_add_i32 s19, s19, s20 ; GFX10-NEXT: s_add_u32 s18, s18, s21 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10-NEXT: s_mul_hi_u32 s21, s0, s9 -; GFX10-NEXT: s_and_b32 s20, s20, 1 -; GFX10-NEXT: s_mul_i32 s26, s2, s11 +; GFX10-NEXT: s_mul_hi_u32 s22, s0, s9 ; GFX10-NEXT: s_add_i32 s19, s19, s20 -; GFX10-NEXT: s_add_u32 s18, s18, s21 +; GFX10-NEXT: s_add_u32 s18, s18, s22 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 ; GFX10-NEXT: s_mul_i32 s21, s1, s10 -; GFX10-NEXT: s_and_b32 s20, s20, 1 -; GFX10-NEXT: s_mul_i32 s27, s0, s13 ; GFX10-NEXT: s_add_i32 s19, s19, s20 ; GFX10-NEXT: s_add_u32 s17, s18, s17 ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 ; GFX10-NEXT: s_mul_i32 s20, s2, s9 -; GFX10-NEXT: s_and_b32 s18, s18, 1 -; GFX10-NEXT: s_mul_hi_u32 s28, s3, s9 ; GFX10-NEXT: s_add_i32 s19, s19, s18 ; GFX10-NEXT: s_mul_i32 s18, s3, s8 -; GFX10-NEXT: s_mul_i32 s7, s7, s8 +; GFX10-NEXT: s_mul_i32 s22, s0, s11 ; GFX10-NEXT: s_add_u32 s18, s18, s20 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10-NEXT: s_mul_i32 s15, s0, s15 -; GFX10-NEXT: s_and_b32 s20, s20, 1 ; GFX10-NEXT: s_add_u32 s18, s18, s21 ; GFX10-NEXT: s_cselect_b32 s21, 1, 0 -; GFX10-NEXT: s_and_b32 s21, s21, 1 +; GFX10-NEXT: s_mul_hi_u32 s23, s2, s8 ; GFX10-NEXT: s_add_i32 s20, s20, s21 ; GFX10-NEXT: s_add_u32 s18, s18, s22 ; GFX10-NEXT: s_cselect_b32 s21, 1, 0 -; GFX10-NEXT: s_mul_hi_u32 s22, s2, s8 -; GFX10-NEXT: s_and_b32 s21, s21, 1 +; GFX10-NEXT: s_mul_hi_u32 s24, s1, s9 ; GFX10-NEXT: s_add_i32 s20, s20, s21 -; GFX10-NEXT: s_add_u32 s18, s18, s22 +; GFX10-NEXT: s_add_u32 s18, s18, s23 ; GFX10-NEXT: s_cselect_b32 s21, 1, 0 -; GFX10-NEXT: s_mul_hi_u32 s22, s1, s9 -; GFX10-NEXT: s_and_b32 s21, s21, 1 +; GFX10-NEXT: s_mul_hi_u32 s25, s0, s10 ; GFX10-NEXT: s_add_i32 s20, s20, s21 -; GFX10-NEXT: s_add_u32 s18, s18, s22 +; GFX10-NEXT: s_add_u32 s18, s18, s24 ; GFX10-NEXT: s_cselect_b32 s21, 1, 0 -; GFX10-NEXT: s_mul_hi_u32 s22, s0, s10 -; GFX10-NEXT: s_and_b32 s21, s21, 1 +; GFX10-NEXT: s_mul_i32 
s22, s2, s10 ; GFX10-NEXT: s_add_i32 s20, s20, s21 -; GFX10-NEXT: s_add_u32 s18, s18, s22 +; GFX10-NEXT: s_add_u32 s18, s18, s25 ; GFX10-NEXT: s_cselect_b32 s21, 1, 0 -; GFX10-NEXT: s_mul_i32 s22, s2, s10 -; GFX10-NEXT: s_and_b32 s21, s21, 1 +; GFX10-NEXT: s_mul_i32 s23, s1, s11 ; GFX10-NEXT: s_add_i32 s20, s20, s21 ; GFX10-NEXT: s_add_u32 s18, s18, s19 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 ; GFX10-NEXT: s_mul_i32 s21, s3, s9 -; GFX10-NEXT: s_and_b32 s19, s19, 1 ; GFX10-NEXT: s_add_i32 s20, s20, s19 ; GFX10-NEXT: s_mul_i32 s19, s4, s8 +; GFX10-NEXT: s_mul_i32 s24, s0, s12 ; GFX10-NEXT: s_add_u32 s19, s19, s21 ; GFX10-NEXT: s_cselect_b32 s21, 1, 0 -; GFX10-NEXT: s_and_b32 s21, s21, 1 ; GFX10-NEXT: s_add_u32 s19, s19, s22 ; GFX10-NEXT: s_cselect_b32 s22, 1, 0 -; GFX10-NEXT: s_and_b32 s22, s22, 1 +; GFX10-NEXT: s_mul_hi_u32 s25, s3, s8 ; GFX10-NEXT: s_add_i32 s21, s21, s22 ; GFX10-NEXT: s_add_u32 s19, s19, s23 ; GFX10-NEXT: s_cselect_b32 s22, 1, 0 -; GFX10-NEXT: s_mul_hi_u32 s23, s3, s8 -; GFX10-NEXT: s_and_b32 s22, s22, 1 +; GFX10-NEXT: s_mul_hi_u32 s26, s2, s9 ; GFX10-NEXT: s_add_i32 s21, s21, s22 ; GFX10-NEXT: s_add_u32 s19, s19, s24 ; GFX10-NEXT: s_cselect_b32 s22, 1, 0 -; GFX10-NEXT: s_mul_hi_u32 s24, s2, s9 -; GFX10-NEXT: s_and_b32 s22, s22, 1 +; GFX10-NEXT: s_mul_hi_u32 s27, s1, s10 ; GFX10-NEXT: s_add_i32 s21, s21, s22 -; GFX10-NEXT: s_add_u32 s19, s19, s23 +; GFX10-NEXT: s_add_u32 s19, s19, s25 ; GFX10-NEXT: s_cselect_b32 s22, 1, 0 -; GFX10-NEXT: s_mul_hi_u32 s23, s1, s10 -; GFX10-NEXT: s_and_b32 s22, s22, 1 +; GFX10-NEXT: s_mul_hi_u32 s28, s0, s11 ; GFX10-NEXT: s_add_i32 s21, s21, s22 -; GFX10-NEXT: s_add_u32 s19, s19, s24 +; GFX10-NEXT: s_add_u32 s19, s19, s26 ; GFX10-NEXT: s_cselect_b32 s22, 1, 0 -; GFX10-NEXT: s_mul_hi_u32 s24, s0, s11 -; GFX10-NEXT: s_and_b32 s22, s22, 1 +; GFX10-NEXT: s_mul_i32 s23, s3, s10 ; GFX10-NEXT: s_add_i32 s21, s21, s22 -; GFX10-NEXT: s_add_u32 s19, s19, s23 +; GFX10-NEXT: s_add_u32 s19, s19, s27 ; GFX10-NEXT: s_cselect_b32 s22, 
1, 0 -; GFX10-NEXT: s_mul_i32 s23, s5, s8 -; GFX10-NEXT: s_and_b32 s22, s22, 1 +; GFX10-NEXT: s_mul_i32 s24, s2, s11 ; GFX10-NEXT: s_add_i32 s21, s21, s22 -; GFX10-NEXT: s_add_u32 s19, s19, s24 +; GFX10-NEXT: s_add_u32 s19, s19, s28 ; GFX10-NEXT: s_cselect_b32 s22, 1, 0 -; GFX10-NEXT: s_mul_i32 s24, s3, s10 -; GFX10-NEXT: s_and_b32 s22, s22, 1 +; GFX10-NEXT: s_mul_i32 s25, s1, s12 ; GFX10-NEXT: s_add_i32 s21, s21, s22 ; GFX10-NEXT: s_add_u32 s19, s19, s20 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10-NEXT: s_mul_i32 s22, s1, s12 -; GFX10-NEXT: s_and_b32 s20, s20, 1 +; GFX10-NEXT: s_mul_i32 s22, s4, s9 ; GFX10-NEXT: s_add_i32 s21, s21, s20 -; GFX10-NEXT: s_add_u32 s23, s23, s25 -; GFX10-NEXT: s_cselect_b32 s25, 1, 0 -; GFX10-NEXT: s_mul_hi_u32 s20, s4, s8 -; GFX10-NEXT: s_and_b32 s25, s25, 1 -; GFX10-NEXT: s_add_u32 s23, s23, s24 -; GFX10-NEXT: s_cselect_b32 s24, 1, 0 -; GFX10-NEXT: s_and_b32 s24, s24, 1 -; GFX10-NEXT: s_add_i32 s24, s25, s24 -; GFX10-NEXT: s_add_u32 s23, s23, s26 -; GFX10-NEXT: s_cselect_b32 s25, 1, 0 -; GFX10-NEXT: s_mul_hi_u32 s26, s2, s10 -; GFX10-NEXT: s_and_b32 s25, s25, 1 -; GFX10-NEXT: s_add_i32 s24, s24, s25 -; GFX10-NEXT: s_add_u32 s22, s23, s22 -; GFX10-NEXT: s_cselect_b32 s23, 1, 0 -; GFX10-NEXT: s_mul_hi_u32 s25, s1, s11 -; GFX10-NEXT: s_and_b32 s23, s23, 1 -; GFX10-NEXT: s_add_i32 s23, s24, s23 -; GFX10-NEXT: s_add_u32 s22, s22, s27 -; GFX10-NEXT: s_cselect_b32 s24, 1, 0 -; GFX10-NEXT: s_mul_hi_u32 s27, s0, s12 -; GFX10-NEXT: s_and_b32 s24, s24, 1 -; GFX10-NEXT: s_add_i32 s23, s23, s24 -; GFX10-NEXT: s_add_u32 s20, s22, s20 +; GFX10-NEXT: s_mul_i32 s20, s5, s8 +; GFX10-NEXT: s_mul_i32 s26, s0, s13 +; GFX10-NEXT: s_add_u32 s20, s20, s22 ; GFX10-NEXT: s_cselect_b32 s22, 1, 0 -; GFX10-NEXT: s_mul_i32 s24, s6, s8 -; GFX10-NEXT: s_and_b32 s22, s22, 1 -; GFX10-NEXT: s_add_i32 s22, s23, s22 -; GFX10-NEXT: s_add_u32 s20, s20, s28 +; GFX10-NEXT: s_add_u32 s20, s20, s23 ; GFX10-NEXT: s_cselect_b32 s23, 1, 0 -; GFX10-NEXT: s_mul_i32 s28, s5, s9 
-; GFX10-NEXT: s_and_b32 s23, s23, 1 +; GFX10-NEXT: s_mul_hi_u32 s27, s4, s8 ; GFX10-NEXT: s_add_i32 s22, s22, s23 -; GFX10-NEXT: s_add_u32 s20, s20, s26 +; GFX10-NEXT: s_add_u32 s20, s20, s24 ; GFX10-NEXT: s_cselect_b32 s23, 1, 0 -; GFX10-NEXT: s_mul_i32 s26, s4, s10 -; GFX10-NEXT: s_and_b32 s23, s23, 1 +; GFX10-NEXT: s_mul_hi_u32 s28, s3, s9 ; GFX10-NEXT: s_add_i32 s22, s22, s23 ; GFX10-NEXT: s_add_u32 s20, s20, s25 ; GFX10-NEXT: s_cselect_b32 s23, 1, 0 -; GFX10-NEXT: s_mul_i32 s25, s3, s11 -; GFX10-NEXT: s_and_b32 s23, s23, 1 +; GFX10-NEXT: s_mul_hi_u32 s29, s2, s10 +; GFX10-NEXT: s_add_i32 s22, s22, s23 +; GFX10-NEXT: s_add_u32 s20, s20, s26 +; GFX10-NEXT: s_cselect_b32 s23, 1, 0 +; GFX10-NEXT: s_mul_hi_u32 s30, s1, s11 ; GFX10-NEXT: s_add_i32 s22, s22, s23 ; GFX10-NEXT: s_add_u32 s20, s20, s27 ; GFX10-NEXT: s_cselect_b32 s23, 1, 0 -; GFX10-NEXT: s_mul_i32 s27, s2, s12 -; GFX10-NEXT: s_and_b32 s23, s23, 1 +; GFX10-NEXT: s_mul_hi_u32 s31, s0, s12 +; GFX10-NEXT: s_add_i32 s22, s22, s23 +; GFX10-NEXT: s_add_u32 s20, s20, s28 +; GFX10-NEXT: s_cselect_b32 s23, 1, 0 +; GFX10-NEXT: s_mul_i32 s24, s4, s10 +; GFX10-NEXT: s_add_i32 s22, s22, s23 +; GFX10-NEXT: s_add_u32 s20, s20, s29 +; GFX10-NEXT: s_cselect_b32 s23, 1, 0 +; GFX10-NEXT: s_mul_i32 s25, s3, s11 +; GFX10-NEXT: s_add_i32 s22, s22, s23 +; GFX10-NEXT: s_add_u32 s20, s20, s30 +; GFX10-NEXT: s_cselect_b32 s23, 1, 0 +; GFX10-NEXT: s_mul_i32 s26, s2, s12 +; GFX10-NEXT: s_add_i32 s22, s22, s23 +; GFX10-NEXT: s_add_u32 s20, s20, s31 +; GFX10-NEXT: s_cselect_b32 s23, 1, 0 +; GFX10-NEXT: s_mul_i32 s27, s1, s13 ; GFX10-NEXT: s_add_i32 s22, s22, s23 ; GFX10-NEXT: s_add_u32 s20, s20, s21 ; GFX10-NEXT: s_cselect_b32 s21, 1, 0 -; GFX10-NEXT: s_mul_i32 s23, s1, s13 -; GFX10-NEXT: s_and_b32 s21, s21, 1 +; GFX10-NEXT: s_mul_i32 s23, s5, s9 ; GFX10-NEXT: s_add_i32 s22, s22, s21 -; GFX10-NEXT: s_add_u32 s21, s24, s28 -; GFX10-NEXT: s_cselect_b32 s24, 1, 0 +; GFX10-NEXT: s_mul_i32 s21, s6, s8 ; GFX10-NEXT: s_mul_i32 s28, s0, s14 
-; GFX10-NEXT: s_and_b32 s24, s24, 1 -; GFX10-NEXT: s_add_u32 s21, s21, s26 -; GFX10-NEXT: s_cselect_b32 s26, 1, 0 -; GFX10-NEXT: s_and_b32 s26, s26, 1 -; GFX10-NEXT: s_add_i32 s24, s24, s26 -; GFX10-NEXT: s_add_u32 s21, s21, s25 -; GFX10-NEXT: s_cselect_b32 s25, 1, 0 -; GFX10-NEXT: s_mul_hi_u32 s26, s5, s8 -; GFX10-NEXT: s_and_b32 s25, s25, 1 -; GFX10-NEXT: s_add_i32 s24, s24, s25 -; GFX10-NEXT: s_add_u32 s21, s21, s27 -; GFX10-NEXT: s_cselect_b32 s25, 1, 0 -; GFX10-NEXT: s_mul_hi_u32 s27, s4, s9 -; GFX10-NEXT: s_and_b32 s25, s25, 1 -; GFX10-NEXT: s_add_i32 s24, s24, s25 ; GFX10-NEXT: s_add_u32 s21, s21, s23 ; GFX10-NEXT: s_cselect_b32 s23, 1, 0 -; GFX10-NEXT: s_mul_hi_u32 s25, s3, s10 -; GFX10-NEXT: s_and_b32 s23, s23, 1 -; GFX10-NEXT: s_add_i32 s23, s24, s23 -; GFX10-NEXT: s_add_u32 s21, s21, s28 +; GFX10-NEXT: s_add_u32 s21, s21, s24 ; GFX10-NEXT: s_cselect_b32 s24, 1, 0 -; GFX10-NEXT: s_mul_hi_u32 s28, s2, s11 -; GFX10-NEXT: s_and_b32 s24, s24, 1 +; GFX10-NEXT: s_mul_hi_u32 s29, s5, s8 +; GFX10-NEXT: s_add_i32 s23, s23, s24 +; GFX10-NEXT: s_add_u32 s21, s21, s25 +; GFX10-NEXT: s_cselect_b32 s24, 1, 0 +; GFX10-NEXT: s_mul_hi_u32 s30, s4, s9 ; GFX10-NEXT: s_add_i32 s23, s23, s24 ; GFX10-NEXT: s_add_u32 s21, s21, s26 ; GFX10-NEXT: s_cselect_b32 s24, 1, 0 -; GFX10-NEXT: s_mul_hi_u32 s26, s1, s12 -; GFX10-NEXT: s_and_b32 s24, s24, 1 +; GFX10-NEXT: s_mul_hi_u32 s31, s3, s10 ; GFX10-NEXT: s_add_i32 s23, s23, s24 ; GFX10-NEXT: s_add_u32 s21, s21, s27 ; GFX10-NEXT: s_cselect_b32 s24, 1, 0 -; GFX10-NEXT: s_mul_hi_u32 s27, s0, s13 -; GFX10-NEXT: s_and_b32 s24, s24, 1 +; GFX10-NEXT: s_mul_hi_u32 s33, s2, s11 ; GFX10-NEXT: s_add_i32 s23, s23, s24 -; GFX10-NEXT: s_add_u32 s21, s21, s25 +; GFX10-NEXT: s_add_u32 s21, s21, s28 ; GFX10-NEXT: s_cselect_b32 s24, 1, 0 -; GFX10-NEXT: s_mul_i32 s25, s6, s9 -; GFX10-NEXT: s_and_b32 s24, s24, 1 -; GFX10-NEXT: s_mul_hi_u32 s6, s6, s8 +; GFX10-NEXT: s_mul_hi_u32 s34, s1, s12 ; GFX10-NEXT: s_add_i32 s23, s23, s24 -; GFX10-NEXT: s_add_u32 
s21, s21, s28 +; GFX10-NEXT: s_add_u32 s21, s21, s29 ; GFX10-NEXT: s_cselect_b32 s24, 1, 0 -; GFX10-NEXT: s_and_b32 s24, s24, 1 +; GFX10-NEXT: s_mul_hi_u32 s35, s0, s13 ; GFX10-NEXT: s_add_i32 s23, s23, s24 -; GFX10-NEXT: s_add_u32 s21, s21, s26 +; GFX10-NEXT: s_add_u32 s21, s21, s30 +; GFX10-NEXT: s_cselect_b32 s24, 1, 0 +; GFX10-NEXT: s_mul_i32 s7, s7, s8 +; GFX10-NEXT: s_add_i32 s23, s23, s24 +; GFX10-NEXT: s_add_u32 s21, s21, s31 +; GFX10-NEXT: s_cselect_b32 s24, 1, 0 +; GFX10-NEXT: s_mul_i32 s25, s5, s10 +; GFX10-NEXT: s_add_i32 s23, s23, s24 +; GFX10-NEXT: s_add_u32 s21, s21, s33 +; GFX10-NEXT: s_cselect_b32 s24, 1, 0 +; GFX10-NEXT: s_mul_i32 s15, s0, s15 +; GFX10-NEXT: s_add_i32 s23, s23, s24 +; GFX10-NEXT: s_add_u32 s21, s21, s34 ; GFX10-NEXT: s_cselect_b32 s24, 1, 0 -; GFX10-NEXT: s_mul_i32 s26, s5, s10 -; GFX10-NEXT: s_and_b32 s24, s24, 1 ; GFX10-NEXT: s_mul_hi_u32 s5, s5, s9 ; GFX10-NEXT: s_add_i32 s23, s23, s24 -; GFX10-NEXT: s_add_u32 s21, s21, s27 +; GFX10-NEXT: s_add_u32 s21, s21, s35 ; GFX10-NEXT: s_cselect_b32 s24, 1, 0 -; GFX10-NEXT: s_mul_i32 s27, s4, s11 -; GFX10-NEXT: s_and_b32 s24, s24, 1 -; GFX10-NEXT: s_mul_hi_u32 s4, s4, s10 ; GFX10-NEXT: s_add_i32 s23, s23, s24 +; GFX10-NEXT: s_mul_i32 s24, s6, s9 ; GFX10-NEXT: s_add_u32 s21, s21, s22 ; GFX10-NEXT: s_cselect_b32 s22, 1, 0 +; GFX10-NEXT: s_add_i32 s7, s7, s24 +; GFX10-NEXT: s_mul_i32 s24, s4, s11 ; GFX10-NEXT: s_add_i32 s7, s7, s25 -; GFX10-NEXT: s_mul_i32 s24, s3, s12 -; GFX10-NEXT: s_add_i32 s7, s7, s26 -; GFX10-NEXT: s_mul_i32 s25, s2, s13 -; GFX10-NEXT: s_add_i32 s7, s7, s27 -; GFX10-NEXT: s_mul_i32 s26, s1, s14 +; GFX10-NEXT: s_mul_i32 s25, s3, s12 ; GFX10-NEXT: s_add_i32 s7, s7, s24 -; GFX10-NEXT: s_mul_hi_u32 s3, s3, s11 +; GFX10-NEXT: s_mul_i32 s24, s2, s13 ; GFX10-NEXT: s_add_i32 s7, s7, s25 -; GFX10-NEXT: s_mul_hi_u32 s2, s2, s12 -; GFX10-NEXT: s_add_i32 s7, s7, s26 -; GFX10-NEXT: s_mul_hi_u32 s1, s1, s13 +; GFX10-NEXT: s_mul_i32 s25, s1, s14 +; GFX10-NEXT: s_add_i32 s7, s7, s24 
+; GFX10-NEXT: s_mul_hi_u32 s6, s6, s8 +; GFX10-NEXT: s_add_i32 s7, s7, s25 +; GFX10-NEXT: s_mul_hi_u32 s4, s4, s10 ; GFX10-NEXT: s_add_i32 s7, s7, s15 +; GFX10-NEXT: s_mul_hi_u32 s3, s3, s11 ; GFX10-NEXT: s_add_i32 s6, s7, s6 +; GFX10-NEXT: s_mul_hi_u32 s2, s2, s12 ; GFX10-NEXT: s_add_i32 s5, s6, s5 -; GFX10-NEXT: s_mov_b32 s6, s21 +; GFX10-NEXT: s_mul_hi_u32 s1, s1, s13 ; GFX10-NEXT: s_add_i32 s4, s5, s4 -; GFX10-NEXT: s_mov_b32 s5, s20 +; GFX10-NEXT: s_add_i32 s23, s23, s22 ; GFX10-NEXT: s_add_i32 s3, s4, s3 -; GFX10-NEXT: s_mul_hi_u32 s4, s0, s14 +; GFX10-NEXT: s_mov_b32 s4, s19 ; GFX10-NEXT: s_add_i32 s2, s3, s2 -; GFX10-NEXT: s_and_b32 s3, s22, 1 +; GFX10-NEXT: s_mul_hi_u32 s3, s0, s14 ; GFX10-NEXT: s_add_i32 s1, s2, s1 -; GFX10-NEXT: s_add_i32 s23, s23, s3 -; GFX10-NEXT: s_add_i32 s1, s1, s4 ; GFX10-NEXT: s_mul_i32 s0, s0, s8 +; GFX10-NEXT: s_add_i32 s1, s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s17 ; GFX10-NEXT: s_add_i32 s7, s1, s23 ; GFX10-NEXT: s_mov_b32 s1, s16 -; GFX10-NEXT: s_mov_b32 s2, s17 ; GFX10-NEXT: s_mov_b32 s3, s18 -; GFX10-NEXT: s_mov_b32 s4, s19 +; GFX10-NEXT: s_mov_b32 s5, s20 +; GFX10-NEXT: s_mov_b32 s6, s21 ; GFX10-NEXT: ; return to shader part epilog %result = mul i256 %num, %den %cast = bitcast i256 %result to <8 x i32> Index: llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-sextload-from-sextinreg.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-sextload-from-sextinreg.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-sextload-from-sextinreg.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs -o - %s | FileCheck %s # 
Post-legalizer should not generate illegal extending loads --- @@ -21,3 +21,161 @@ %2:_(s64) = G_SEXT_INREG %1, 8 $vgpr0_vgpr1 = COPY %2 ... + +# Legal to fold into sextload +--- +name: sext_inreg_8_sextload_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: sext_inreg_8_sextload_s32 + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY]](p1) :: (load (s8), align 4, addrspace 1) + ; CHECK-NEXT: $vgpr0 = COPY [[SEXTLOAD]](s32) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 1) + %2:_(s32) = G_SEXT_INREG %1, 8 + $vgpr0 = COPY %2 +... + +--- +name: sext_inreg_7_sextload_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: sext_inreg_7_sextload_s32 + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load (s32), addrspace 1) + ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LOAD]], 7 + ; CHECK-NEXT: $vgpr0 = COPY [[SEXT_INREG]](s32) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 1) + %2:_(s32) = G_SEXT_INREG %1, 7 + $vgpr0 = COPY %2 +... 
+ +--- +name: sext_inreg_9_sextload_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: sext_inreg_9_sextload_s32 + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load (s32), addrspace 1) + ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LOAD]], 9 + ; CHECK-NEXT: $vgpr0 = COPY [[SEXT_INREG]](s32) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 1) + %2:_(s32) = G_SEXT_INREG %1, 9 + $vgpr0 = COPY %2 +... + +# Legal to fold into sextload +--- +name: sext_inreg_16_sextload_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: sext_inreg_16_sextload_s32 + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY]](p1) :: (load (s16), align 4, addrspace 1) + ; CHECK-NEXT: $vgpr0 = COPY [[SEXTLOAD]](s32) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 1) + %2:_(s32) = G_SEXT_INREG %1, 16 + $vgpr0 = COPY %2 +... + +--- +name: sext_inreg_8_sextload_s8 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: sext_inreg_8_sextload_s8 + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY]](p1) :: (load (s8), addrspace 1) + ; CHECK-NEXT: $vgpr0 = COPY [[SEXTLOAD]](s32) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_LOAD %0 :: (load (s8), align 1, addrspace 1) + %2:_(s32) = G_SEXT_INREG %1, 8 + $vgpr0 = COPY %2 +... 
+ +--- +name: sext_inreg_8_sextload_s8_volatile +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: sext_inreg_8_sextload_s8_volatile + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (volatile load (s8), addrspace 1) + ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LOAD]], 8 + ; CHECK-NEXT: $vgpr0 = COPY [[SEXT_INREG]](s32) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_LOAD %0 :: (volatile load (s8), align 1, addrspace 1) + %2:_(s32) = G_SEXT_INREG %1, 8 + $vgpr0 = COPY %2 +... + +--- +name: sext_inreg_16_sextload_s16 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: sext_inreg_16_sextload_s16 + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY]](p1) :: (load (s16), addrspace 1) + ; CHECK-NEXT: $vgpr0 = COPY [[SEXTLOAD]](s32) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_LOAD %0 :: (load (s16), align 2, addrspace 1) + %2:_(s32) = G_SEXT_INREG %1, 16 + $vgpr0 = COPY %2 +... + +--- +name: sext_inreg_16_sextload_s16_volatile +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: sext_inreg_16_sextload_s16_volatile + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (volatile load (s16), addrspace 1) + ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LOAD]], 16 + ; CHECK-NEXT: $vgpr0 = COPY [[SEXT_INREG]](s32) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_LOAD %0 :: (volatile load (s16), align 2, addrspace 1) + %2:_(s32) = G_SEXT_INREG %1, 16 + $vgpr0 = COPY %2 +... 
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-zextload-from-and.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-zextload-from-and.mir @@ -0,0 +1,195 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs -o - %s | FileCheck %s + +# Post-legalizer should not generate illegal extending loads +--- +name: zextload_from_inreg +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: zextload_from_inreg + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p1) :: (load (s64), addrspace 1) + ; CHECK-NEXT: %k:_(s64) = G_CONSTANT i64 4294967295 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[LOAD]], %k + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[AND]](s64) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s64) = G_LOAD %0 :: (load (s64), align 8, addrspace 1) + %k:_(s64) = G_CONSTANT i64 4294967295 + %2:_(s64) = G_AND %1, %k + $vgpr0_vgpr1 = COPY %2 +... + +# Legal to fold into zextload +--- +name: zext_inreg_8_zextload_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: zext_inreg_8_zextload_s32 + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p1) :: (load (s8), align 4, addrspace 1) + ; CHECK-NEXT: $vgpr0 = COPY [[ZEXTLOAD]](s32) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 1) + %k:_(s32) = G_CONSTANT i32 255 + %2:_(s32) = G_AND %1, %k + $vgpr0 = COPY %2 +... 
+ +--- +name: zext_inreg_7_zextload_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: zext_inreg_7_zextload_s32 + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load (s32), addrspace 1) + ; CHECK-NEXT: %k:_(s32) = G_CONSTANT i32 127 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LOAD]], %k + ; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 1) + %k:_(s32) = G_CONSTANT i32 127 + %2:_(s32) = G_AND %1, %k + $vgpr0 = COPY %2 +... + +--- +name: zext_inreg_9_zextload_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: zext_inreg_9_zextload_s32 + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load (s32), addrspace 1) + ; CHECK-NEXT: %k:_(s32) = G_CONSTANT i32 511 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LOAD]], %k + ; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 1) + %k:_(s32) = G_CONSTANT i32 511 + %2:_(s32) = G_AND %1, %k + $vgpr0 = COPY %2 +... 
+ +# Legal to fold into zextload +--- +name: zext_inreg_16_zextload_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: zext_inreg_16_zextload_s32 + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p1) :: (load (s16), align 4, addrspace 1) + ; CHECK-NEXT: $vgpr0 = COPY [[ZEXTLOAD]](s32) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 1) + %k:_(s32) = G_CONSTANT i32 65535 + %2:_(s32) = G_AND %1, %k + $vgpr0 = COPY %2 +... + +--- +name: zext_inreg_8_zextload_s8 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: zext_inreg_8_zextload_s8 + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p1) :: (load (s8), addrspace 1) + ; CHECK-NEXT: $vgpr0 = COPY [[ZEXTLOAD]](s32) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_LOAD %0 :: (load (s8), align 1, addrspace 1) + %k:_(s32) = G_CONSTANT i32 255 + %2:_(s32) = G_AND %1, %k + $vgpr0 = COPY %2 +... + +--- +name: zext_inreg_8_zextload_s8_volatile +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: zext_inreg_8_zextload_s8_volatile + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (volatile load (s8), addrspace 1) + ; CHECK-NEXT: %k:_(s32) = G_CONSTANT i32 255 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LOAD]], %k + ; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_LOAD %0 :: (volatile load (s8), align 1, addrspace 1) + %k:_(s32) = G_CONSTANT i32 255 + %2:_(s32) = G_AND %1, %k + $vgpr0 = COPY %2 +... 
+ +--- +name: zext_inreg_16_zextload_s16 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: zext_inreg_16_zextload_s16 + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p1) :: (load (s16), addrspace 1) + ; CHECK-NEXT: $vgpr0 = COPY [[ZEXTLOAD]](s32) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_LOAD %0 :: (load (s16), align 2, addrspace 1) + %k:_(s32) = G_CONSTANT i32 65535 + %2:_(s32) = G_AND %1, %k + $vgpr0 = COPY %2 +... + +--- +name: zext_inreg_16_zextload_s16_volatile +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: zext_inreg_16_zextload_s16_volatile + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (volatile load (s16), addrspace 1) + ; CHECK-NEXT: %k:_(s32) = G_CONSTANT i32 65535 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LOAD]], %k + ; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_LOAD %0 :: (volatile load (s16), align 2, addrspace 1) + %k:_(s32) = G_CONSTANT i32 65535 + %2:_(s32) = G_AND %1, %k + $vgpr0 = COPY %2 +... 
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -4217,9 +4217,6 @@ ; GFX6-LABEL: s_saddsat_i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s4, s0, s2 -; GFX6-NEXT: s_cselect_b32 s5, 1, 0 -; GFX6-NEXT: s_and_b32 s5, s5, 1 -; GFX6-NEXT: s_cmp_lg_u32 s5, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_addc_u32 s5, s1, s3 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -4243,9 +4240,6 @@ ; GFX8-LABEL: s_saddsat_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s4, s0, s2 -; GFX8-NEXT: s_cselect_b32 s5, 1, 0 -; GFX8-NEXT: s_and_b32 s5, s5, 1 -; GFX8-NEXT: s_cmp_lg_u32 s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_addc_u32 s5, s1, s3 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -4269,9 +4263,6 @@ ; GFX9-LABEL: s_saddsat_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s4, s0, s2 -; GFX9-NEXT: s_cselect_b32 s5, 1, 0 -; GFX9-NEXT: s_and_b32 s5, s5, 1 -; GFX9-NEXT: s_cmp_lg_u32 s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 s5, s1, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -4295,15 +4286,12 @@ ; GFX10-LABEL: s_saddsat_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_add_u32 s4, s0, s2 -; GFX10-NEXT: s_cselect_b32 s5, 1, 0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0 -; GFX10-NEXT: s_and_b32 s5, s5, 1 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: s_cmp_lg_u32 s5, 0 ; GFX10-NEXT: s_addc_u32 s5, s1, s3 -; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0 ; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], s[0:1] +; GFX10-NEXT: s_mov_b32 s3, 0 ; GFX10-NEXT: s_ashr_i32 s0, s5, 31 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: s_xor_b32 s2, s2, s1 ; GFX10-NEXT: s_cmp_lg_u32 s3, 0 @@ -4559,9 +4547,6 @@ ; GFX6-LABEL: s_saddsat_v2i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s8, s0, s4 -; GFX6-NEXT: s_cselect_b32 s9, 1, 0 -; GFX6-NEXT: s_and_b32 s9, s9, 1 -; 
GFX6-NEXT: s_cmp_lg_u32 s9, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_addc_u32 s9, s1, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -4572,16 +4557,13 @@ ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: s_cmp_lg_u32 s10, 0 -; GFX6-NEXT: s_addc_u32 s1, s4, s5 -; GFX6-NEXT: s_add_u32 s0, s2, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: s_cselect_b32 s1, 1, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: s_and_b32 s1, s1, 1 +; GFX6-NEXT: s_addc_u32 s1, s4, s5 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc -; GFX6-NEXT: s_cmp_lg_u32 s1, 0 +; GFX6-NEXT: s_add_u32 s0, s2, s6 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: s_addc_u32 s1, s3, s7 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 @@ -4608,9 +4590,6 @@ ; GFX8-LABEL: s_saddsat_v2i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s8, s0, s4 -; GFX8-NEXT: s_cselect_b32 s9, 1, 0 -; GFX8-NEXT: s_and_b32 s9, s9, 1 -; GFX8-NEXT: s_cmp_lg_u32 s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_addc_u32 s9, s1, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -4621,16 +4600,13 @@ ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: s_cmp_lg_u32 s10, 0 -; GFX8-NEXT: s_addc_u32 s1, s4, s5 -; GFX8-NEXT: s_add_u32 s0, s2, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: s_cselect_b32 s1, 1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: s_and_b32 s1, s1, 1 +; GFX8-NEXT: s_addc_u32 s1, s4, s5 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: s_add_u32 s0, s2, s6 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: s_addc_u32 s1, s3, s7 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -4657,9 +4633,6 @@ ; GFX9-LABEL: s_saddsat_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s8, s0, s4 -; GFX9-NEXT: s_cselect_b32 s9, 1, 0 -; GFX9-NEXT: s_and_b32 
s9, s9, 1 -; GFX9-NEXT: s_cmp_lg_u32 s9, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 s9, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -4670,16 +4643,13 @@ ; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: s_brev_b32 s5, 1 ; GFX9-NEXT: s_cmp_lg_u32 s10, 0 -; GFX9-NEXT: s_addc_u32 s1, s4, s5 -; GFX9-NEXT: s_add_u32 s0, s2, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: s_cselect_b32 s1, 1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: s_and_b32 s1, s1, 1 +; GFX9-NEXT: s_addc_u32 s1, s4, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: s_add_u32 s0, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: s_addc_u32 s1, s3, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -4706,32 +4676,26 @@ ; GFX10-LABEL: s_saddsat_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_add_u32 s8, s0, s4 -; GFX10-NEXT: s_cselect_b32 s9, 1, 0 +; GFX10-NEXT: s_addc_u32 s9, s1, s5 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[4:5], 0 -; GFX10-NEXT: s_and_b32 s9, s9, 1 +; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[0:1] ; GFX10-NEXT: s_mov_b32 s11, 0 -; GFX10-NEXT: s_cmp_lg_u32 s9, 0 +; GFX10-NEXT: s_ashr_i32 s0, s9, 31 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_addc_u32 s9, s1, s5 ; GFX10-NEXT: s_brev_b32 s10, 1 -; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[0:1] -; GFX10-NEXT: s_ashr_i32 s0, s9, 31 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: s_xor_b32 s8, s4, s1 ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8 ; GFX10-NEXT: s_addc_u32 s1, s0, s10 ; GFX10-NEXT: s_add_u32 s4, s2, s6 -; GFX10-NEXT: s_cselect_b32 s5, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_and_b32 s5, s5, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8 -; GFX10-NEXT: s_cmp_lg_u32 s5, 0 ; GFX10-NEXT: s_addc_u32 s5, s3, s7 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3] ; GFX10-NEXT: 
v_cmp_lt_i64_e64 s3, s[6:7], 0 ; GFX10-NEXT: s_ashr_i32 s0, s5, 31 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8 ; GFX10-NEXT: s_xor_b32 s2, s3, s2 ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2 @@ -4750,19 +4714,10 @@ ; GFX6-LABEL: s_saddsat_i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s4, s0, s4 -; GFX6-NEXT: s_cselect_b32 s8, 1, 0 -; GFX6-NEXT: s_and_b32 s8, s8, 1 -; GFX6-NEXT: s_cmp_lg_u32 s8, 0 -; GFX6-NEXT: s_addc_u32 s5, s1, s5 -; GFX6-NEXT: s_cselect_b32 s8, 1, 0 -; GFX6-NEXT: s_and_b32 s8, s8, 1 -; GFX6-NEXT: s_cmp_lg_u32 s8, 0 -; GFX6-NEXT: s_addc_u32 s8, s2, s6 -; GFX6-NEXT: s_cselect_b32 s9, 1, 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s1 -; GFX6-NEXT: s_and_b32 s9, s9, 1 +; GFX6-NEXT: s_addc_u32 s5, s1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_cmp_lg_u32 s9, 0 +; GFX6-NEXT: s_addc_u32 s8, s2, s6 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] ; GFX6-NEXT: s_addc_u32 s9, s3, s7 @@ -4779,15 +4734,9 @@ ; GFX6-NEXT: s_ashr_i32 s0, s9, 31 ; GFX6-NEXT: s_mov_b32 s1, 0 ; GFX6-NEXT: s_cmp_lg_u32 s1, 0 +; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_addc_u32 s1, s0, 0 -; GFX6-NEXT: s_cselect_b32 s2, 1, 0 -; GFX6-NEXT: s_and_b32 s2, s2, 1 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: s_addc_u32 s2, s0, 0 -; GFX6-NEXT: s_cselect_b32 s3, 1, 0 -; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_and_b32 s3, s3, 1 -; GFX6-NEXT: s_cmp_lg_u32 s3, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 @@ -4812,18 +4761,9 @@ ; GFX8-LABEL: s_saddsat_i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s4, s0, s4 -; GFX8-NEXT: s_cselect_b32 s8, 1, 0 -; GFX8-NEXT: s_and_b32 s8, s8, 1 -; GFX8-NEXT: s_cmp_lg_u32 s8, 0 ; GFX8-NEXT: s_addc_u32 s5, s1, s5 -; GFX8-NEXT: s_cselect_b32 s8, 1, 0 -; GFX8-NEXT: s_and_b32 s8, s8, 1 -; GFX8-NEXT: s_cmp_lg_u32 s8, 0 -; GFX8-NEXT: s_addc_u32 s8, s2, s6 -; GFX8-NEXT: 
s_cselect_b32 s9, 1, 0 -; GFX8-NEXT: s_and_b32 s9, s9, 1 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_cmp_lg_u32 s9, 0 +; GFX8-NEXT: s_addc_u32 s8, s2, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: s_addc_u32 s9, s3, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -4845,17 +4785,11 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX8-NEXT: s_ashr_i32 s0, s9, 31 ; GFX8-NEXT: s_mov_b32 s1, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_addc_u32 s1, s0, 0 -; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: s_and_b32 s2, s2, 1 -; GFX8-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8-NEXT: s_addc_u32 s2, s0, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: s_cselect_b32 s3, 1, 0 -; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: s_and_b32 s3, s3, 1 -; GFX8-NEXT: s_cmp_lg_u32 s3, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 @@ -4880,18 +4814,9 @@ ; GFX9-LABEL: s_saddsat_i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s4, s0, s4 -; GFX9-NEXT: s_cselect_b32 s8, 1, 0 -; GFX9-NEXT: s_and_b32 s8, s8, 1 -; GFX9-NEXT: s_cmp_lg_u32 s8, 0 ; GFX9-NEXT: s_addc_u32 s5, s1, s5 -; GFX9-NEXT: s_cselect_b32 s8, 1, 0 -; GFX9-NEXT: s_and_b32 s8, s8, 1 -; GFX9-NEXT: s_cmp_lg_u32 s8, 0 -; GFX9-NEXT: s_addc_u32 s8, s2, s6 -; GFX9-NEXT: s_cselect_b32 s9, 1, 0 -; GFX9-NEXT: s_and_b32 s9, s9, 1 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_cmp_lg_u32 s9, 0 +; GFX9-NEXT: s_addc_u32 s8, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_addc_u32 s9, s3, s7 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4913,17 +4838,11 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX9-NEXT: s_ashr_i32 s0, s9, 31 ; GFX9-NEXT: s_mov_b32 s1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_addc_u32 s1, s0, 0 -; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: 
s_and_b32 s2, s2, 1 -; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_addc_u32 s2, s0, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: s_cselect_b32 s3, 1, 0 -; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: s_and_b32 s3, s3, 1 -; GFX9-NEXT: s_cmp_lg_u32 s3, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 @@ -4948,60 +4867,45 @@ ; GFX10-LABEL: s_saddsat_i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_add_u32 s4, s0, s4 -; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s10, s[6:7], 0 -; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_addc_u32 s5, s1, s5 -; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], s[0:1] -; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s10 -; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_addc_u32 s8, s2, s6 -; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX10-NEXT: s_and_b32 s9, s9, 1 -; GFX10-NEXT: v_mov_b32_e32 v3, s8 -; GFX10-NEXT: s_cmp_lg_u32 s9, 0 +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], s[0:1] ; GFX10-NEXT: s_addc_u32 s9, s3, s7 ; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[2:3] -; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[8:9], s[2:3] +; GFX10-NEXT: v_mov_b32_e32 v3, s9 ; GFX10-NEXT: s_cselect_b32 s10, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s9 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s0, 1, s10 ; GFX10-NEXT: s_cmp_eq_u64 s[6:7], 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 +; GFX10-NEXT: s_cselect_b32 s1, 1, 0 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: s_and_b32 s1, 1, s1 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 ; GFX10-NEXT: s_mov_b32 s1, 0 -; 
GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 ; GFX10-NEXT: s_ashr_i32 s0, s9, 31 ; GFX10-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: s_addc_u32 s1, s0, 0 -; GFX10-NEXT: s_cselect_b32 s2, 1, 0 -; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX10-NEXT: s_and_b32 s2, s2, 1 +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_cmp_lg_u32 s2, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_addc_u32 s2, s0, 0 -; GFX10-NEXT: s_cselect_b32 s3, 1, 0 -; GFX10-NEXT: s_and_b32 s3, s3, 1 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10-NEXT: s_addc_u32 s3, s0, 0x80000000 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s3, vcc_lo -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-NEXT: v_readfirstlane_b32 s1, v2 +; GFX10-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: ; return to shader part epilog %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs) @@ -5527,19 +5431,10 @@ ; GFX6-LABEL: s_saddsat_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s8, s0, s8 -; GFX6-NEXT: s_cselect_b32 s16, 1, 0 -; GFX6-NEXT: s_and_b32 s16, s16, 1 -; GFX6-NEXT: 
s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_addc_u32 s9, s1, s9 -; GFX6-NEXT: s_cselect_b32 s16, 1, 0 -; GFX6-NEXT: s_and_b32 s16, s16, 1 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_addc_u32 s16, s2, s10 -; GFX6-NEXT: s_cselect_b32 s17, 1, 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s1 -; GFX6-NEXT: s_and_b32 s17, s17, 1 +; GFX6-NEXT: s_addc_u32 s9, s1, s9 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_cmp_lg_u32 s17, 0 +; GFX6-NEXT: s_addc_u32 s16, s2, s10 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] ; GFX6-NEXT: s_addc_u32 s17, s3, s11 @@ -5551,50 +5446,35 @@ ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], 0 -; GFX6-NEXT: s_brev_b32 s10, 1 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX6-NEXT: s_ashr_i32 s0, s17, 31 ; GFX6-NEXT: s_mov_b32 s1, 0 ; GFX6-NEXT: s_cmp_lg_u32 s1, 0 +; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_addc_u32 s1, s0, 0 -; GFX6-NEXT: s_cselect_b32 s2, 1, 0 -; GFX6-NEXT: s_and_b32 s2, s2, 1 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: s_brev_b32 s10, 1 ; GFX6-NEXT: s_addc_u32 s2, s0, 0 -; GFX6-NEXT: s_cselect_b32 s3, 1, 0 -; GFX6-NEXT: s_and_b32 s3, s3, 1 -; GFX6-NEXT: s_cmp_lg_u32 s3, 0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_addc_u32 s3, s0, s10 -; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_add_u32 s0, s4, s12 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: s_cselect_b32 s1, 1, 0 -; GFX6-NEXT: s_and_b32 s1, s1, 1 -; GFX6-NEXT: s_cmp_lg_u32 s1, 0 -; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_addc_u32 s1, s5, s13 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: s_cselect_b32 s2, 1, 0 -; GFX6-NEXT: s_and_b32 s2, s2, 1 ; GFX6-NEXT: v_mov_b32_e32 v3, s8 ; GFX6-NEXT: v_mov_b32_e32 v4, s9 -; GFX6-NEXT: 
s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_mov_b32_e32 v2, s16 ; GFX6-NEXT: v_mov_b32_e32 v3, s17 -; GFX6-NEXT: s_addc_u32 s2, s6, s14 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc -; GFX6-NEXT: s_cselect_b32 s3, 1, 0 +; GFX6-NEXT: s_add_u32 s0, s4, s12 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_and_b32 s3, s3, 1 +; GFX6-NEXT: s_addc_u32 s1, s5, s13 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NEXT: s_cmp_lg_u32 s3, 0 +; GFX6-NEXT: s_addc_u32 s2, s6, s14 ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: s_addc_u32 s3, s7, s15 @@ -5611,15 +5491,9 @@ ; GFX6-NEXT: s_ashr_i32 s4, s3, 31 ; GFX6-NEXT: s_mov_b32 s5, 0 ; GFX6-NEXT: s_cmp_lg_u32 s5, 0 +; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_addc_u32 s5, s4, 0 -; GFX6-NEXT: s_cselect_b32 s6, 1, 0 -; GFX6-NEXT: s_and_b32 s6, s6, 1 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 ; GFX6-NEXT: s_addc_u32 s6, s4, 0 -; GFX6-NEXT: s_cselect_b32 s7, 1, 0 -; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_and_b32 s7, s7, 1 -; GFX6-NEXT: s_cmp_lg_u32 s7, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_addc_u32 s7, s4, s10 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 @@ -5648,18 +5522,9 @@ ; GFX8-LABEL: s_saddsat_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s8, s0, s8 -; GFX8-NEXT: s_cselect_b32 s16, 1, 0 -; GFX8-NEXT: s_and_b32 s16, s16, 1 -; GFX8-NEXT: s_cmp_lg_u32 s16, 0 ; GFX8-NEXT: s_addc_u32 s9, s1, s9 -; GFX8-NEXT: s_cselect_b32 s16, 1, 0 -; GFX8-NEXT: s_and_b32 s16, s16, 1 -; GFX8-NEXT: s_cmp_lg_u32 s16, 0 -; GFX8-NEXT: s_addc_u32 s16, s2, s10 -; GFX8-NEXT: s_cselect_b32 s17, 1, 0 -; GFX8-NEXT: s_and_b32 s17, s17, 1 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_cmp_lg_u32 s17, 0 +; GFX8-NEXT: s_addc_u32 s16, s2, s10 ; GFX8-NEXT: 
v_mov_b32_e32 v2, s0 ; GFX8-NEXT: s_addc_u32 s17, s3, s11 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -5681,46 +5546,31 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX8-NEXT: s_ashr_i32 s0, s17, 31 ; GFX8-NEXT: s_mov_b32 s1, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_addc_u32 s1, s0, 0 -; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: s_and_b32 s2, s2, 1 -; GFX8-NEXT: s_cmp_lg_u32 s2, 0 -; GFX8-NEXT: s_addc_u32 s2, s0, 0 -; GFX8-NEXT: s_cselect_b32 s3, 1, 0 -; GFX8-NEXT: s_and_b32 s3, s3, 1 ; GFX8-NEXT: s_brev_b32 s10, 1 -; GFX8-NEXT: s_cmp_lg_u32 s3, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: s_addc_u32 s2, s0, 0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_addc_u32 s3, s0, s10 -; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: s_add_u32 s0, s4, s12 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_and_b32 s1, s1, 1 -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_addc_u32 s1, s5, s13 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: s_and_b32 s2, s2, 1 -; GFX8-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s8 ; GFX8-NEXT: v_mov_b32_e32 v4, s9 -; GFX8-NEXT: s_addc_u32 s2, s6, s14 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s16 ; GFX8-NEXT: v_mov_b32_e32 v3, s17 -; GFX8-NEXT: s_cselect_b32 s3, 1, 0 +; GFX8-NEXT: s_add_u32 s0, s4, s12 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc -; GFX8-NEXT: s_and_b32 s3, s3, 1 +; GFX8-NEXT: s_addc_u32 s1, s5, s13 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_cmp_lg_u32 s3, 
0 +; GFX8-NEXT: s_addc_u32 s2, s6, s14 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_addc_u32 s3, s7, s15 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -5742,17 +5592,11 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5] ; GFX8-NEXT: s_ashr_i32 s4, s3, 31 ; GFX8-NEXT: s_mov_b32 s5, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: s_cmp_lg_u32 s5, 0 +; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_addc_u32 s5, s4, 0 -; GFX8-NEXT: s_cselect_b32 s6, 1, 0 -; GFX8-NEXT: s_and_b32 s6, s6, 1 -; GFX8-NEXT: s_cmp_lg_u32 s6, 0 ; GFX8-NEXT: s_addc_u32 s6, s4, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: s_cselect_b32 s7, 1, 0 -; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: s_and_b32 s7, s7, 1 -; GFX8-NEXT: s_cmp_lg_u32 s7, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_addc_u32 s7, s4, s10 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 @@ -5781,18 +5625,9 @@ ; GFX9-LABEL: s_saddsat_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s8, s0, s8 -; GFX9-NEXT: s_cselect_b32 s16, 1, 0 -; GFX9-NEXT: s_and_b32 s16, s16, 1 -; GFX9-NEXT: s_cmp_lg_u32 s16, 0 ; GFX9-NEXT: s_addc_u32 s9, s1, s9 -; GFX9-NEXT: s_cselect_b32 s16, 1, 0 -; GFX9-NEXT: s_and_b32 s16, s16, 1 -; GFX9-NEXT: s_cmp_lg_u32 s16, 0 -; GFX9-NEXT: s_addc_u32 s16, s2, s10 -; GFX9-NEXT: s_cselect_b32 s17, 1, 0 -; GFX9-NEXT: s_and_b32 s17, s17, 1 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_addc_u32 s16, s2, s10 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_addc_u32 s17, s3, s11 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -5814,46 +5649,31 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX9-NEXT: s_ashr_i32 s0, s17, 31 ; GFX9-NEXT: s_mov_b32 s1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_addc_u32 s1, s0, 0 -; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: s_and_b32 s2, s2, 1 -; GFX9-NEXT: s_cmp_lg_u32 s2, 0 -; GFX9-NEXT: s_addc_u32 s2, s0, 0 -; GFX9-NEXT: 
s_cselect_b32 s3, 1, 0 -; GFX9-NEXT: s_and_b32 s3, s3, 1 ; GFX9-NEXT: s_brev_b32 s10, 1 -; GFX9-NEXT: s_cmp_lg_u32 s3, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: s_addc_u32 s2, s0, 0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_addc_u32 s3, s0, s10 -; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_add_u32 s0, s4, s12 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: s_and_b32 s1, s1, 1 -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_addc_u32 s1, s5, s13 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: s_and_b32 s2, s2, 1 -; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-NEXT: s_addc_u32 s2, s6, s14 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: s_cselect_b32 s3, 1, 0 +; GFX9-NEXT: s_add_u32 s0, s4, s12 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc -; GFX9-NEXT: s_and_b32 s3, s3, 1 +; GFX9-NEXT: s_addc_u32 s1, s5, s13 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: s_cmp_lg_u32 s3, 0 +; GFX9-NEXT: s_addc_u32 s2, s6, s14 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: s_addc_u32 s3, s7, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 @@ -5875,17 +5695,11 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5] ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: s_mov_b32 s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: s_cmp_lg_u32 s5, 0 +; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_addc_u32 s5, s4, 0 -; GFX9-NEXT: s_cselect_b32 s6, 1, 0 -; GFX9-NEXT: s_and_b32 s6, s6, 1 -; GFX9-NEXT: s_cmp_lg_u32 s6, 
0 ; GFX9-NEXT: s_addc_u32 s6, s4, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: s_cselect_b32 s7, 1, 0 -; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: s_and_b32 s7, s7, 1 -; GFX9-NEXT: s_cmp_lg_u32 s7, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_addc_u32 s7, s4, s10 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 @@ -5914,25 +5728,16 @@ ; GFX10-LABEL: s_saddsat_v2i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_add_u32 s8, s0, s8 -; GFX10-NEXT: s_cselect_b32 s16, 1, 0 -; GFX10-NEXT: s_and_b32 s16, s16, 1 -; GFX10-NEXT: s_cmp_lg_u32 s16, 0 ; GFX10-NEXT: s_addc_u32 s9, s1, s9 -; GFX10-NEXT: s_cselect_b32 s16, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1] -; GFX10-NEXT: s_and_b32 s16, s16, 1 -; GFX10-NEXT: s_cmp_lg_u32 s16, 0 ; GFX10-NEXT: s_addc_u32 s16, s2, s10 -; GFX10-NEXT: s_cselect_b32 s17, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX10-NEXT: s_and_b32 s17, s17, 1 -; GFX10-NEXT: s_cmp_lg_u32 s17, 0 +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1] ; GFX10-NEXT: s_addc_u32 s17, s3, s11 -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3] ; GFX10-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] -; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0 +; GFX10-NEXT: v_mov_b32_e32 v5, s17 ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, s17 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s0, 1, s18 ; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 @@ -5940,91 +5745,70 @@ ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: s_and_b32 s1, 1, s1 -; GFX10-NEXT: s_brev_b32 s10, 1 +; GFX10-NEXT: s_ashr_i32 s2, s17, 31 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 -; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: s_brev_b32 s11, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 -; GFX10-NEXT: s_ashr_i32 
s0, s17, 31 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s9 -; GFX10-NEXT: s_addc_u32 s1, s0, 0 +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_addc_u32 s1, s2, 0 +; GFX10-NEXT: s_addc_u32 s10, s2, 0 +; GFX10-NEXT: s_addc_u32 s3, s2, s11 +; GFX10-NEXT: s_add_u32 s12, s4, s12 +; GFX10-NEXT: s_addc_u32 s13, s5, s13 +; GFX10-NEXT: s_addc_u32 s18, s6, s14 +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[12:13], s[4:5] +; GFX10-NEXT: s_addc_u32 s19, s7, s15 +; GFX10-NEXT: v_cmp_lt_i64_e64 s5, s[14:15], 0 +; GFX10-NEXT: s_cmp_eq_u64 s[18:19], s[6:7] ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX10-NEXT: s_cselect_b32 s2, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s8 -; GFX10-NEXT: s_and_b32 s2, s2, 1 -; GFX10-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[18:19], s[6:7] +; GFX10-NEXT: s_and_b32 s0, 1, s0 +; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: s_addc_u32 s2, s0, 0 -; GFX10-NEXT: s_cselect_b32 s3, 1, 0 -; GFX10-NEXT: s_and_b32 s3, s3, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: s_and_b32 s4, 1, s4 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_cmp_lg_u32 s3, 0 -; GFX10-NEXT: s_addc_u32 s3, s0, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo -; GFX10-NEXT: s_add_u32 s0, s4, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo -; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: v_mov_b32_e32 v2, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, v2, s0 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s9 +; GFX10-NEXT: v_mov_b32_e32 v6, s13 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 
v1, s2, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v7, s19 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, 0, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s1, vcc_lo +; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: s_ashr_i32 s0, s19, 31 +; GFX10-NEXT: v_xor_b32_e32 v2, v3, v2 ; GFX10-NEXT: s_cmp_lg_u32 s1, 0 -; GFX10-NEXT: s_addc_u32 s1, s5, s13 -; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s3, s[0:1], s[4:5] -; GFX10-NEXT: s_and_b32 s8, s8, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s3, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, s12 +; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX10-NEXT: s_addc_u32 s1, s0, 0 +; GFX10-NEXT: s_addc_u32 s2, s0, 0 +; GFX10-NEXT: s_addc_u32 s3, s0, s11 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s8, 0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[14:15], 0 -; GFX10-NEXT: s_addc_u32 s8, s6, s14 -; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s3 -; GFX10-NEXT: s_and_b32 s9, s9, 1 -; GFX10-NEXT: v_mov_b32_e32 v7, s8 -; GFX10-NEXT: s_cmp_lg_u32 s9, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4 -; GFX10-NEXT: s_addc_u32 s9, s7, s15 -; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[6:7] -; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[8:9], s[6:7] -; GFX10-NEXT: s_cselect_b32 s2, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v8, s9 -; GFX10-NEXT: s_and_b32 s2, 1, s2 -; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3 -; GFX10-NEXT: s_cselect_b32 s3, 1, 0 -; GFX10-NEXT: s_and_b32 s3, 1, s3 -; GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0, s3 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo -; GFX10-NEXT: s_mov_b32 s3, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, 0, s2 -; 
GFX10-NEXT: s_ashr_i32 s2, s9, 31 -; GFX10-NEXT: s_cmp_lg_u32 s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v6, s1 -; GFX10-NEXT: s_addc_u32 s3, s2, 0 -; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4 -; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX10-NEXT: s_addc_u32 s4, s2, 0 -; GFX10-NEXT: s_cselect_b32 s5, 1, 0 -; GFX10-NEXT: s_and_b32 s5, s5, 1 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX10-NEXT: s_cmp_lg_u32 s5, 0 -; GFX10-NEXT: s_addc_u32 s1, s2, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v7, v8, s1, vcc_lo -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10-NEXT: v_readfirstlane_b32 s3, v3 -; GFX10-NEXT: v_readfirstlane_b32 s4, v4 -; GFX10-NEXT: v_readfirstlane_b32 s5, v5 -; GFX10-NEXT: v_readfirstlane_b32 s6, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s3, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-NEXT: v_readfirstlane_b32 s2, v3 +; GFX10-NEXT: v_readfirstlane_b32 s3, v4 +; GFX10-NEXT: v_readfirstlane_b32 s4, v5 +; GFX10-NEXT: v_readfirstlane_b32 s5, v6 +; GFX10-NEXT: v_readfirstlane_b32 s6, v2 ; GFX10-NEXT: v_readfirstlane_b32 s7, v7 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -208,14 +208,8 @@ ; CHECK-NEXT: s_ashr_i32 s6, s3, 31 ; CHECK-NEXT: s_ashr_i32 s8, s5, 31 ; CHECK-NEXT: s_add_u32 s0, s2, s6 -; CHECK-NEXT: s_cselect_b32 s1, 1, 0 
-; CHECK-NEXT: s_and_b32 s1, s1, 1 -; CHECK-NEXT: s_cmp_lg_u32 s1, 0 ; CHECK-NEXT: s_addc_u32 s1, s3, s6 ; CHECK-NEXT: s_add_u32 s10, s4, s8 -; CHECK-NEXT: s_cselect_b32 s3, 1, 0 -; CHECK-NEXT: s_and_b32 s3, s3, 1 -; CHECK-NEXT: s_cmp_lg_u32 s3, 0 ; CHECK-NEXT: s_mov_b32 s9, s8 ; CHECK-NEXT: s_addc_u32 s11, s5, s8 ; CHECK-NEXT: s_xor_b64 s[10:11], s[10:11], s[8:9] @@ -226,21 +220,18 @@ ; CHECK-NEXT: s_sub_u32 s0, 0, s10 ; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; CHECK-NEXT: s_cselect_b32 s1, 1, 0 -; CHECK-NEXT: s_and_b32 s1, s1, 1 -; CHECK-NEXT: s_cmp_lg_u32 s1, 0 +; CHECK-NEXT: s_subb_u32 s1, 0, s11 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; CHECK-NEXT: v_trunc_f32_e32 v1, v1 ; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CHECK-NEXT: s_subb_u32 s1, 0, s11 -; CHECK-NEXT: v_mul_lo_u32 v3, s0, v1 -; CHECK-NEXT: v_mul_lo_u32 v2, s1, v0 +; CHECK-NEXT: v_mul_lo_u32 v2, s0, v1 +; CHECK-NEXT: v_mul_lo_u32 v3, s1, v0 ; CHECK-NEXT: v_mul_hi_u32 v5, s0, v0 ; CHECK-NEXT: v_mul_lo_u32 v4, s0, v0 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_mul_lo_u32 v3, v1, v4 ; CHECK-NEXT: v_mul_lo_u32 v5, v0, v2 @@ -1196,43 +1187,38 @@ ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_movk_i32 s10, 0x1000 -; GISEL-NEXT: s_add_u32 s4, s10, 0 -; GISEL-NEXT: s_cselect_b32 s5, 1, 0 -; GISEL-NEXT: s_and_b32 s5, s5, 1 ; GISEL-NEXT: s_mov_b32 s6, 0 -; GISEL-NEXT: s_cmp_lg_u32 s5, 0 +; GISEL-NEXT: s_add_u32 s4, s10, 0 ; GISEL-NEXT: s_mov_b32 s7, s6 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s9 ; GISEL-NEXT: s_sub_u32 s4, 0, s8 -; GISEL-NEXT: s_cselect_b32 s5, 1, 0 -; 
GISEL-NEXT: s_and_b32 s5, s5, 1 -; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GISEL-NEXT: s_subb_u32 s5, 0, s9 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; GISEL-NEXT: v_trunc_f32_e32 v6, v6 ; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 ; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5 ; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 ; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 ; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9 ; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7 ; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 @@ -1256,7 +1242,6 @@ ; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 ; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 ; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9 @@ -1327,15 +1312,12 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; GISEL-NEXT: s_add_u32 s4, s10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GISEL-NEXT: 
v_cmp_eq_u32_e32 vcc, s9, v1 -; GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v8 -; GISEL-NEXT: s_and_b32 s5, s5, 1 ; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc -; GISEL-NEXT: s_cmp_lg_u32 s5, 0 +; GISEL-NEXT: s_add_u32 s4, s10, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc @@ -1347,25 +1329,22 @@ ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s6 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s7 ; GISEL-NEXT: s_sub_u32 s4, 0, s6 -; GISEL-NEXT: s_cselect_b32 s5, 1, 0 -; GISEL-NEXT: s_and_b32 s5, s5, 1 -; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GISEL-NEXT: s_subb_u32 s5, 0, s7 ; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; GISEL-NEXT: v_trunc_f32_e32 v6, v6 ; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5 ; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 ; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -1912,43 +1891,38 @@ ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b32 s10, 0x12d8fb -; GISEL-NEXT: s_add_u32 s4, s10, 0 -; GISEL-NEXT: s_cselect_b32 s5, 1, 0 -; GISEL-NEXT: s_and_b32 s5, s5, 1 ; GISEL-NEXT: s_mov_b32 s6, 0 -; GISEL-NEXT: 
s_cmp_lg_u32 s5, 0 +; GISEL-NEXT: s_add_u32 s4, s10, 0 ; GISEL-NEXT: s_mov_b32 s7, s6 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s9 ; GISEL-NEXT: s_sub_u32 s4, 0, s8 -; GISEL-NEXT: s_cselect_b32 s5, 1, 0 -; GISEL-NEXT: s_and_b32 s5, s5, 1 -; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GISEL-NEXT: s_subb_u32 s5, 0, s9 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; GISEL-NEXT: v_trunc_f32_e32 v6, v6 ; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 ; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5 ; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 ; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 ; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9 ; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7 ; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 @@ -1972,7 +1946,6 @@ ; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 ; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 ; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, 
vcc, v7, v10 ; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9 @@ -2043,15 +2016,12 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; GISEL-NEXT: s_add_u32 s4, s10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 -; GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v8 -; GISEL-NEXT: s_and_b32 s5, s5, 1 ; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc -; GISEL-NEXT: s_cmp_lg_u32 s5, 0 +; GISEL-NEXT: s_add_u32 s4, s10, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc @@ -2063,25 +2033,22 @@ ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s6 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s7 ; GISEL-NEXT: s_sub_u32 s4, 0, s6 -; GISEL-NEXT: s_cselect_b32 s5, 1, 0 -; GISEL-NEXT: s_and_b32 s5, s5, 1 -; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GISEL-NEXT: s_subb_u32 s5, 0, s7 ; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; GISEL-NEXT: v_trunc_f32_e32 v6, v6 ; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5 ; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 ; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 ; GISEL-NEXT: 
v_add_i32_e32 v2, vcc, v2, v4 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -150,14 +150,8 @@ ; GFX8-NEXT: s_ashr_i32 s2, s9, 31 ; GFX8-NEXT: s_ashr_i32 s12, s11, 31 ; GFX8-NEXT: s_add_u32 s0, s8, s2 -; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_and_b32 s1, s1, 1 -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 ; GFX8-NEXT: s_addc_u32 s1, s9, s2 ; GFX8-NEXT: s_add_u32 s8, s10, s12 -; GFX8-NEXT: s_cselect_b32 s3, 1, 0 -; GFX8-NEXT: s_and_b32 s3, s3, 1 -; GFX8-NEXT: s_cmp_lg_u32 s3, 0 ; GFX8-NEXT: s_mov_b32 s13, s12 ; GFX8-NEXT: s_addc_u32 s9, s11, s12 ; GFX8-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] @@ -169,8 +163,7 @@ ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_sub_u32 s0, 0, s8 -; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_and_b32 s1, s1, 1 +; GFX8-NEXT: s_subb_u32 s1, 0, s9 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX8-NEXT: v_trunc_f32_e32 v1, v1 @@ -178,8 +171,6 @@ ; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 -; GFX8-NEXT: s_subb_u32 s1, 0, s9 ; GFX8-NEXT: v_mul_lo_u32 v2, s0, v1 ; GFX8-NEXT: v_mul_lo_u32 v3, s1, v0 ; GFX8-NEXT: v_mul_hi_u32 v5, s0, v0 @@ -329,14 +320,8 @@ ; GFX9-NEXT: s_ashr_i32 s2, s9, 31 ; GFX9-NEXT: s_ashr_i32 s12, s11, 31 ; GFX9-NEXT: s_add_u32 s0, s8, s2 -; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: s_and_b32 s1, s1, 1 -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 ; GFX9-NEXT: s_addc_u32 s1, s9, s2 ; GFX9-NEXT: s_add_u32 s8, s10, s12 -; GFX9-NEXT: s_cselect_b32 s3, 1, 0 -; GFX9-NEXT: s_and_b32 s3, s3, 1 -; GFX9-NEXT: s_cmp_lg_u32 s3, 0 ; GFX9-NEXT: s_mov_b32 s13, s12 ; GFX9-NEXT: s_addc_u32 s9, s11, s12 ; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] @@ -348,8 +333,8 @@ ; 
GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_u32 s0, 0, s8 -; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: s_and_b32 s1, s1, 1 +; GFX9-NEXT: s_subb_u32 s1, 0, s9 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 @@ -357,27 +342,24 @@ ; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 -; GFX9-NEXT: s_subb_u32 s1, 0, s9 ; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 ; GFX9-NEXT: v_mul_lo_u32 v3, s1, v0 ; GFX9-NEXT: v_mul_hi_u32 v4, s0, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v8, s11 ; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4 ; GFX9-NEXT: v_mul_lo_u32 v3, v1, v5 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, v5 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 ; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v1, v2 -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 +; GFX9-NEXT: v_add_u32_e32 v3, v6, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc @@ -499,27 +481,18 @@ ; GFX10-NEXT: s_ashr_i32 s2, s9, 31 ; GFX10-NEXT: s_ashr_i32 s12, s11, 31 ; GFX10-NEXT: s_add_u32 s0, s8, s2 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: 
s_mov_b32 s13, s12 -; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_addc_u32 s1, s9, s2 ; GFX10-NEXT: s_add_u32 s8, s10, s12 -; GFX10-NEXT: s_cselect_b32 s3, 1, 0 -; GFX10-NEXT: s_and_b32 s3, s3, 1 -; GFX10-NEXT: s_cmp_lg_u32 s3, 0 -; GFX10-NEXT: s_mov_b32 s3, s2 +; GFX10-NEXT: s_mov_b32 s13, s12 ; GFX10-NEXT: s_addc_u32 s9, s11, s12 -; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: s_mov_b32 s3, s2 ; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s8 ; GFX10-NEXT: s_sub_u32 s10, 0, s8 -; GFX10-NEXT: s_cselect_b32 s11, 1, 0 -; GFX10-NEXT: s_and_b32 s11, s11, 1 -; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: s_subb_u32 s11, 0, s9 +; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1335,14 +1308,8 @@ ; GFX8-NEXT: s_ashr_i32 s2, s9, 31 ; GFX8-NEXT: s_ashr_i32 s6, s13, 31 ; GFX8-NEXT: s_add_u32 s0, s8, s2 -; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_and_b32 s1, s1, 1 -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 ; GFX8-NEXT: s_addc_u32 s1, s9, s2 ; GFX8-NEXT: s_add_u32 s8, s12, s6 -; GFX8-NEXT: s_cselect_b32 s3, 1, 0 -; GFX8-NEXT: s_and_b32 s3, s3, 1 -; GFX8-NEXT: s_cmp_lg_u32 s3, 0 ; GFX8-NEXT: s_mov_b32 s7, s6 ; GFX8-NEXT: s_addc_u32 s9, s13, s6 ; GFX8-NEXT: s_xor_b64 s[8:9], s[8:9], s[6:7] @@ -1354,8 +1321,7 @@ ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_sub_u32 s0, 0, s8 -; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_and_b32 s1, s1, 1 +; GFX8-NEXT: s_subb_u32 s1, 0, s9 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX8-NEXT: v_trunc_f32_e32 v1, v1 @@ -1363,8 +1329,6 @@ ; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX8-NEXT: 
v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 -; GFX8-NEXT: s_subb_u32 s1, 0, s9 ; GFX8-NEXT: v_mul_lo_u32 v2, s0, v1 ; GFX8-NEXT: v_mul_lo_u32 v3, s1, v0 ; GFX8-NEXT: v_mul_hi_u32 v5, s0, v0 @@ -1496,14 +1460,8 @@ ; GFX8-NEXT: s_add_u32 s0, s10, s6 ; GFX8-NEXT: v_xor_b32_e32 v1, s1, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_and_b32 s1, s1, 1 -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 ; GFX8-NEXT: s_addc_u32 s1, s11, s6 ; GFX8-NEXT: s_add_u32 s10, s14, s8 -; GFX8-NEXT: s_cselect_b32 s3, 1, 0 -; GFX8-NEXT: s_and_b32 s3, s3, 1 -; GFX8-NEXT: s_cmp_lg_u32 s3, 0 ; GFX8-NEXT: s_mov_b32 s9, s8 ; GFX8-NEXT: s_addc_u32 s11, s15, s8 ; GFX8-NEXT: s_xor_b64 s[10:11], s[10:11], s[8:9] @@ -1516,8 +1474,8 @@ ; GFX8-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GFX8-NEXT: s_sub_u32 s0, 0, s10 -; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_and_b32 s1, s1, 1 +; GFX8-NEXT: s_subb_u32 s1, 0, s11 +; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 ; GFX8-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GFX8-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4 ; GFX8-NEXT: v_trunc_f32_e32 v6, v6 @@ -1525,17 +1483,14 @@ ; GFX8-NEXT: v_add_f32_e32 v4, v7, v4 ; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v4 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 -; GFX8-NEXT: s_subb_u32 s1, 0, s11 +; GFX8-NEXT: v_xor_b32_e32 v2, s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s2 ; GFX8-NEXT: v_mul_lo_u32 v4, s1, v7 ; GFX8-NEXT: v_mul_lo_u32 v8, s0, v6 ; GFX8-NEXT: v_mul_hi_u32 v10, s0, v7 ; GFX8-NEXT: v_mul_lo_u32 v9, s0, v7 -; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v10 -; GFX8-NEXT: v_xor_b32_e32 v2, s2, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s2 ; GFX8-NEXT: v_mul_lo_u32 v10, v6, v9 ; GFX8-NEXT: v_mul_lo_u32 v11, v7, v8 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s2, v3 @@ -1683,14 +1638,8 @@ ; GFX9-NEXT: s_ashr_i32 s2, s9, 31 ; GFX9-NEXT: 
s_ashr_i32 s6, s13, 31 ; GFX9-NEXT: s_add_u32 s0, s8, s2 -; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: s_and_b32 s1, s1, 1 -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 ; GFX9-NEXT: s_addc_u32 s1, s9, s2 ; GFX9-NEXT: s_add_u32 s8, s12, s6 -; GFX9-NEXT: s_cselect_b32 s3, 1, 0 -; GFX9-NEXT: s_and_b32 s3, s3, 1 -; GFX9-NEXT: s_cmp_lg_u32 s3, 0 ; GFX9-NEXT: s_mov_b32 s7, s6 ; GFX9-NEXT: s_addc_u32 s9, s13, s6 ; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[6:7] @@ -1702,8 +1651,7 @@ ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_u32 s0, 0, s8 -; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: s_and_b32 s1, s1, 1 +; GFX9-NEXT: s_subb_u32 s1, 0, s9 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 @@ -1711,27 +1659,24 @@ ; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 -; GFX9-NEXT: s_subb_u32 s1, 0, s9 ; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 ; GFX9-NEXT: v_mul_lo_u32 v3, s1, v0 ; GFX9-NEXT: v_mul_hi_u32 v4, s0, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v7, s13 ; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4 ; GFX9-NEXT: v_mul_lo_u32 v3, v1, v5 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, v5 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 ; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v1, v2 -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; 
GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 +; GFX9-NEXT: v_add_u32_e32 v3, v6, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc @@ -1745,6 +1690,7 @@ ; GFX9-NEXT: v_mul_lo_u32 v3, s0, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s0, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 ; GFX9-NEXT: v_add3_u32 v2, v2, v3, v4 ; GFX9-NEXT: v_mul_lo_u32 v3, v1, v5 ; GFX9-NEXT: v_mul_lo_u32 v4, v0, v2 @@ -1826,14 +1772,8 @@ ; GFX9-NEXT: s_ashr_i32 s6, s11, 31 ; GFX9-NEXT: s_ashr_i32 s8, s15, 31 ; GFX9-NEXT: s_add_u32 s12, s10, s6 -; GFX9-NEXT: s_cselect_b32 s3, 1, 0 -; GFX9-NEXT: s_and_b32 s3, s3, 1 -; GFX9-NEXT: s_cmp_lg_u32 s3, 0 ; GFX9-NEXT: s_addc_u32 s13, s11, s6 ; GFX9-NEXT: s_add_u32 s10, s14, s8 -; GFX9-NEXT: s_cselect_b32 s3, 1, 0 -; GFX9-NEXT: s_and_b32 s3, s3, 1 -; GFX9-NEXT: s_cmp_lg_u32 s3, 0 ; GFX9-NEXT: s_mov_b32 s9, s8 ; GFX9-NEXT: s_addc_u32 s11, s15, s8 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 @@ -1858,14 +1798,11 @@ ; GFX9-NEXT: v_add_f32_e32 v4, v6, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX9-NEXT: s_cselect_b32 s14, 1, 0 -; GFX9-NEXT: s_and_b32 s14, s14, 1 -; GFX9-NEXT: s_cmp_lg_u32 s14, 0 ; GFX9-NEXT: s_subb_u32 s14, 0, s11 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc ; GFX9-NEXT: v_mul_lo_u32 v6, s14, v4 ; GFX9-NEXT: v_mul_lo_u32 v7, s3, v5 ; GFX9-NEXT: v_mul_hi_u32 v8, s3, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc ; GFX9-NEXT: v_mul_lo_u32 v9, s3, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 @@ -2015,321 +1952,303 @@ ; GFX10-NEXT: s_ashr_i32 s2, s9, 31 ; GFX10-NEXT: s_ashr_i32 s6, s13, 31 ; GFX10-NEXT: s_add_u32 s0, s8, s2 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_mov_b32 s7, s6 -; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_addc_u32 s1, s9, s2 ; GFX10-NEXT: 
s_add_u32 s8, s12, s6 -; GFX10-NEXT: s_cselect_b32 s3, 1, 0 -; GFX10-NEXT: s_and_b32 s3, s3, 1 -; GFX10-NEXT: s_cmp_lg_u32 s3, 0 -; GFX10-NEXT: s_mov_b32 s3, s2 +; GFX10-NEXT: s_mov_b32 s7, s6 ; GFX10-NEXT: s_addc_u32 s9, s13, s6 -; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: s_mov_b32 s3, s2 ; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[6:7] +; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX10-NEXT: s_sub_u32 s20, 0, s8 -; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX10-NEXT: s_and_b32 s12, s12, 1 -; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 ; GFX10-NEXT: s_subb_u32 s21, 0, s9 ; GFX10-NEXT: s_ashr_i32 s12, s11, 31 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX10-NEXT: s_xor_b64 s[18:19], s[2:3], s[6:7] ; GFX10-NEXT: s_ashr_i32 s16, s15, 31 -; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 ; GFX10-NEXT: s_add_u32 s6, s10, s12 -; GFX10-NEXT: s_cselect_b32 s3, 1, 0 -; GFX10-NEXT: s_mov_b32 s17, s16 -; GFX10-NEXT: s_and_b32 s3, s3, 1 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_cmp_lg_u32 s3, 0 -; GFX10-NEXT: s_mov_b32 s13, s12 ; GFX10-NEXT: s_addc_u32 s7, s11, s12 ; GFX10-NEXT: s_add_u32 s10, s14, s16 -; GFX10-NEXT: s_cselect_b32 s3, 1, 0 -; GFX10-NEXT: s_and_b32 s3, s3, 1 -; GFX10-NEXT: s_cmp_lg_u32 s3, 0 -; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX10-NEXT: s_mov_b32 s17, s16 ; GFX10-NEXT: s_addc_u32 s11, s15, s16 -; GFX10-NEXT: s_xor_b64 s[14:15], s[6:7], s[12:13] +; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10-NEXT: s_xor_b64 s[10:11], s[10:11], s[16:17] +; GFX10-NEXT: s_mov_b32 s13, s12 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GFX10-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s10 +; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s10 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: s_xor_b64 s[14:15], s[6:7], s[12:13] ; GFX10-NEXT: s_sub_u32 s3, 0, 
s10 -; GFX10-NEXT: s_cselect_b32 s6, 1, 0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX10-NEXT: s_subb_u32 s6, 0, s11 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 ; GFX10-NEXT: v_trunc_f32_e32 v2, v2 -; GFX10-NEXT: s_and_b32 s6, s6, 1 -; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GFX10-NEXT: v_mul_f32_e32 v3, 0xcf800000, v2 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX10-NEXT: s_subb_u32 s6, 0, s11 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX10-NEXT: v_mul_f32_e32 v4, 0x2f800000, v1 ; GFX10-NEXT: v_add_f32_e32 v0, v3, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, s20, v2 +; GFX10-NEXT: v_mul_lo_u32 v5, s20, v2 +; GFX10-NEXT: v_trunc_f32_e32 v3, v4 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GFX10-NEXT: v_mul_lo_u32 v4, s21, v0 -; GFX10-NEXT: v_mul_hi_u32 v5, s20, v0 -; GFX10-NEXT: v_mul_lo_u32 v6, s20, v0 -; GFX10-NEXT: v_mul_f32_e32 v7, 0x2f800000, v1 -; GFX10-NEXT: v_add3_u32 v3, v4, v3, v5 -; GFX10-NEXT: v_trunc_f32_e32 v4, v7 -; GFX10-NEXT: v_mul_lo_u32 v5, v2, v6 -; GFX10-NEXT: v_mul_hi_u32 v7, v0, v6 -; GFX10-NEXT: v_mul_hi_u32 v6, v2, v6 -; GFX10-NEXT: v_mul_lo_u32 v8, v0, v3 -; GFX10-NEXT: v_mul_lo_u32 v10, v2, v3 -; GFX10-NEXT: v_mul_f32_e32 v9, 0xcf800000, v4 -; GFX10-NEXT: v_mul_hi_u32 v11, v0, v3 -; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX10-NEXT: v_add_f32_e32 v1, v9, v1 -; GFX10-NEXT: v_add_co_u32 v5, s7, v5, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s7 -; GFX10-NEXT: v_add_co_u32 v6, s7, v10, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s7 -; GFX10-NEXT: v_add_co_u32 v5, s7, v5, v7 +; GFX10-NEXT: v_mul_f32_e32 v4, 0xcf800000, v3 +; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_lo_u32 v6, s21, v0 +; GFX10-NEXT: v_mul_hi_u32 v7, s20, v0 +; 
GFX10-NEXT: v_add_f32_e32 v1, v4, v1 +; GFX10-NEXT: v_mul_lo_u32 v4, s20, v0 +; GFX10-NEXT: v_mul_lo_u32 v8, s3, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s7 -; GFX10-NEXT: v_mul_lo_u32 v9, s3, v4 +; GFX10-NEXT: v_add3_u32 v5, v6, v5, v7 +; GFX10-NEXT: v_mul_lo_u32 v6, v2, v4 +; GFX10-NEXT: v_mul_lo_u32 v7, s6, v1 +; GFX10-NEXT: v_mul_hi_u32 v9, s3, v1 +; GFX10-NEXT: v_mul_lo_u32 v12, v0, v5 +; GFX10-NEXT: v_mul_hi_u32 v11, v0, v4 +; GFX10-NEXT: v_mul_hi_u32 v4, v2, v4 +; GFX10-NEXT: v_mul_lo_u32 v13, v2, v5 +; GFX10-NEXT: v_mul_lo_u32 v10, s3, v1 +; GFX10-NEXT: v_mul_hi_u32 v14, v0, v5 +; GFX10-NEXT: v_mul_hi_u32 v5, v2, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v8, v9 +; GFX10-NEXT: v_add_co_u32 v6, s7, v6, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s7 +; GFX10-NEXT: v_add_co_u32 v4, s7, v13, v4 +; GFX10-NEXT: v_mul_lo_u32 v8, v3, v10 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s7 +; GFX10-NEXT: v_mul_lo_u32 v15, v1, v7 ; GFX10-NEXT: v_add_co_u32 v6, s7, v6, v11 -; GFX10-NEXT: v_mul_lo_u32 v12, s6, v1 -; GFX10-NEXT: v_mul_hi_u32 v13, s3, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v8, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s7 -; GFX10-NEXT: v_mul_lo_u32 v11, s3, v1 -; GFX10-NEXT: v_add_co_u32 v5, s7, v6, v5 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v10, v7 +; GFX10-NEXT: v_mul_hi_u32 v9, v1, v10 +; GFX10-NEXT: v_mul_hi_u32 v10, v3, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s7 -; GFX10-NEXT: v_add3_u32 v8, v12, v9, v13 -; GFX10-NEXT: v_mul_lo_u32 v9, v4, v11 -; GFX10-NEXT: v_mul_hi_u32 v10, v1, v11 -; GFX10-NEXT: v_mul_hi_u32 v11, v4, v11 -; GFX10-NEXT: v_add3_u32 v3, v7, v6, v3 -; GFX10-NEXT: v_mul_lo_u32 v6, v1, v8 -; GFX10-NEXT: v_mul_lo_u32 v7, v4, v8 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v3, vcc_lo -; GFX10-NEXT: v_mul_hi_u32 v5, v1, v8 -; GFX10-NEXT: v_mul_lo_u32 v12, s21, v0 -; GFX10-NEXT: v_add_co_u32 v6, s7, v9, v6 -; GFX10-NEXT: v_mul_hi_u32 v13, s20, 
v0 -; GFX10-NEXT: v_mul_lo_u32 v14, s20, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s7 -; GFX10-NEXT: v_add_co_u32 v7, s7, v7, v11 +; GFX10-NEXT: v_add_co_u32 v4, s7, v4, v14 +; GFX10-NEXT: v_mul_lo_u32 v14, v3, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s7 -; GFX10-NEXT: v_add_co_u32 v6, s7, v6, v10 -; GFX10-NEXT: v_mul_lo_u32 v3, s20, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v12, v6 +; GFX10-NEXT: v_add_co_u32 v8, s7, v8, v15 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s7 +; GFX10-NEXT: v_add_nc_u32_e32 v11, v13, v11 +; GFX10-NEXT: v_mul_hi_u32 v16, v1, v7 +; GFX10-NEXT: v_add_co_u32 v10, s7, v14, v10 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s7 +; GFX10-NEXT: v_add_co_u32 v4, s7, v4, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s7 -; GFX10-NEXT: v_add_co_u32 v5, s7, v7, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s7 -; GFX10-NEXT: v_add3_u32 v12, v12, v14, v13 -; GFX10-NEXT: v_add_nc_u32_e32 v6, v9, v6 -; GFX10-NEXT: v_mul_hi_u32 v8, v4, v8 -; GFX10-NEXT: v_mul_lo_u32 v10, v2, v3 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v11, v7 -; GFX10-NEXT: v_mul_lo_u32 v11, v0, v12 -; GFX10-NEXT: v_add_co_u32 v5, s7, v5, v6 -; GFX10-NEXT: v_mul_hi_u32 v9, v0, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s7 -; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX10-NEXT: v_mul_lo_u32 v13, v2, v12 -; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v5 -; GFX10-NEXT: v_add_co_u32 v5, s7, v10, v11 -; GFX10-NEXT: v_add3_u32 v6, v7, v6, v8 -; GFX10-NEXT: v_mul_hi_u32 v14, v0, v12 -; GFX10-NEXT: v_mul_lo_u32 v10, s6, v1 -; GFX10-NEXT: v_add_co_u32 v5, s6, v5, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s7 -; GFX10-NEXT: v_add_co_u32 v3, s7, v13, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6 -; GFX10-NEXT: v_mul_hi_u32 v11, s3, v1 -; GFX10-NEXT: v_add_co_u32 v3, s6, v3, v14 -; GFX10-NEXT: v_mul_lo_u32 v13, s3, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v7, v5 +; GFX10-NEXT: v_add_co_u32 v8, s7, v8, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 
0, 1, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s6 -; GFX10-NEXT: v_mul_hi_u32 v7, v2, v12 -; GFX10-NEXT: v_mul_lo_u32 v6, s3, v1 -; GFX10-NEXT: v_add_co_u32 v3, s3, v3, v5 -; GFX10-NEXT: v_add_nc_u32_e32 v8, v8, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3 -; GFX10-NEXT: v_add3_u32 v9, v10, v13, v11 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 -; GFX10-NEXT: v_mul_lo_u32 v10, v4, v6 -; GFX10-NEXT: v_add3_u32 v5, v8, v5, v7 -; GFX10-NEXT: v_mul_lo_u32 v7, v1, v9 -; GFX10-NEXT: v_mul_hi_u32 v11, v1, v6 -; GFX10-NEXT: v_mul_hi_u32 v6, v4, v6 -; GFX10-NEXT: v_mul_lo_u32 v8, v4, v9 +; GFX10-NEXT: v_add_co_u32 v9, s7, v10, v16 +; GFX10-NEXT: v_add3_u32 v5, v11, v6, v5 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v12, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s7 ; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo -; GFX10-NEXT: v_mul_hi_u32 v3, v1, v9 -; GFX10-NEXT: v_mul_lo_u32 v5, s1, v0 -; GFX10-NEXT: v_add_co_u32 v7, s3, v10, v7 -; GFX10-NEXT: v_mul_lo_u32 v13, s0, v2 +; GFX10-NEXT: v_mul_hi_u32 v7, v3, v7 +; GFX10-NEXT: v_add_co_u32 v4, s7, v9, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v13, v10 +; GFX10-NEXT: v_mul_lo_u32 v5, s20, v0 +; GFX10-NEXT: v_mul_lo_u32 v9, s21, v0 +; GFX10-NEXT: v_mul_hi_u32 v10, s20, v0 +; GFX10-NEXT: v_mul_lo_u32 v11, s20, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s7 +; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v4 +; GFX10-NEXT: v_mul_hi_u32 v4, v2, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v8, v7 +; GFX10-NEXT: v_mul_lo_u32 v7, v2, v5 +; GFX10-NEXT: v_mul_hi_u32 v8, v0, v5 +; GFX10-NEXT: v_add3_u32 v5, v9, v11, v10 +; GFX10-NEXT: v_mul_lo_u32 v9, s6, v1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX10-NEXT: v_mul_hi_u32 v10, s3, v1 +; GFX10-NEXT: v_mul_lo_u32 v12, v0, v5 +; GFX10-NEXT: v_mul_lo_u32 v13, v2, v5 +; GFX10-NEXT: v_mul_lo_u32 v11, s3, v3 +; GFX10-NEXT: v_mul_lo_u32 v6, s3, v1 +; GFX10-NEXT: v_mul_hi_u32 v14, v0, v5 +; GFX10-NEXT: v_mul_hi_u32 v5, 
v2, v5 +; GFX10-NEXT: v_add_co_u32 v7, s3, v7, v12 +; GFX10-NEXT: v_add3_u32 v9, v9, v11, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s3 -; GFX10-NEXT: v_add_co_u32 v6, s3, v8, v6 -; GFX10-NEXT: v_mul_hi_u32 v12, s0, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, s1, v0 -; GFX10-NEXT: v_mul_lo_u32 v14, s1, v2 +; GFX10-NEXT: v_add_co_u32 v4, s3, v13, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s3 +; GFX10-NEXT: v_add_co_u32 v7, s3, v7, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s3 +; GFX10-NEXT: v_add_co_u32 v4, s3, v4, v14 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s3 -; GFX10-NEXT: v_add_co_u32 v7, s3, v7, v11 +; GFX10-NEXT: v_mul_lo_u32 v15, v3, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v7, v10, v7 +; GFX10-NEXT: v_mul_lo_u32 v12, v1, v9 +; GFX10-NEXT: v_mul_hi_u32 v16, v1, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v8, v11, v8 +; GFX10-NEXT: v_mul_hi_u32 v6, v3, v6 +; GFX10-NEXT: v_add_co_u32 v4, s3, v4, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s3 -; GFX10-NEXT: v_add_co_u32 v3, s3, v6, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s3 -; GFX10-NEXT: v_add_co_u32 v5, s3, v5, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s3 -; GFX10-NEXT: v_add_co_u32 v0, s3, v14, v0 -; GFX10-NEXT: v_mul_hi_u32 v15, s0, v2 +; GFX10-NEXT: v_mul_lo_u32 v13, v3, v9 +; GFX10-NEXT: v_mul_hi_u32 v10, v1, v9 +; GFX10-NEXT: v_add_co_u32 v11, s3, v15, v12 +; GFX10-NEXT: v_add3_u32 v5, v8, v7, v5 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s3 +; GFX10-NEXT: v_add_co_u32 v6, s3, v13, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s3 -; GFX10-NEXT: v_add_co_u32 v5, s3, v5, v12 +; GFX10-NEXT: v_add_co_u32 v7, s3, v11, v16 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s3 +; GFX10-NEXT: v_add_co_u32 v4, s3, v6, v10 +; GFX10-NEXT: v_mul_lo_u32 v6, s1, v0 +; GFX10-NEXT: v_mul_lo_u32 v8, s0, v2 +; GFX10-NEXT: v_mul_hi_u32 v10, s1, v0 +; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX10-NEXT: v_mul_lo_u32 v11, s1, 
v2 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3 +; GFX10-NEXT: v_add_nc_u32_e32 v7, v12, v7 +; GFX10-NEXT: v_mul_hi_u32 v12, s0, v2 +; GFX10-NEXT: v_mul_hi_u32 v9, v3, v9 +; GFX10-NEXT: v_add_co_u32 v6, s3, v6, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s3 +; GFX10-NEXT: v_add_co_u32 v10, s3, v11, v10 +; GFX10-NEXT: v_add_co_u32 v0, s6, v6, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s3 +; GFX10-NEXT: v_add_co_u32 v10, s3, v10, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s3 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v8, v0 +; GFX10-NEXT: v_add_co_u32 v4, s3, v4, v7 ; GFX10-NEXT: v_mul_hi_u32 v2, s1, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v10, v7 -; GFX10-NEXT: v_add_nc_u32_e32 v6, v8, v6 -; GFX10-NEXT: v_add_co_u32 v0, s3, v0, v15 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v11, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s3 -; GFX10-NEXT: v_mul_hi_u32 v9, v4, v9 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX10-NEXT: v_add_co_u32 v0, s3, v0, v5 -; GFX10-NEXT: v_add_nc_u32_e32 v8, v13, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3 -; GFX10-NEXT: v_add_co_u32 v3, s3, v3, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s3 -; GFX10-NEXT: v_add3_u32 v2, v8, v5, v2 -; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 -; GFX10-NEXT: v_add3_u32 v5, v6, v7, v9 -; GFX10-NEXT: v_mul_lo_u32 v6, s9, v0 -; GFX10-NEXT: v_mul_hi_u32 v7, s8, v0 -; GFX10-NEXT: v_mul_lo_u32 v9, s8, v2 -; GFX10-NEXT: v_mul_lo_u32 v3, s8, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo -; GFX10-NEXT: v_mul_lo_u32 v5, s15, v1 -; GFX10-NEXT: v_mul_hi_u32 v10, s15, v1 -; GFX10-NEXT: v_mul_hi_u32 v1, s14, v1 -; GFX10-NEXT: v_mul_hi_u32 v17, s14, v4 -; GFX10-NEXT: v_add3_u32 v6, v6, v9, v7 -; GFX10-NEXT: v_sub_co_u32 v3, vcc_lo, s0, v3 -; GFX10-NEXT: v_mul_lo_u32 v7, s14, v4 -; GFX10-NEXT: v_mul_lo_u32 v9, s15, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v11, s1, v6 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v6, s0, s1, v6, vcc_lo -; GFX10-NEXT: 
v_cmp_le_u32_e64 s0, s8, v3 -; GFX10-NEXT: v_mul_hi_u32 v4, s15, v4 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s9, v11, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s9, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, v3, s8 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v15, s0, 0, v11, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v6 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s9, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v12, v13, v12, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, -1, s0 -; GFX10-NEXT: v_add_co_u32 v5, s0, v5, v7 +; GFX10-NEXT: v_add_co_u32 v0, s3, v10, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v13, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s3 +; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v7, v9 +; GFX10-NEXT: v_mul_lo_u32 v4, s9, v0 +; GFX10-NEXT: v_add3_u32 v2, v6, v8, v2 +; GFX10-NEXT: v_mul_lo_u32 v7, s15, v1 +; GFX10-NEXT: v_mul_lo_u32 v8, s8, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; GFX10-NEXT: v_mul_hi_u32 v5, s8, v0 +; GFX10-NEXT: v_mul_lo_u32 v6, s8, v2 +; GFX10-NEXT: v_mul_hi_u32 v9, s14, v1 +; GFX10-NEXT: v_mul_lo_u32 v11, s14, v3 +; GFX10-NEXT: v_mul_hi_u32 v1, s15, v1 +; GFX10-NEXT: v_mul_lo_u32 v12, s15, v3 +; GFX10-NEXT: v_mul_hi_u32 v13, s14, v3 +; GFX10-NEXT: v_mul_hi_u32 v3, s15, v3 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: v_add3_u32 v4, v4, v6, v5 +; GFX10-NEXT: v_sub_co_u32 v5, vcc_lo, s0, v8 +; GFX10-NEXT: v_add_co_u32 v6, s0, v7, v11 +; GFX10-NEXT: v_sub_nc_u32_e32 v8, s1, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v9, s0, v9, v10 -; GFX10-NEXT: v_add_co_u32 v1, s1, v5, v1 -; GFX10-NEXT: 
v_cndmask_b32_e64 v1, 0, 1, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v9, s0, v9, v17 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v17, s0, v0, 1 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v4, s0, s1, v4, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v5 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s9, v4 +; GFX10-NEXT: v_add_co_u32 v6, s1, v6, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v15, vcc_lo, v5, s8 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s0, 0, v8, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v7, v6 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v10, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v14, v11, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v15 +; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v16 +; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, -1, s0 +; GFX10-NEXT: v_add_co_u32 v1, s0, v12, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v13 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v13, s0, v0, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v18, s0, 0, v2, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v7, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v15 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v5, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v13, s0 -; GFX10-NEXT: v_add_co_u32 v9, s0, v9, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v16 +; GFX10-NEXT: v_add_nc_u32_e32 v9, v9, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v17, v14, s0 +; GFX10-NEXT: v_add_co_u32 v6, s0, v1, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v10, s0, v17, 1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v13, s0, 0, v18, s0 +; GFX10-NEXT: v_add_co_u32 
v12, s0, v13, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s0, 0, v18, s0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v7 -; GFX10-NEXT: v_add3_u32 v4, v5, v1, v4 -; GFX10-NEXT: v_sub_co_u32 v1, s0, v14, s8 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v5, s0, 0, v11, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v11, v18, v13, vcc_lo -; GFX10-NEXT: v_mul_lo_u32 v13, s11, v9 -; GFX10-NEXT: v_mul_lo_u32 v16, s10, v4 -; GFX10-NEXT: v_mul_hi_u32 v17, s10, v9 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX10-NEXT: v_add3_u32 v3, v9, v1, v3 +; GFX10-NEXT: v_sub_co_u32 v1, s0, v15, s8 +; GFX10-NEXT: v_mul_hi_u32 v17, s10, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v13, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v12, v18, v14, vcc_lo +; GFX10-NEXT: v_mul_lo_u32 v13, s11, v6 +; GFX10-NEXT: v_mul_lo_u32 v14, s10, v3 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v8, s0, 0, v8, s0 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v7 -; GFX10-NEXT: v_mul_lo_u32 v7, s10, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v14, v1, s0 -; GFX10-NEXT: v_add3_u32 v10, v13, v16, v17 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v15, v5, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo +; GFX10-NEXT: v_mul_lo_u32 v7, s10, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo +; GFX10-NEXT: v_add3_u32 v9, v13, v14, v17 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v15, v1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v16, v8, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v7, s0, s14, v7 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v11, s1, s15, v10, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v1, s15, v10 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v11, s1, s15, v9, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX10-NEXT: 
v_sub_nc_u32_e32 v1, s15, v9 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s11, v11 ; GFX10-NEXT: v_xor_b32_e32 v0, s18, v0 ; GFX10-NEXT: v_xor_b32_e32 v2, s19, v2 -; GFX10-NEXT: v_xor_b32_e32 v5, s2, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v10, vcc_lo, s11, v1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, vcc_lo, s11, v1, s0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s10, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v7, s10 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v14, s0, 0, v10, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v14, s0, 0, v9, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v0, s0, v0, s18 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s19, v2, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s11, v11 -; GFX10-NEXT: v_xor_b32_e32 v2, s2, v3 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, s11, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v12, s0 +; GFX10-NEXT: v_xor_b32_e32 v2, s2, v5 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v9, vcc_lo, s11, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, v12, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, -1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0 -; GFX10-NEXT: v_add_co_u32 v15, s0, v9, 1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s0, 0, v4, s0 +; GFX10-NEXT: v_add_co_u32 v15, s0, v6, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s0, 0, v3, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s11, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v12, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v12, s0 ; GFX10-NEXT: v_add_co_u32 v12, s0, v15, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v17, s0, 0, v16, s0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX10-NEXT: v_sub_co_u32 v6, s0, v13, s10 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v10, s0, 0, v10, s0 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 
0, v8 +; GFX10-NEXT: v_sub_co_u32 v8, s0, v13, s10 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v9, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v15, v12, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v6, v14, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v12, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v4, v15, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v11, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v13, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v8, v14, v9, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v9, s2, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v12, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v15, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v11, v8, s0 ; GFX10-NEXT: s_xor_b64 s[0:1], s[12:13], s[16:17] ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v2, s2 -; GFX10-NEXT: v_xor_b32_e32 v2, s0, v9 -; GFX10-NEXT: v_xor_b32_e32 v7, s1, v10 -; GFX10-NEXT: v_xor_b32_e32 v9, s12, v3 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s2, v5, vcc_lo -; GFX10-NEXT: v_xor_b32_e32 v10, s12, v6 +; GFX10-NEXT: v_xor_b32_e32 v2, s0, v6 +; GFX10-NEXT: v_xor_b32_e32 v3, s1, v3 +; GFX10-NEXT: v_xor_b32_e32 v6, s12, v7 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s2, v9, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v7, s12, v8 ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v2, s0 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v7, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v9, s12 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s12, v10, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v6, s12 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s12, v7, vcc_lo ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] -; 
GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] +; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7] ; GFX10-NEXT: s_endpgm %div = sdiv <2 x i64> %x, %y store <2 x i64> %div, <2 x i64> addrspace(1)* %out0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -204,14 +204,8 @@ ; CHECK-NEXT: s_ashr_i32 s6, s3, 31 ; CHECK-NEXT: s_ashr_i32 s0, s5, 31 ; CHECK-NEXT: s_add_u32 s10, s2, s6 -; CHECK-NEXT: s_cselect_b32 s7, 1, 0 -; CHECK-NEXT: s_and_b32 s7, s7, 1 -; CHECK-NEXT: s_cmp_lg_u32 s7, 0 ; CHECK-NEXT: s_addc_u32 s11, s3, s6 ; CHECK-NEXT: s_add_u32 s8, s4, s0 -; CHECK-NEXT: s_cselect_b32 s3, 1, 0 -; CHECK-NEXT: s_and_b32 s3, s3, 1 -; CHECK-NEXT: s_cmp_lg_u32 s3, 0 ; CHECK-NEXT: s_mov_b32 s1, s0 ; CHECK-NEXT: s_addc_u32 s9, s5, s0 ; CHECK-NEXT: s_xor_b64 s[8:9], s[8:9], s[0:1] @@ -222,21 +216,18 @@ ; CHECK-NEXT: s_sub_u32 s0, 0, s8 ; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; CHECK-NEXT: s_cselect_b32 s1, 1, 0 -; CHECK-NEXT: s_and_b32 s1, s1, 1 -; CHECK-NEXT: s_cmp_lg_u32 s1, 0 +; CHECK-NEXT: s_subb_u32 s1, 0, s9 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; CHECK-NEXT: v_trunc_f32_e32 v1, v1 ; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CHECK-NEXT: s_subb_u32 s1, 0, s9 -; CHECK-NEXT: v_mul_lo_u32 v3, s0, v1 -; CHECK-NEXT: v_mul_lo_u32 v2, s1, v0 +; CHECK-NEXT: v_mul_lo_u32 v2, s0, v1 +; CHECK-NEXT: v_mul_lo_u32 v3, s1, v0 ; CHECK-NEXT: v_mul_hi_u32 v5, s0, v0 ; CHECK-NEXT: v_mul_lo_u32 v4, s0, v0 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_mul_lo_u32 v3, 
v1, v4 ; CHECK-NEXT: v_mul_lo_u32 v5, v0, v2 @@ -1174,43 +1165,38 @@ ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_movk_i32 s10, 0x1000 -; GISEL-NEXT: s_add_u32 s4, s10, 0 -; GISEL-NEXT: s_cselect_b32 s5, 1, 0 -; GISEL-NEXT: s_and_b32 s5, s5, 1 ; GISEL-NEXT: s_mov_b32 s6, 0 -; GISEL-NEXT: s_cmp_lg_u32 s5, 0 +; GISEL-NEXT: s_add_u32 s4, s10, 0 ; GISEL-NEXT: s_mov_b32 s7, s6 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s9 ; GISEL-NEXT: s_sub_u32 s4, 0, s8 -; GISEL-NEXT: s_cselect_b32 s5, 1, 0 -; GISEL-NEXT: s_and_b32 s5, s5, 1 -; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GISEL-NEXT: s_subb_u32 s5, 0, s9 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; GISEL-NEXT: v_trunc_f32_e32 v6, v6 ; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 ; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5 ; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 ; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 ; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9 ; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7 ; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GISEL-NEXT: 
v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 @@ -1234,7 +1220,6 @@ ; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 ; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 ; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9 @@ -1303,16 +1288,13 @@ ; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s9, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v7 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s9, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] -; GISEL-NEXT: s_add_u32 s4, s10, 0 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GISEL-NEXT: v_subrev_i32_e32 v9, vcc, s8, v7 -; GISEL-NEXT: s_and_b32 s5, s5, 1 +; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: s_cmp_lg_u32 s5, 0 +; GISEL-NEXT: s_add_u32 s4, s10, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc @@ -1323,26 +1305,23 @@ ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s6 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s7 ; GISEL-NEXT: s_sub_u32 s4, 0, s6 -; GISEL-NEXT: s_cselect_b32 s5, 1, 0 -; GISEL-NEXT: s_and_b32 s5, s5, 1 -; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GISEL-NEXT: s_subb_u32 s5, 0, s7 +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; GISEL-NEXT: 
v_trunc_f32_e32 v6, v6 ; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5 ; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -1882,43 +1861,38 @@ ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b32 s10, 0x12d8fb -; GISEL-NEXT: s_add_u32 s4, s10, 0 -; GISEL-NEXT: s_cselect_b32 s5, 1, 0 -; GISEL-NEXT: s_and_b32 s5, s5, 1 ; GISEL-NEXT: s_mov_b32 s6, 0 -; GISEL-NEXT: s_cmp_lg_u32 s5, 0 +; GISEL-NEXT: s_add_u32 s4, s10, 0 ; GISEL-NEXT: s_mov_b32 s7, s6 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s9 ; GISEL-NEXT: s_sub_u32 s4, 0, s8 -; GISEL-NEXT: s_cselect_b32 s5, 1, 0 -; GISEL-NEXT: s_and_b32 s5, s5, 1 -; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GISEL-NEXT: s_subb_u32 s5, 0, s9 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; GISEL-NEXT: v_trunc_f32_e32 v6, v6 ; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 +; GISEL-NEXT: 
v_xor_b32_e32 v1, v1, v4 ; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5 ; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 ; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 ; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9 ; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7 ; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 @@ -1942,7 +1916,6 @@ ; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 ; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 ; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9 @@ -2011,16 +1984,13 @@ ; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s9, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v7 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s9, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] -; GISEL-NEXT: s_add_u32 s4, s10, 0 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GISEL-NEXT: v_subrev_i32_e32 v9, vcc, s8, v7 -; GISEL-NEXT: s_and_b32 s5, s5, 1 +; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: s_cmp_lg_u32 s5, 0 +; GISEL-NEXT: s_add_u32 s4, s10, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc @@ -2031,26 +2001,23 @@ ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s6 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s7 ; GISEL-NEXT: s_sub_u32 s4, 0, s6 -; GISEL-NEXT: s_cselect_b32 
s5, 1, 0 -; GISEL-NEXT: s_and_b32 s5, s5, 1 -; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GISEL-NEXT: s_subb_u32 s5, 0, s7 +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; GISEL-NEXT: v_trunc_f32_e32 v6, v6 ; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5 ; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -4203,9 +4203,6 @@ ; GFX6-LABEL: s_ssubsat_i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sub_u32 s4, s0, s2 -; GFX6-NEXT: s_cselect_b32 s5, 1, 0 -; GFX6-NEXT: s_and_b32 s5, s5, 1 -; GFX6-NEXT: s_cmp_lg_u32 s5, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_subb_u32 s5, s1, s3 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -4229,9 +4226,6 @@ ; GFX8-LABEL: s_ssubsat_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_u32 s4, s0, s2 -; GFX8-NEXT: s_cselect_b32 s5, 1, 0 -; GFX8-NEXT: s_and_b32 s5, s5, 1 -; GFX8-NEXT: s_cmp_lg_u32 s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_subb_u32 s5, s1, s3 ; 
GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -4255,9 +4249,6 @@ ; GFX9-LABEL: s_ssubsat_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s4, s0, s2 -; GFX9-NEXT: s_cselect_b32 s5, 1, 0 -; GFX9-NEXT: s_and_b32 s5, s5, 1 -; GFX9-NEXT: s_cmp_lg_u32 s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_subb_u32 s5, s1, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -4281,15 +4272,12 @@ ; GFX10-LABEL: s_ssubsat_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_sub_u32 s4, s0, s2 -; GFX10-NEXT: s_cselect_b32 s5, 1, 0 -; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[2:3], 0 -; GFX10-NEXT: s_and_b32 s5, s5, 1 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: s_cmp_lg_u32 s5, 0 ; GFX10-NEXT: s_subb_u32 s5, s1, s3 -; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[2:3], 0 ; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], s[0:1] +; GFX10-NEXT: s_mov_b32 s3, 0 ; GFX10-NEXT: s_ashr_i32 s0, s5, 31 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: s_xor_b32 s2, s2, s1 ; GFX10-NEXT: s_cmp_lg_u32 s3, 0 @@ -4545,9 +4533,6 @@ ; GFX6-LABEL: s_ssubsat_v2i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sub_u32 s8, s0, s4 -; GFX6-NEXT: s_cselect_b32 s9, 1, 0 -; GFX6-NEXT: s_and_b32 s9, s9, 1 -; GFX6-NEXT: s_cmp_lg_u32 s9, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_subb_u32 s9, s1, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -4558,16 +4543,13 @@ ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: s_cmp_lg_u32 s10, 0 -; GFX6-NEXT: s_addc_u32 s1, s4, s5 -; GFX6-NEXT: s_sub_u32 s0, s2, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: s_cselect_b32 s1, 1, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: s_and_b32 s1, s1, 1 +; GFX6-NEXT: s_addc_u32 s1, s4, s5 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc -; GFX6-NEXT: s_cmp_lg_u32 s1, 0 +; GFX6-NEXT: s_sub_u32 s0, s2, s6 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: s_subb_u32 s1, s3, s7 ; GFX6-NEXT: 
v_mov_b32_e32 v1, s3 @@ -4594,9 +4576,6 @@ ; GFX8-LABEL: s_ssubsat_v2i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_u32 s8, s0, s4 -; GFX8-NEXT: s_cselect_b32 s9, 1, 0 -; GFX8-NEXT: s_and_b32 s9, s9, 1 -; GFX8-NEXT: s_cmp_lg_u32 s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_subb_u32 s9, s1, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -4607,16 +4586,13 @@ ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: s_cmp_lg_u32 s10, 0 -; GFX8-NEXT: s_addc_u32 s1, s4, s5 -; GFX8-NEXT: s_sub_u32 s0, s2, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: s_cselect_b32 s1, 1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: s_and_b32 s1, s1, 1 +; GFX8-NEXT: s_addc_u32 s1, s4, s5 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: s_sub_u32 s0, s2, s6 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: s_subb_u32 s1, s3, s7 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -4643,9 +4619,6 @@ ; GFX9-LABEL: s_ssubsat_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s8, s0, s4 -; GFX9-NEXT: s_cselect_b32 s9, 1, 0 -; GFX9-NEXT: s_and_b32 s9, s9, 1 -; GFX9-NEXT: s_cmp_lg_u32 s9, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_subb_u32 s9, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -4656,16 +4629,13 @@ ; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: s_brev_b32 s5, 1 ; GFX9-NEXT: s_cmp_lg_u32 s10, 0 -; GFX9-NEXT: s_addc_u32 s1, s4, s5 -; GFX9-NEXT: s_sub_u32 s0, s2, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: s_cselect_b32 s1, 1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: s_and_b32 s1, s1, 1 +; GFX9-NEXT: s_addc_u32 s1, s4, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: s_sub_u32 s0, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: s_subb_u32 s1, s3, s7 
; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -4692,32 +4662,26 @@ ; GFX10-LABEL: s_ssubsat_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_sub_u32 s8, s0, s4 -; GFX10-NEXT: s_cselect_b32 s9, 1, 0 +; GFX10-NEXT: s_subb_u32 s9, s1, s5 ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, s[4:5], 0 -; GFX10-NEXT: s_and_b32 s9, s9, 1 +; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[0:1] ; GFX10-NEXT: s_mov_b32 s11, 0 -; GFX10-NEXT: s_cmp_lg_u32 s9, 0 +; GFX10-NEXT: s_ashr_i32 s0, s9, 31 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_subb_u32 s9, s1, s5 ; GFX10-NEXT: s_brev_b32 s10, 1 -; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[0:1] -; GFX10-NEXT: s_ashr_i32 s0, s9, 31 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: s_xor_b32 s8, s4, s1 ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8 ; GFX10-NEXT: s_addc_u32 s1, s0, s10 ; GFX10-NEXT: s_sub_u32 s4, s2, s6 -; GFX10-NEXT: s_cselect_b32 s5, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_and_b32 s5, s5, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8 -; GFX10-NEXT: s_cmp_lg_u32 s5, 0 ; GFX10-NEXT: s_subb_u32 s5, s3, s7 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3] ; GFX10-NEXT: v_cmp_gt_i64_e64 s3, s[6:7], 0 ; GFX10-NEXT: s_ashr_i32 s0, s5, 31 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8 ; GFX10-NEXT: s_xor_b32 s2, s3, s2 ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2 @@ -4736,19 +4700,10 @@ ; GFX6-LABEL: s_ssubsat_i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sub_u32 s8, s0, s4 -; GFX6-NEXT: s_cselect_b32 s9, 1, 0 -; GFX6-NEXT: s_and_b32 s9, s9, 1 -; GFX6-NEXT: s_cmp_lg_u32 s9, 0 -; GFX6-NEXT: s_subb_u32 s9, s1, s5 -; GFX6-NEXT: s_cselect_b32 s10, 1, 0 -; GFX6-NEXT: s_and_b32 s10, s10, 1 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 -; GFX6-NEXT: s_subb_u32 s10, s2, s6 -; GFX6-NEXT: s_cselect_b32 s11, 1, 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s1 -; GFX6-NEXT: s_and_b32 s11, s11, 1 +; GFX6-NEXT: s_subb_u32 s9, s1, s5 ; 
GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_cmp_lg_u32 s11, 0 +; GFX6-NEXT: s_subb_u32 s10, s2, s6 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] ; GFX6-NEXT: s_subb_u32 s11, s3, s7 @@ -4761,21 +4716,15 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[6:7], 0 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX6-NEXT: s_ashr_i32 s0, s11, 31 ; GFX6-NEXT: s_mov_b32 s1, 0 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX6-NEXT: s_cmp_lg_u32 s1, 0 +; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_addc_u32 s1, s0, 0 -; GFX6-NEXT: s_cselect_b32 s2, 1, 0 -; GFX6-NEXT: s_and_b32 s2, s2, 1 -; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[6:7], 0 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: s_addc_u32 s2, s0, 0 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX6-NEXT: s_cselect_b32 s3, 1, 0 -; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_and_b32 s3, s3, 1 -; GFX6-NEXT: s_cmp_lg_u32 s3, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 @@ -4800,18 +4749,9 @@ ; GFX8-LABEL: s_ssubsat_i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_u32 s8, s0, s4 -; GFX8-NEXT: s_cselect_b32 s9, 1, 0 -; GFX8-NEXT: s_and_b32 s9, s9, 1 -; GFX8-NEXT: s_cmp_lg_u32 s9, 0 ; GFX8-NEXT: s_subb_u32 s9, s1, s5 -; GFX8-NEXT: s_cselect_b32 s10, 1, 0 -; GFX8-NEXT: s_and_b32 s10, s10, 1 -; GFX8-NEXT: s_cmp_lg_u32 s10, 0 -; GFX8-NEXT: s_subb_u32 s10, s2, s6 -; GFX8-NEXT: s_cselect_b32 s11, 1, 0 -; GFX8-NEXT: s_and_b32 s11, s11, 1 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_cmp_lg_u32 s11, 0 +; GFX8-NEXT: s_subb_u32 s10, s2, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: s_subb_u32 s11, s3, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -4835,17 +4775,11 @@ ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: s_ashr_i32 s0, s11, 31 ; GFX8-NEXT: s_mov_b32 s1, 0 +; 
GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_addc_u32 s1, s0, 0 -; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: s_and_b32 s2, s2, 1 -; GFX8-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8-NEXT: s_addc_u32 s2, s0, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: s_cselect_b32 s3, 1, 0 -; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: s_and_b32 s3, s3, 1 -; GFX8-NEXT: s_cmp_lg_u32 s3, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 @@ -4870,18 +4804,9 @@ ; GFX9-LABEL: s_ssubsat_i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s8, s0, s4 -; GFX9-NEXT: s_cselect_b32 s9, 1, 0 -; GFX9-NEXT: s_and_b32 s9, s9, 1 -; GFX9-NEXT: s_cmp_lg_u32 s9, 0 ; GFX9-NEXT: s_subb_u32 s9, s1, s5 -; GFX9-NEXT: s_cselect_b32 s10, 1, 0 -; GFX9-NEXT: s_and_b32 s10, s10, 1 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 -; GFX9-NEXT: s_subb_u32 s10, s2, s6 -; GFX9-NEXT: s_cselect_b32 s11, 1, 0 -; GFX9-NEXT: s_and_b32 s11, s11, 1 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_cmp_lg_u32 s11, 0 +; GFX9-NEXT: s_subb_u32 s10, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_subb_u32 s11, s3, s7 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4905,17 +4830,11 @@ ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX9-NEXT: s_ashr_i32 s0, s11, 31 ; GFX9-NEXT: s_mov_b32 s1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_addc_u32 s1, s0, 0 -; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: s_and_b32 s2, s2, 1 -; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_addc_u32 s2, s0, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: s_cselect_b32 s3, 1, 0 -; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: s_and_b32 s3, s3, 1 -; GFX9-NEXT: s_cmp_lg_u32 s3, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 @@ 
-4940,62 +4859,47 @@ ; GFX10-LABEL: s_ssubsat_i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_sub_u32 s8, s0, s4 -; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: s_and_b32 s9, s9, 1 -; GFX10-NEXT: s_cmp_lg_u32 s9, 0 ; GFX10-NEXT: s_subb_u32 s9, s1, s5 -; GFX10-NEXT: s_cselect_b32 s10, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1] -; GFX10-NEXT: s_and_b32 s10, s10, 1 -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 ; GFX10-NEXT: s_subb_u32 s10, s2, s6 -; GFX10-NEXT: s_cselect_b32 s11, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX10-NEXT: s_and_b32 s11, s11, 1 -; GFX10-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1] ; GFX10-NEXT: s_subb_u32 s11, s3, s7 -; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[10:11], s[2:3] ; GFX10-NEXT: s_cmp_eq_u64 s[10:11], s[2:3] -; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[6:7], 0 -; GFX10-NEXT: s_cselect_b32 s0, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s11 -; GFX10-NEXT: s_and_b32 s0, 1, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 -; GFX10-NEXT: v_cmp_gt_u64_e64 s1, s[4:5], 0 +; GFX10-NEXT: s_cselect_b32 s12, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[10:11], s[2:3] +; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX10-NEXT: s_and_b32 s0, 1, s12 ; GFX10-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 +; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[6:7], 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: s_and_b32 s1, 1, s1 ; GFX10-NEXT: s_ashr_i32 s0, s11, 31 +; GFX10-NEXT: s_and_b32 s1, 1, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 ; GFX10-NEXT: s_mov_b32 s1, 0 ; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: 
v_cndmask_b32_e32 v1, v3, v2, vcc_lo -; GFX10-NEXT: s_addc_u32 s1, s0, 0 -; GFX10-NEXT: s_cselect_b32 s2, 1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, s9 -; GFX10-NEXT: s_and_b32 s2, s2, 1 +; GFX10-NEXT: v_mov_b32_e32 v3, s11 +; GFX10-NEXT: s_addc_u32 s1, s0, 0 +; GFX10-NEXT: s_addc_u32 s2, s0, 0 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX10-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s8 -; GFX10-NEXT: s_addc_u32 s2, s0, 0 -; GFX10-NEXT: s_cselect_b32 s3, 1, 0 +; GFX10-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: s_and_b32 s3, s3, 1 -; GFX10-NEXT: v_mov_b32_e32 v3, s10 -; GFX10-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_addc_u32 s3, s0, 0x80000000 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s3, vcc_lo -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-NEXT: v_readfirstlane_b32 s1, v2 +; GFX10-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: ; return to shader part epilog %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs) @@ -5553,19 +5457,10 @@ ; GFX6-LABEL: s_ssubsat_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sub_u32 s16, s0, s8 -; GFX6-NEXT: s_cselect_b32 s17, 1, 0 -; GFX6-NEXT: s_and_b32 s17, s17, 1 -; GFX6-NEXT: s_cmp_lg_u32 s17, 0 -; GFX6-NEXT: s_subb_u32 s17, s1, s9 -; GFX6-NEXT: s_cselect_b32 s18, 1, 0 -; GFX6-NEXT: s_and_b32 s18, s18, 1 -; GFX6-NEXT: s_cmp_lg_u32 s18, 0 -; GFX6-NEXT: s_subb_u32 s18, s2, 
s10 -; GFX6-NEXT: s_cselect_b32 s19, 1, 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s1 -; GFX6-NEXT: s_and_b32 s19, s19, 1 +; GFX6-NEXT: s_subb_u32 s17, s1, s9 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_cmp_lg_u32 s19, 0 +; GFX6-NEXT: s_subb_u32 s18, s2, s10 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3] ; GFX6-NEXT: s_subb_u32 s19, s3, s11 @@ -5578,51 +5473,36 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[10:11], 0 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX6-NEXT: s_ashr_i32 s0, s19, 31 ; GFX6-NEXT: s_mov_b32 s1, 0 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX6-NEXT: s_cmp_lg_u32 s1, 0 +; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_addc_u32 s1, s0, 0 -; GFX6-NEXT: s_cselect_b32 s2, 1, 0 -; GFX6-NEXT: s_and_b32 s2, s2, 1 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_addc_u32 s2, s0, 0 -; GFX6-NEXT: s_cselect_b32 s3, 1, 0 -; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[10:11], 0 -; GFX6-NEXT: s_and_b32 s3, s3, 1 ; GFX6-NEXT: s_brev_b32 s8, 1 -; GFX6-NEXT: s_cmp_lg_u32 s3, 0 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX6-NEXT: s_addc_u32 s2, s0, 0 +; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_addc_u32 s3, s0, s8 -; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_sub_u32 s0, s4, s12 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: s_cselect_b32 s1, 1, 0 -; GFX6-NEXT: s_and_b32 s1, s1, 1 -; GFX6-NEXT: s_cmp_lg_u32 s1, 0 -; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_subb_u32 s1, s5, s13 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: s_cselect_b32 s2, 1, 0 -; GFX6-NEXT: s_and_b32 s2, s2, 1 ; GFX6-NEXT: v_mov_b32_e32 v3, s16 ; GFX6-NEXT: v_mov_b32_e32 v4, s17 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc 
; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_mov_b32_e32 v2, s18 ; GFX6-NEXT: v_mov_b32_e32 v3, s19 -; GFX6-NEXT: s_subb_u32 s2, s6, s14 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc -; GFX6-NEXT: s_cselect_b32 s3, 1, 0 +; GFX6-NEXT: s_sub_u32 s0, s4, s12 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_and_b32 s3, s3, 1 +; GFX6-NEXT: s_subb_u32 s1, s5, s13 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NEXT: s_cmp_lg_u32 s3, 0 +; GFX6-NEXT: s_subb_u32 s2, s6, s14 ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: s_subb_u32 s3, s7, s15 @@ -5635,21 +5515,15 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], s[14:15], 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[14:15], 0 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX6-NEXT: s_ashr_i32 s4, s3, 31 ; GFX6-NEXT: s_mov_b32 s5, 0 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX6-NEXT: s_cmp_lg_u32 s5, 0 +; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_addc_u32 s5, s4, 0 -; GFX6-NEXT: s_cselect_b32 s6, 1, 0 -; GFX6-NEXT: s_and_b32 s6, s6, 1 -; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[14:15], 0 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 ; GFX6-NEXT: s_addc_u32 s6, s4, 0 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX6-NEXT: s_cselect_b32 s7, 1, 0 -; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_and_b32 s7, s7, 1 -; GFX6-NEXT: s_cmp_lg_u32 s7, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_addc_u32 s7, s4, s8 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 @@ -5678,18 +5552,9 @@ ; GFX8-LABEL: s_ssubsat_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_u32 s16, s0, s8 -; GFX8-NEXT: s_cselect_b32 s17, 1, 0 -; GFX8-NEXT: s_and_b32 s17, s17, 1 -; GFX8-NEXT: s_cmp_lg_u32 s17, 0 ; GFX8-NEXT: s_subb_u32 s17, s1, s9 -; GFX8-NEXT: s_cselect_b32 s18, 1, 0 -; GFX8-NEXT: s_and_b32 
s18, s18, 1 -; GFX8-NEXT: s_cmp_lg_u32 s18, 0 -; GFX8-NEXT: s_subb_u32 s18, s2, s10 -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 -; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_cmp_lg_u32 s19, 0 +; GFX8-NEXT: s_subb_u32 s18, s2, s10 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: s_subb_u32 s19, s3, s11 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -5713,46 +5578,31 @@ ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: s_ashr_i32 s0, s19, 31 ; GFX8-NEXT: s_mov_b32 s1, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_addc_u32 s1, s0, 0 -; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: s_and_b32 s2, s2, 1 -; GFX8-NEXT: s_cmp_lg_u32 s2, 0 -; GFX8-NEXT: s_addc_u32 s2, s0, 0 -; GFX8-NEXT: s_cselect_b32 s3, 1, 0 -; GFX8-NEXT: s_and_b32 s3, s3, 1 ; GFX8-NEXT: s_brev_b32 s8, 1 -; GFX8-NEXT: s_cmp_lg_u32 s3, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX8-NEXT: s_addc_u32 s2, s0, 0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_addc_u32 s3, s0, s8 -; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: s_sub_u32 s0, s4, s12 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_and_b32 s1, s1, 1 -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_subb_u32 s1, s5, s13 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: s_and_b32 s2, s2, 1 -; GFX8-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s16 ; GFX8-NEXT: v_mov_b32_e32 v4, s17 -; GFX8-NEXT: s_subb_u32 s2, s6, s14 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: v_mov_b32_e32 v3, s19 -; GFX8-NEXT: s_cselect_b32 s3, 
1, 0 +; GFX8-NEXT: s_sub_u32 s0, s4, s12 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc -; GFX8-NEXT: s_and_b32 s3, s3, 1 +; GFX8-NEXT: s_subb_u32 s1, s5, s13 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_cmp_lg_u32 s3, 0 +; GFX8-NEXT: s_subb_u32 s2, s6, s14 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_subb_u32 s3, s7, s15 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -5776,17 +5626,11 @@ ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX8-NEXT: s_ashr_i32 s4, s3, 31 ; GFX8-NEXT: s_mov_b32 s5, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX8-NEXT: s_cmp_lg_u32 s5, 0 +; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_addc_u32 s5, s4, 0 -; GFX8-NEXT: s_cselect_b32 s6, 1, 0 -; GFX8-NEXT: s_and_b32 s6, s6, 1 -; GFX8-NEXT: s_cmp_lg_u32 s6, 0 ; GFX8-NEXT: s_addc_u32 s6, s4, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: s_cselect_b32 s7, 1, 0 -; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: s_and_b32 s7, s7, 1 -; GFX8-NEXT: s_cmp_lg_u32 s7, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_addc_u32 s7, s4, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 @@ -5815,18 +5659,9 @@ ; GFX9-LABEL: s_ssubsat_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s16, s0, s8 -; GFX9-NEXT: s_cselect_b32 s17, 1, 0 -; GFX9-NEXT: s_and_b32 s17, s17, 1 -; GFX9-NEXT: s_cmp_lg_u32 s17, 0 ; GFX9-NEXT: s_subb_u32 s17, s1, s9 -; GFX9-NEXT: s_cselect_b32 s18, 1, 0 -; GFX9-NEXT: s_and_b32 s18, s18, 1 -; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_subb_u32 s18, s2, s10 -; GFX9-NEXT: s_cselect_b32 s19, 1, 0 -; GFX9-NEXT: s_and_b32 s19, s19, 1 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_subb_u32 s18, s2, s10 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_subb_u32 s19, s3, s11 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -5850,46 +5685,31 @@ ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX9-NEXT: s_ashr_i32 s0, s19, 31 ; GFX9-NEXT: s_mov_b32 s1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; 
GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_addc_u32 s1, s0, 0 -; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: s_and_b32 s2, s2, 1 -; GFX9-NEXT: s_cmp_lg_u32 s2, 0 -; GFX9-NEXT: s_addc_u32 s2, s0, 0 -; GFX9-NEXT: s_cselect_b32 s3, 1, 0 -; GFX9-NEXT: s_and_b32 s3, s3, 1 ; GFX9-NEXT: s_brev_b32 s8, 1 -; GFX9-NEXT: s_cmp_lg_u32 s3, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: s_addc_u32 s2, s0, 0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_addc_u32 s3, s0, s8 -; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_sub_u32 s0, s4, s12 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: s_and_b32 s1, s1, 1 -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_subb_u32 s1, s5, s13 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: s_and_b32 s2, s2, 1 -; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s16 ; GFX9-NEXT: v_mov_b32_e32 v4, s17 -; GFX9-NEXT: s_subb_u32 s2, s6, s14 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: s_cselect_b32 s3, 1, 0 +; GFX9-NEXT: s_sub_u32 s0, s4, s12 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc -; GFX9-NEXT: s_and_b32 s3, s3, 1 +; GFX9-NEXT: s_subb_u32 s1, s5, s13 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: s_cmp_lg_u32 s3, 0 +; GFX9-NEXT: s_subb_u32 s2, s6, s14 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: s_subb_u32 s3, s7, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 @@ -5913,17 +5733,11 @@ ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: s_mov_b32 s5, 0 +; 
GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: s_cmp_lg_u32 s5, 0 +; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_addc_u32 s5, s4, 0 -; GFX9-NEXT: s_cselect_b32 s6, 1, 0 -; GFX9-NEXT: s_and_b32 s6, s6, 1 -; GFX9-NEXT: s_cmp_lg_u32 s6, 0 ; GFX9-NEXT: s_addc_u32 s6, s4, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: s_cselect_b32 s7, 1, 0 -; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: s_and_b32 s7, s7, 1 -; GFX9-NEXT: s_cmp_lg_u32 s7, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_addc_u32 s7, s4, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 @@ -5952,120 +5766,90 @@ ; GFX10-LABEL: s_ssubsat_v2i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_sub_u32 s16, s0, s8 -; GFX10-NEXT: s_cselect_b32 s17, 1, 0 -; GFX10-NEXT: s_and_b32 s17, s17, 1 -; GFX10-NEXT: s_cmp_lg_u32 s17, 0 ; GFX10-NEXT: s_subb_u32 s17, s1, s9 -; GFX10-NEXT: s_cselect_b32 s18, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1] -; GFX10-NEXT: s_and_b32 s18, s18, 1 -; GFX10-NEXT: s_cmp_lg_u32 s18, 0 ; GFX10-NEXT: s_subb_u32 s18, s2, s10 -; GFX10-NEXT: s_cselect_b32 s19, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX10-NEXT: s_and_b32 s19, s19, 1 -; GFX10-NEXT: s_cmp_lg_u32 s19, 0 +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1] ; GFX10-NEXT: s_subb_u32 s19, s3, s11 -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[18:19], s[2:3] +; GFX10-NEXT: s_brev_b32 s21, 1 ; GFX10-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] -; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[18:19], s[2:3] +; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s0, 1, s20 ; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 +; GFX10-NEXT: s_cselect_b32 s1, 1, 0 ; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; 
GFX10-NEXT: s_mov_b32 s10, 0 +; GFX10-NEXT: s_mov_b32 s20, 0 ; GFX10-NEXT: s_and_b32 s1, 1, s1 ; GFX10-NEXT: s_ashr_i32 s0, s19, 31 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 +; GFX10-NEXT: s_cmp_lg_u32 s20, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: s_brev_b32 s11, 1 ; GFX10-NEXT: s_addc_u32 s1, s0, 0 -; GFX10-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 +; GFX10-NEXT: s_addc_u32 s2, s0, 0 +; GFX10-NEXT: s_addc_u32 s3, s0, s21 +; GFX10-NEXT: s_sub_u32 s8, s4, s12 +; GFX10-NEXT: s_subb_u32 s9, s5, s13 +; GFX10-NEXT: s_subb_u32 s10, s6, s14 +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[8:9], s[4:5] +; GFX10-NEXT: s_subb_u32 s11, s7, s15 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo -; GFX10-NEXT: s_and_b32 s2, s2, 1 +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], s[6:7] ; GFX10-NEXT: v_mov_b32_e32 v2, s17 -; GFX10-NEXT: s_cmp_lg_u32 s2, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, s19 +; GFX10-NEXT: v_mov_b32_e32 v7, s11 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[10:11], s[6:7] +; GFX10-NEXT: v_cmp_gt_u64_e64 s6, s[12:13], 0 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX10-NEXT: s_addc_u32 s2, s0, 0 -; GFX10-NEXT: s_cselect_b32 s3, 1, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s16 -; GFX10-NEXT: s_and_b32 s3, s3, 1 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: s_cmp_lg_u32 s3, 0 -; GFX10-NEXT: s_addc_u32 s3, s0, s11 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo -; GFX10-NEXT: s_sub_u32 s0, s4, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo -; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 -; GFX10-NEXT: s_subb_u32 s1, s5, s13 -; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s3, s[0:1], 
s[4:5] -; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_cmp_gt_u64_e64 s4, s[12:13], 0 -; GFX10-NEXT: s_cmp_lg_u32 s8, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo -; GFX10-NEXT: s_subb_u32 s8, s6, s14 -; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s3 -; GFX10-NEXT: s_and_b32 s9, s9, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4 -; GFX10-NEXT: s_cmp_lg_u32 s9, 0 -; GFX10-NEXT: v_cmp_gt_i64_e64 s4, s[14:15], 0 -; GFX10-NEXT: s_subb_u32 s9, s7, s15 -; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[6:7] -; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[8:9], s[6:7] -; GFX10-NEXT: s_cselect_b32 s2, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s4 -; GFX10-NEXT: s_and_b32 s2, 1, s2 +; GFX10-NEXT: s_cselect_b32 s16, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 +; GFX10-NEXT: s_and_b32 s4, 1, s16 ; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3 -; GFX10-NEXT: s_cselect_b32 s3, 1, 0 -; GFX10-NEXT: s_ashr_i32 s2, s9, 31 -; GFX10-NEXT: s_and_b32 s3, 1, s3 -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s3 -; GFX10-NEXT: s_addc_u32 s3, s2, 0 -; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v8, s9 -; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: v_mov_b32_e32 v6, s1 -; GFX10-NEXT: s_addc_u32 s4, s2, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6 +; GFX10-NEXT: v_cmp_gt_i64_e64 s6, s[14:15], 0 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 ; GFX10-NEXT: s_cselect_b32 s5, 1, 0 -; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4 -; GFX10-NEXT: s_and_b32 s5, s5, 1 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: v_mov_b32_e32 v7, s8 -; GFX10-NEXT: s_cmp_lg_u32 s5, 0 -; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX10-NEXT: s_addc_u32 s1, s2, s11 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; 
GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v7, v8, s1, vcc_lo -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10-NEXT: v_readfirstlane_b32 s3, v3 -; GFX10-NEXT: v_readfirstlane_b32 s4, v4 -; GFX10-NEXT: v_readfirstlane_b32 s5, v5 -; GFX10-NEXT: v_readfirstlane_b32 s6, v6 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: s_and_b32 s5, 1, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s18 +; GFX10-NEXT: v_mov_b32_e32 v5, s19 +; GFX10-NEXT: v_mov_b32_e32 v6, s9 +; GFX10-NEXT: v_xor_b32_e32 v3, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo +; GFX10-NEXT: s_ashr_i32 s0, s11, 31 +; GFX10-NEXT: s_cmp_lg_u32 s20, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s1, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s3, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, s8 +; GFX10-NEXT: s_addc_u32 s1, s0, 0 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, s10 +; GFX10-NEXT: s_addc_u32 s2, s0, 0 +; GFX10-NEXT: s_addc_u32 s3, s0, s21 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s3, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-NEXT: v_readfirstlane_b32 s1, v2 +; GFX10-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-NEXT: v_readfirstlane_b32 s3, v4 +; GFX10-NEXT: v_readfirstlane_b32 s4, v5 +; GFX10-NEXT: v_readfirstlane_b32 s5, v6 +; GFX10-NEXT: 
v_readfirstlane_b32 s6, v3 ; GFX10-NEXT: v_readfirstlane_b32 s7, v7 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll @@ -457,7 +457,6 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_sub_u32 s0, s0, s1 ; GFX7-NEXT: s_cselect_b32 s1, 1, 0 -; GFX7-NEXT: s_and_b32 s1, s1, 1 ; GFX7-NEXT: s_sub_i32 s0, s0, s1 ; GFX7-NEXT: ; return to shader part epilog ; @@ -465,7 +464,6 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_u32 s0, s0, s1 ; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_and_b32 s1, s1, 1 ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; @@ -473,7 +471,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s0, s0, s1 ; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: s_and_b32 s1, s1, 1 ; GFX9-NEXT: s_sub_i32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog %usubo = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b) @@ -487,13 +484,10 @@ define amdgpu_ps i64 @s_usubo_i64(i64 inreg %a, i64 inreg %b) { ; GFX7-LABEL: s_usubo_i64: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_sub_u32 s4, s0, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: s_cselect_b32 s5, 1, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_and_b32 s5, s5, 1 ; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX7-NEXT: s_cmp_lg_u32 s5, 0 +; GFX7-NEXT: s_sub_u32 s4, s0, s2 ; GFX7-NEXT: s_subb_u32 s5, s1, s3 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -505,13 +499,10 @@ ; ; GFX8-LABEL: s_usubo_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sub_u32 s4, s0, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: s_cselect_b32 s5, 1, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_and_b32 s5, s5, 1 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX8-NEXT: s_cmp_lg_u32 s5, 
0 +; GFX8-NEXT: s_sub_u32 s4, s0, s2 ; GFX8-NEXT: s_subb_u32 s5, s1, s3 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -523,13 +514,10 @@ ; ; GFX9-LABEL: s_usubo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_sub_u32 s4, s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_and_b32 s5, s5, 1 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: s_cmp_lg_u32 s5, 0 +; GFX9-NEXT: s_sub_u32 s4, s0, s2 ; GFX9-NEXT: s_subb_u32 s5, s1, s3 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_mov_b32_e32 v1, s5 @@ -553,8 +541,6 @@ ; GFX7-NEXT: s_cselect_b32 s2, 1, 0 ; GFX7-NEXT: s_sub_u32 s1, s1, s3 ; GFX7-NEXT: s_cselect_b32 s3, 1, 0 -; GFX7-NEXT: s_and_b32 s2, s2, 1 -; GFX7-NEXT: s_and_b32 s3, s3, 1 ; GFX7-NEXT: s_sub_i32 s0, s0, s2 ; GFX7-NEXT: s_sub_i32 s1, s1, s3 ; GFX7-NEXT: ; return to shader part epilog @@ -565,8 +551,6 @@ ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 ; GFX8-NEXT: s_sub_u32 s1, s1, s3 ; GFX8-NEXT: s_cselect_b32 s3, 1, 0 -; GFX8-NEXT: s_and_b32 s2, s2, 1 -; GFX8-NEXT: s_and_b32 s3, s3, 1 ; GFX8-NEXT: s_sub_i32 s0, s0, s2 ; GFX8-NEXT: s_sub_i32 s1, s1, s3 ; GFX8-NEXT: ; return to shader part epilog @@ -577,8 +561,6 @@ ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 ; GFX9-NEXT: s_sub_u32 s1, s1, s3 ; GFX9-NEXT: s_cselect_b32 s3, 1, 0 -; GFX9-NEXT: s_and_b32 s2, s2, 1 -; GFX9-NEXT: s_and_b32 s3, s3, 1 ; GFX9-NEXT: s_sub_i32 s0, s0, s2 ; GFX9-NEXT: s_sub_i32 s1, s1, s3 ; GFX9-NEXT: ; return to shader part epilog @@ -728,9 +710,6 @@ ; GFX7-LABEL: s_ssubo_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_sub_u32 s4, s0, s2 -; GFX7-NEXT: s_cselect_b32 s5, 1, 0 -; GFX7-NEXT: s_and_b32 s5, s5, 1 -; GFX7-NEXT: s_cmp_lg_u32 s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_subb_u32 s5, s1, s3 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -748,9 +727,6 @@ ; GFX8-LABEL: s_ssubo_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_u32 s4, s0, s2 -; GFX8-NEXT: s_cselect_b32 s5, 1, 0 -; GFX8-NEXT: 
s_and_b32 s5, s5, 1 -; GFX8-NEXT: s_cmp_lg_u32 s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_subb_u32 s5, s1, s3 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -768,9 +744,6 @@ ; GFX9-LABEL: s_ssubo_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s4, s0, s2 -; GFX9-NEXT: s_cselect_b32 s5, 1, 0 -; GFX9-NEXT: s_and_b32 s5, s5, 1 -; GFX9-NEXT: s_cmp_lg_u32 s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_subb_u32 s5, s1, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -2591,9 +2591,6 @@ ; GFX6-LABEL: s_uaddsat_i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s0, s0, s2 -; GFX6-NEXT: s_cselect_b32 s4, 1, 0 -; GFX6-NEXT: s_and_b32 s4, s4, 1 -; GFX6-NEXT: s_cmp_lg_u32 s4, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_addc_u32 s1, s1, s3 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 @@ -2609,9 +2606,6 @@ ; GFX8-LABEL: s_uaddsat_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s0, s0, s2 -; GFX8-NEXT: s_cselect_b32 s4, 1, 0 -; GFX8-NEXT: s_and_b32 s4, s4, 1 -; GFX8-NEXT: s_cmp_lg_u32 s4, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2627,9 +2621,6 @@ ; GFX9-LABEL: s_uaddsat_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s2 -; GFX9-NEXT: s_cselect_b32 s4, 1, 0 -; GFX9-NEXT: s_and_b32 s4, s4, 1 -; GFX9-NEXT: s_cmp_lg_u32 s4, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -2645,9 +2636,6 @@ ; GFX10-LABEL: s_uaddsat_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_add_u32 s0, s0, s2 -; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_addc_u32 s1, s1, s3 ; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[0:1], s[2:3] ; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, -1, s2 @@ -2816,20 +2804,14 @@ ; GFX6-LABEL: 
s_uaddsat_v2i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s0, s0, s4 -; GFX6-NEXT: s_cselect_b32 s8, 1, 0 -; GFX6-NEXT: s_and_b32 s8, s8, 1 -; GFX6-NEXT: s_cmp_lg_u32 s8, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_addc_u32 s1, s1, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: s_add_u32 s0, s2, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 -; GFX6-NEXT: s_cselect_b32 s1, 1, 0 -; GFX6-NEXT: s_and_b32 s1, s1, 1 -; GFX6-NEXT: s_cmp_lg_u32 s1, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: s_addc_u32 s1, s3, s7 ; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc @@ -2848,20 +2830,14 @@ ; GFX8-LABEL: s_uaddsat_v2i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s0, s0, s4 -; GFX8-NEXT: s_cselect_b32 s8, 1, 0 -; GFX8-NEXT: s_and_b32 s8, s8, 1 -; GFX8-NEXT: s_cmp_lg_u32 s8, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: s_add_u32 s0, s2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_and_b32 s1, s1, 1 -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: s_addc_u32 s1, s3, s7 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc @@ -2880,20 +2856,14 @@ ; GFX9-LABEL: s_uaddsat_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_cselect_b32 s8, 1, 0 -; GFX9-NEXT: s_and_b32 s8, s8, 1 -; GFX9-NEXT: s_cmp_lg_u32 s8, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: s_add_u32 s0, s2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: s_and_b32 
s1, s1, 1 -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_addc_u32 s1, s3, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc @@ -2912,23 +2882,17 @@ ; GFX10-LABEL: s_uaddsat_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_add_u32 s0, s0, s4 -; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_addc_u32 s1, s1, s5 ; GFX10-NEXT: s_add_u32 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] -; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_addc_u32 s3, s3, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, -1, s4 +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] ; GFX10-NEXT: v_cmp_lt_u64_e64 s5, s[2:3], s[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, -1, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, -1, s4 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, s2, -1, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v3, s3, -1, s5 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: ; return to shader part epilog @@ -2940,19 +2904,10 @@ ; GFX6-LABEL: s_uaddsat_i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s0, s0, s4 -; GFX6-NEXT: s_cselect_b32 s8, 1, 0 -; GFX6-NEXT: s_and_b32 s8, s8, 1 -; GFX6-NEXT: s_cmp_lg_u32 s8, 0 -; GFX6-NEXT: s_addc_u32 s1, s1, s5 -; GFX6-NEXT: s_cselect_b32 s8, 1, 0 -; GFX6-NEXT: s_and_b32 s8, s8, 1 -; GFX6-NEXT: s_cmp_lg_u32 s8, 0 -; GFX6-NEXT: s_addc_u32 s2, s2, s6 -; GFX6-NEXT: s_cselect_b32 s8, 1, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_and_b32 s8, s8, 1 +; GFX6-NEXT: s_addc_u32 s1, s1, s5 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NEXT: s_cmp_lg_u32 s8, 0 +; GFX6-NEXT: s_addc_u32 s2, s2, s6 ; GFX6-NEXT: v_mov_b32_e32 
v0, s6 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: s_addc_u32 s3, s3, s7 @@ -2981,18 +2936,9 @@ ; GFX8-LABEL: s_uaddsat_i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s0, s0, s4 -; GFX8-NEXT: s_cselect_b32 s8, 1, 0 -; GFX8-NEXT: s_and_b32 s8, s8, 1 -; GFX8-NEXT: s_cmp_lg_u32 s8, 0 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: s_cselect_b32 s8, 1, 0 -; GFX8-NEXT: s_and_b32 s8, s8, 1 -; GFX8-NEXT: s_cmp_lg_u32 s8, 0 -; GFX8-NEXT: s_addc_u32 s2, s2, s6 -; GFX8-NEXT: s_cselect_b32 s8, 1, 0 -; GFX8-NEXT: s_and_b32 s8, s8, 1 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_cmp_lg_u32 s8, 0 +; GFX8-NEXT: s_addc_u32 s2, s2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_addc_u32 s3, s3, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -3025,18 +2971,9 @@ ; GFX9-LABEL: s_uaddsat_i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_cselect_b32 s8, 1, 0 -; GFX9-NEXT: s_and_b32 s8, s8, 1 -; GFX9-NEXT: s_cmp_lg_u32 s8, 0 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: s_cselect_b32 s8, 1, 0 -; GFX9-NEXT: s_and_b32 s8, s8, 1 -; GFX9-NEXT: s_cmp_lg_u32 s8, 0 -; GFX9-NEXT: s_addc_u32 s2, s2, s6 -; GFX9-NEXT: s_cselect_b32 s8, 1, 0 -; GFX9-NEXT: s_and_b32 s8, s8, 1 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: s_cmp_lg_u32 s8, 0 +; GFX9-NEXT: s_addc_u32 s2, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: s_addc_u32 s3, s3, s7 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 @@ -3069,26 +3006,17 @@ ; GFX10-LABEL: s_uaddsat_i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_add_u32 s0, s0, s4 -; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_addc_u32 s1, s1, s5 -; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] -; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_addc_u32 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 -; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: 
s_cmp_lg_u32 s8, 0 +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] ; GFX10-NEXT: s_addc_u32 s3, s3, s7 ; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] -; GFX10-NEXT: v_cmp_lt_u64_e64 s5, s[2:3], s[6:7] -; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: s_and_b32 s4, 1, s4 +; GFX10-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[2:3], s[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 +; GFX10-NEXT: s_and_b32 s4, 1, s8 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 @@ -3450,19 +3378,10 @@ ; GFX6-LABEL: s_uaddsat_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s0, s0, s8 -; GFX6-NEXT: s_cselect_b32 s16, 1, 0 -; GFX6-NEXT: s_and_b32 s16, s16, 1 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_addc_u32 s1, s1, s9 -; GFX6-NEXT: s_cselect_b32 s16, 1, 0 -; GFX6-NEXT: s_and_b32 s16, s16, 1 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_addc_u32 s2, s2, s10 -; GFX6-NEXT: s_cselect_b32 s16, 1, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: s_and_b32 s16, s16, 1 +; GFX6-NEXT: s_addc_u32 s1, s1, s9 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_addc_u32 s2, s2, s10 ; GFX6-NEXT: v_mov_b32_e32 v0, s10 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: s_addc_u32 s3, s3, s11 @@ -3472,30 +3391,21 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1] ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_add_u32 s0, s4, s12 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: s_cselect_b32 s1, 1, 0 -; GFX6-NEXT: s_and_b32 s1, s1, 1 -; GFX6-NEXT: s_cmp_lg_u32 s1, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_addc_u32 s1, s5, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 
0, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: s_cselect_b32 s2, 1, 0 -; GFX6-NEXT: s_and_b32 s2, s2, 1 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_addc_u32 s2, s6, s14 -; GFX6-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: s_cselect_b32 s3, 1, 0 +; GFX6-NEXT: s_add_u32 s0, s4, s12 ; GFX6-NEXT: v_mov_b32_e32 v2, s12 -; GFX6-NEXT: s_and_b32 s3, s3, 1 +; GFX6-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: s_addc_u32 s1, s5, s13 ; GFX6-NEXT: v_mov_b32_e32 v3, s13 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v0, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v7, v1, -1, vcc -; GFX6-NEXT: s_cmp_lg_u32 s3, 0 +; GFX6-NEXT: s_addc_u32 s2, s6, s14 ; GFX6-NEXT: v_mov_b32_e32 v0, s14 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: s_addc_u32 s3, s7, s15 @@ -3528,18 +3438,9 @@ ; GFX8-LABEL: s_uaddsat_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s0, s0, s8 -; GFX8-NEXT: s_cselect_b32 s16, 1, 0 -; GFX8-NEXT: s_and_b32 s16, s16, 1 -; GFX8-NEXT: s_cmp_lg_u32 s16, 0 ; GFX8-NEXT: s_addc_u32 s1, s1, s9 -; GFX8-NEXT: s_cselect_b32 s16, 1, 0 -; GFX8-NEXT: s_and_b32 s16, s16, 1 -; GFX8-NEXT: s_cmp_lg_u32 s16, 0 -; GFX8-NEXT: s_addc_u32 s2, s2, s10 -; GFX8-NEXT: s_cselect_b32 s16, 1, 0 -; GFX8-NEXT: s_and_b32 s16, s16, 1 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 -; GFX8-NEXT: s_cmp_lg_u32 s16, 0 +; GFX8-NEXT: s_addc_u32 s2, s2, s10 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: s_addc_u32 s3, s3, s11 ; GFX8-NEXT: v_mov_b32_e32 v0, s10 @@ -3552,28 +3453,19 @@ ; GFX8-NEXT: s_and_b32 s8, 1, s10 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: s_add_u32 s0, s4, s12 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_and_b32 s1, s1, 1 -; GFX8-NEXT: 
s_cmp_lg_u32 s1, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_addc_u32 s1, s5, s13 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: s_and_b32 s2, s2, 1 -; GFX8-NEXT: s_cmp_lg_u32 s2, 0 -; GFX8-NEXT: s_addc_u32 s2, s6, s14 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_cselect_b32 s3, 1, 0 +; GFX8-NEXT: s_add_u32 s0, s4, s12 ; GFX8-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc -; GFX8-NEXT: s_and_b32 s3, s3, 1 +; GFX8-NEXT: s_addc_u32 s1, s5, s13 ; GFX8-NEXT: v_mov_b32_e32 v2, s12 -; GFX8-NEXT: s_cmp_lg_u32 s3, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_addc_u32 s2, s6, s14 ; GFX8-NEXT: v_mov_b32_e32 v3, s13 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v7, v1, -1, vcc @@ -3612,18 +3504,9 @@ ; GFX9-LABEL: s_uaddsat_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s8 -; GFX9-NEXT: s_cselect_b32 s16, 1, 0 -; GFX9-NEXT: s_and_b32 s16, s16, 1 -; GFX9-NEXT: s_cmp_lg_u32 s16, 0 ; GFX9-NEXT: s_addc_u32 s1, s1, s9 -; GFX9-NEXT: s_cselect_b32 s16, 1, 0 -; GFX9-NEXT: s_and_b32 s16, s16, 1 -; GFX9-NEXT: s_cmp_lg_u32 s16, 0 -; GFX9-NEXT: s_addc_u32 s2, s2, s10 -; GFX9-NEXT: s_cselect_b32 s16, 1, 0 -; GFX9-NEXT: s_and_b32 s16, s16, 1 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: s_cmp_lg_u32 s16, 0 +; GFX9-NEXT: s_addc_u32 s2, s2, s10 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: s_addc_u32 s3, s3, s11 ; GFX9-NEXT: v_mov_b32_e32 v0, s10 @@ -3636,28 +3519,19 @@ ; GFX9-NEXT: s_and_b32 s8, 1, s10 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_add_u32 s0, s4, s12 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: 
s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: s_and_b32 s1, s1, 1 -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_addc_u32 s1, s5, s13 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: s_and_b32 s2, s2, 1 -; GFX9-NEXT: s_cmp_lg_u32 s2, 0 -; GFX9-NEXT: s_addc_u32 s2, s6, s14 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_cselect_b32 s3, 1, 0 +; GFX9-NEXT: s_add_u32 s0, s4, s12 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc -; GFX9-NEXT: s_and_b32 s3, s3, 1 +; GFX9-NEXT: s_addc_u32 s1, s5, s13 ; GFX9-NEXT: v_mov_b32_e32 v2, s12 -; GFX9-NEXT: s_cmp_lg_u32 s3, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_addc_u32 s2, s6, s14 ; GFX9-NEXT: v_mov_b32_e32 v3, s13 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v7, v1, -1, vcc @@ -3696,69 +3570,51 @@ ; GFX10-LABEL: s_uaddsat_v2i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_add_u32 s0, s0, s8 -; GFX10-NEXT: s_cselect_b32 s16, 1, 0 -; GFX10-NEXT: s_and_b32 s16, s16, 1 -; GFX10-NEXT: s_cmp_lg_u32 s16, 0 ; GFX10-NEXT: s_addc_u32 s1, s1, s9 -; GFX10-NEXT: s_cselect_b32 s16, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[8:9] -; GFX10-NEXT: s_and_b32 s16, s16, 1 -; GFX10-NEXT: s_cmp_lg_u32 s16, 0 ; GFX10-NEXT: s_addc_u32 s2, s2, s10 -; GFX10-NEXT: s_cselect_b32 s16, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s8 -; GFX10-NEXT: s_and_b32 s16, s16, 1 -; GFX10-NEXT: s_cmp_lg_u32 s16, 0 +; GFX10-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[8:9] ; GFX10-NEXT: s_addc_u32 s3, s3, s11 ; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] -; GFX10-NEXT: v_cmp_lt_u64_e64 s10, s[2:3], s[10:11] ; GFX10-NEXT: s_cselect_b32 s16, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s8 +; GFX10-NEXT: v_cmp_lt_u64_e64 s8, 
s[2:3], s[10:11] +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s8 ; GFX10-NEXT: s_and_b32 s8, 1, s16 ; GFX10-NEXT: s_add_u32 s4, s4, s12 -; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8 -; GFX10-NEXT: s_and_b32 s9, s9, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s10 -; GFX10-NEXT: s_cmp_lg_u32 s9, 0 ; GFX10-NEXT: s_addc_u32 s5, s5, s13 -; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: s_and_b32 s9, s9, 1 -; GFX10-NEXT: s_cmp_lg_u32 s9, 0 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8 ; GFX10-NEXT: v_cmp_lt_u64_e64 s9, s[4:5], s[12:13] ; GFX10-NEXT: s_addc_u32 s6, s6, s14 -; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s9 -; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_addc_u32 s7, s7, s15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: s_cmp_eq_u64 s[6:7], s[14:15] +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s9 ; GFX10-NEXT: v_cmp_lt_u64_e64 s9, s[6:7], s[14:15] ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_and_b32 s8, 1, s8 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s9 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, s0, -1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, s1, -1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, s2, -1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, s3, -1, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v1 -; GFX10-NEXT: v_readfirstlane_b32 s1, v2 -; GFX10-NEXT: v_readfirstlane_b32 s2, v3 -; GFX10-NEXT: v_readfirstlane_b32 s3, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, s4, -1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, s5, -1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, s6, -1, 
vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, s7, -1, vcc_lo -; GFX10-NEXT: v_readfirstlane_b32 s4, v0 -; GFX10-NEXT: v_readfirstlane_b32 s5, v1 -; GFX10-NEXT: v_readfirstlane_b32 s6, v2 -; GFX10-NEXT: v_readfirstlane_b32 s7, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, -1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, s2, -1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, s3, -1, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, -1, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v4, s4, -1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, s5, -1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, s6, -1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, s7, -1, s0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-NEXT: v_readfirstlane_b32 s6, v6 +; GFX10-NEXT: v_readfirstlane_b32 s7, v7 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) ret <2 x i128> %result Index: llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -194,14 +194,11 @@ ; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s3 ; CHECK-NEXT: s_sub_u32 s4, 0, s2 -; CHECK-NEXT: s_cselect_b32 s5, 1, 0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v2 -; CHECK-NEXT: s_and_b32 s5, s5, 1 +; CHECK-NEXT: s_subb_u32 s5, 0, s3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; CHECK-NEXT: s_cmp_lg_u32 s5, 0 -; CHECK-NEXT: s_subb_u32 s5, 0, s3 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 ; CHECK-NEXT: v_trunc_f32_e32 v2, v2 ; CHECK-NEXT: 
v_mac_f32_e32 v0, 0xcf800000, v2 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -117,13 +117,10 @@ ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s11 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s10 ; GFX8-NEXT: s_sub_u32 s0, 0, s10 -; GFX8-NEXT: s_cselect_b32 s1, 1, 0 +; GFX8-NEXT: s_subb_u32 s1, 0, s11 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_and_b32 s1, s1, 1 -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 -; GFX8-NEXT: s_subb_u32 s1, 0, s11 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX8-NEXT: v_trunc_f32_e32 v1, v1 @@ -140,19 +137,19 @@ ; GFX8-NEXT: v_mul_lo_u32 v3, v1, v4 ; GFX8-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v6, v0, v4 -; GFX8-NEXT: v_mul_lo_u32 v7, v1, v2 ; GFX8-NEXT: v_mul_hi_u32 v4, v1, v4 +; GFX8-NEXT: v_mul_lo_u32 v7, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8-NEXT: v_mul_hi_u32 v8, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 -; GFX8-NEXT: v_mul_hi_u32 v5, v0, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v6 ; GFX8-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc @@ -269,13 +266,10 @@ ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s11 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s10 ; GFX9-NEXT: s_sub_u32 s0, 0, s10 -; GFX9-NEXT: 
s_cselect_b32 s1, 1, 0 +; GFX9-NEXT: s_subb_u32 s1, 0, s11 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_and_b32 s1, s1, 1 -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 -; GFX9-NEXT: s_subb_u32 s1, 0, s11 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 @@ -296,16 +290,16 @@ ; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v5 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX9-NEXT: v_add_u32_e32 v3, v6, v3 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v6 +; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v2, v5, v4, v2 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 @@ -412,11 +406,8 @@ ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s11 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s10 ; GFX10-NEXT: s_sub_u32 s0, 0, s10 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_subb_u32 s1, 0, s11 +; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1026,13 +1017,12 @@ ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s13 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s12 ; GFX8-NEXT: s_sub_u32 s0, 0, s12 -; GFX8-NEXT: s_cselect_b32 s1, 1, 0 +; GFX8-NEXT: 
s_subb_u32 s1, 0, s13 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_and_b32 s1, s1, 1 -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 -; GFX8-NEXT: s_subb_u32 s1, 0, s13 +; GFX8-NEXT: s_sub_u32 s2, 0, s14 +; GFX8-NEXT: s_subb_u32 s3, 0, s15 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX8-NEXT: v_trunc_f32_e32 v1, v1 @@ -1040,7 +1030,6 @@ ; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_sub_u32 s2, 0, s14 ; GFX8-NEXT: v_mul_lo_u32 v2, s0, v1 ; GFX8-NEXT: v_mul_lo_u32 v3, s1, v0 ; GFX8-NEXT: v_mul_hi_u32 v5, s0, v0 @@ -1050,19 +1039,19 @@ ; GFX8-NEXT: v_mul_lo_u32 v3, v1, v4 ; GFX8-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v6, v0, v4 -; GFX8-NEXT: v_mul_lo_u32 v7, v1, v2 ; GFX8-NEXT: v_mul_hi_u32 v4, v1, v4 +; GFX8-NEXT: v_mul_lo_u32 v7, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8-NEXT: v_mul_hi_u32 v8, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 -; GFX8-NEXT: v_mul_hi_u32 v5, v0, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v6 ; GFX8-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc @@ -1171,23 +1160,19 @@ ; GFX8-NEXT: v_trunc_f32_e32 v6, v6 ; GFX8-NEXT: v_mul_f32_e32 v7, 0xcf800000, v6 ; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1] ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX8-NEXT: 
v_cvt_u32_f32_e32 v6, v6 -; GFX8-NEXT: s_cselect_b32 s0, 1, 0 -; GFX8-NEXT: s_and_b32 s0, s0, 1 -; GFX8-NEXT: s_cmp_lg_u32 s0, 0 -; GFX8-NEXT: s_subb_u32 s3, 0, s15 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc ; GFX8-NEXT: v_mul_lo_u32 v7, s3, v3 ; GFX8-NEXT: v_mul_lo_u32 v8, s2, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc ; GFX8-NEXT: v_mul_hi_u32 v10, s2, v3 ; GFX8-NEXT: v_mul_lo_u32 v9, s2, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v2, vcc ; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v8 ; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v10 ; GFX8-NEXT: v_mul_lo_u32 v8, v6, v9 ; GFX8-NEXT: v_mul_lo_u32 v10, v3, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v2, vcc ; GFX8-NEXT: v_mul_hi_u32 v2, v3, v9 ; GFX8-NEXT: v_mul_hi_u32 v9, v6, v9 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10 @@ -1318,13 +1303,13 @@ ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s13 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s12 ; GFX9-NEXT: s_sub_u32 s0, 0, s12 -; GFX9-NEXT: s_cselect_b32 s1, 1, 0 +; GFX9-NEXT: s_subb_u32 s1, 0, s13 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_and_b32 s1, s1, 1 -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 -; GFX9-NEXT: s_subb_u32 s1, 0, s13 +; GFX9-NEXT: v_cvt_f32_u32_e32 v14, s15 +; GFX9-NEXT: s_sub_u32 s2, 0, s14 +; GFX9-NEXT: s_subb_u32 s3, 0, s15 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 @@ -1332,14 +1317,12 @@ ; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v14, s15 -; GFX9-NEXT: s_sub_u32 s2, 0, s14 +; GFX9-NEXT: v_mul_f32_e32 v14, 0x4f800000, v14 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 ; GFX9-NEXT: v_mul_lo_u32 v3, s1, v0 ; GFX9-NEXT: v_mul_hi_u32 v4, s0, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, s0, v0 -; GFX9-NEXT: 
v_mul_f32_e32 v14, 0x4f800000, v14 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4 ; GFX9-NEXT: v_mul_lo_u32 v3, v1, v5 ; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 @@ -1349,16 +1332,16 @@ ; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v5 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX9-NEXT: v_add_u32_e32 v3, v6, v3 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v6 +; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v2, v5, v4, v2 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 @@ -1455,20 +1438,16 @@ ; GFX9-NEXT: v_add_f32_e32 v5, v13, v5 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GFX9-NEXT: s_cselect_b32 s0, 1, 0 -; GFX9-NEXT: s_and_b32 s0, s0, 1 -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 -; GFX9-NEXT: s_subb_u32 s3, 0, s15 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc ; GFX9-NEXT: v_mul_lo_u32 v13, s3, v5 ; GFX9-NEXT: v_mul_lo_u32 v14, s2, v12 ; GFX9-NEXT: v_mul_hi_u32 v16, s2, v5 ; GFX9-NEXT: v_mul_lo_u32 v17, s2, v5 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc ; GFX9-NEXT: v_add3_u32 v4, v13, v14, v16 ; GFX9-NEXT: v_mul_lo_u32 v9, v12, v17 ; GFX9-NEXT: v_mul_lo_u32 v13, v5, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc ; GFX9-NEXT: v_mul_hi_u32 v10, v5, v17 ; GFX9-NEXT: v_mul_hi_u32 v14, v12, 
v17 ; GFX9-NEXT: v_add_co_u32_e64 v9, s[0:1], v9, v13 @@ -1600,19 +1579,13 @@ ; GFX10-NEXT: s_sub_u32 s0, 0, s12 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_and_b32 s1, s1, 1 +; GFX10-NEXT: s_subb_u32 s1, 0, s13 +; GFX10-NEXT: s_sub_u32 s2, 0, s14 +; GFX10-NEXT: s_subb_u32 s3, 0, s15 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_add_f32_e32 v1, v2, v3 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 -; GFX10-NEXT: s_subb_u32 s1, 0, s13 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX10-NEXT: s_sub_u32 s2, 0, s14 -; GFX10-NEXT: s_cselect_b32 s3, 1, 0 -; GFX10-NEXT: s_and_b32 s3, s3, 1 -; GFX10-NEXT: s_cmp_lg_u32 s3, 0 -; GFX10-NEXT: s_subb_u32 s3, 0, s15 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 @@ -1690,174 +1663,174 @@ ; GFX10-NEXT: v_mul_lo_u32 v11, s2, v3 ; GFX10-NEXT: v_mul_lo_u32 v4, s0, v0 ; GFX10-NEXT: v_mul_lo_u32 v8, s2, v1 +; GFX10-NEXT: v_mov_b32_e32 v12, 0 ; GFX10-NEXT: v_add3_u32 v5, v6, v5, v7 ; GFX10-NEXT: v_add3_u32 v9, v9, v11, v10 -; GFX10-NEXT: v_mul_lo_u32 v12, v2, v4 +; GFX10-NEXT: v_mul_lo_u32 v13, v2, v4 ; GFX10-NEXT: v_mul_lo_u32 v10, v0, v5 -; GFX10-NEXT: v_mul_hi_u32 v13, v0, v4 +; GFX10-NEXT: v_mul_hi_u32 v14, v0, v4 ; GFX10-NEXT: v_mul_hi_u32 v4, v2, v4 ; GFX10-NEXT: v_mul_lo_u32 v11, v2, v5 ; GFX10-NEXT: v_mul_lo_u32 v6, v3, v8 -; GFX10-NEXT: v_mul_lo_u32 v15, v1, v9 +; GFX10-NEXT: v_mul_lo_u32 v16, v1, v9 ; GFX10-NEXT: v_mul_hi_u32 v7, v1, v8 ; GFX10-NEXT: v_mul_hi_u32 v8, v3, v8 -; GFX10-NEXT: v_add_co_u32 v10, s0, v12, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s0 +; GFX10-NEXT: v_mul_lo_u32 v17, v3, v9 +; GFX10-NEXT: v_add_co_u32 v10, s0, v13, v10 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v4, s0, v11, v4 -; GFX10-NEXT: v_mul_lo_u32 v16, v3, v9 +; GFX10-NEXT: 
v_mul_hi_u32 v15, v0, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v15 -; GFX10-NEXT: v_mul_hi_u32 v14, v0, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v10, s0, v10, v13 +; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v16 +; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v8, s0, v17, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v10, s0, v10, v14 +; GFX10-NEXT: v_mul_hi_u32 v18, v1, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v8, s0, v16, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v14 -; GFX10-NEXT: v_add_nc_u32_e32 v10, v12, v10 +; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v15 ; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v7 -; GFX10-NEXT: v_mul_hi_u32 v5, v2, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v10 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v11, v14 +; GFX10-NEXT: v_add_nc_u32_e32 v7, v13, v10 +; GFX10-NEXT: v_add_co_u32 v8, s0, v8, v18 +; GFX10-NEXT: v_mul_hi_u32 v5, v2, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0 -; GFX10-NEXT: v_mul_hi_u32 v17, v1, v9 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v6, v15, v6 -; GFX10-NEXT: v_add3_u32 v5, v7, v10, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v16, v6 +; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v7 +; GFX10-NEXT: v_add_nc_u32_e32 v11, v11, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 ; GFX10-NEXT: v_mul_hi_u32 v9, v3, v9 -; GFX10-NEXT: v_mul_hi_u32 v10, s9, v0 -; GFX10-NEXT: v_mov_b32_e32 v12, 0 -; GFX10-NEXT: v_add_co_u32 v8, s0, v8, v17 -; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v4, s0, v8, v6 -; GFX10-NEXT: v_mul_lo_u32 v6, s9, v0 -; GFX10-NEXT: v_mul_lo_u32 v8, s8, v2 -; GFX10-NEXT: v_mul_hi_u32 v0, s8, 
v0 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v13, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 -; GFX10-NEXT: v_mul_lo_u32 v11, s9, v2 -; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v4 -; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v8 -; GFX10-NEXT: v_add3_u32 v5, v7, v5, v9 -; GFX10-NEXT: v_mul_hi_u32 v7, s8, v2 +; GFX10-NEXT: v_add_co_u32 v6, s0, v8, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v10, v17, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v0, s1, v6, v0 -; GFX10-NEXT: v_add_co_u32 v9, s0, v11, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX10-NEXT: v_add3_u32 v5, v11, v7, v5 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add3_u32 v4, v10, v8, v9 +; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo +; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v6 +; GFX10-NEXT: v_mul_lo_u32 v5, s9, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX10-NEXT: v_mul_lo_u32 v4, s8, v2 +; GFX10-NEXT: v_mul_hi_u32 v7, s8, v0 +; GFX10-NEXT: v_mul_hi_u32 v0, s9, v0 +; GFX10-NEXT: v_mul_lo_u32 v9, s9, v2 +; GFX10-NEXT: v_mul_hi_u32 v10, s8, v2 ; GFX10-NEXT: v_mul_hi_u32 v2, s9, v2 -; GFX10-NEXT: v_add_co_u32 v7, s0, v9, v7 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v8, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX10-NEXT: v_add_co_u32 v0, s0, v7, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 ; GFX10-NEXT: v_mul_lo_u32 v6, s11, v1 -; GFX10-NEXT: v_mul_lo_u32 v8, s10, v3 -; GFX10-NEXT: v_mul_lo_u32 v9, s13, v0 -; GFX10-NEXT: v_mul_hi_u32 v10, s12, v0 -; GFX10-NEXT: v_add3_u32 v2, v4, v5, v2 -; GFX10-NEXT: v_mul_hi_u32 v7, s10, v1 +; GFX10-NEXT: v_mul_hi_u32 v8, s10, v1 +; GFX10-NEXT: v_add_co_u32 v4, s0, v5, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v0, s0, v9, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 +; GFX10-NEXT: 
v_add_co_u32 v4, s0, v4, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v10 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 +; GFX10-NEXT: v_mul_lo_u32 v10, s10, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v5, v4 ; GFX10-NEXT: v_mul_hi_u32 v1, s11, v1 -; GFX10-NEXT: v_mul_lo_u32 v4, s11, v3 +; GFX10-NEXT: v_mul_lo_u32 v5, s11, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v7, v9, v7 +; GFX10-NEXT: v_mul_hi_u32 v11, s10, v3 +; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v10 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v1, s0, v5, v1 +; GFX10-NEXT: v_add3_u32 v2, v7, v4, v2 +; GFX10-NEXT: v_mul_lo_u32 v5, s13, v0 +; GFX10-NEXT: v_mul_hi_u32 v7, s12, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 ; GFX10-NEXT: v_mul_lo_u32 v13, s12, v0 -; GFX10-NEXT: v_mul_lo_u32 v11, s12, v2 +; GFX10-NEXT: v_mul_lo_u32 v10, s12, v2 ; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 -; GFX10-NEXT: v_mul_hi_u32 v5, s10, v3 ; GFX10-NEXT: v_mul_hi_u32 v3, s11, v3 -; GFX10-NEXT: v_add_co_u32 v1, s0, v4, v1 -; GFX10-NEXT: v_add3_u32 v9, v9, v11, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 -; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, s8, v13 -; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v7 -; GFX10-NEXT: v_sub_nc_u32_e32 v7, s9, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v9, s0, s9, v9, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s12, v10 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s13, v7, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s13, v9 -; GFX10-NEXT: v_add_nc_u32_e32 v6, v8, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, v10, s12 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v15, s0, 0, v7, 
vcc_lo -; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v9 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s13, v7, vcc_lo -; GFX10-NEXT: v_add_nc_u32_e32 v4, v4, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v13, v11, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s13, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s12, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, s0 +; GFX10-NEXT: v_add3_u32 v5, v5, v10, v7 +; GFX10-NEXT: v_sub_co_u32 v7, vcc_lo, s8, v13 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v4, v8 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v9, v6 +; GFX10-NEXT: v_sub_nc_u32_e32 v8, s9, v5 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s0, s9, v5, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s12, v7 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s13, v8, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s13, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v11, vcc_lo, v7, s12 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v13, s0, 0, v8, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v5 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s13, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v9, v10, v9, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s12, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s13, v13 +; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, s0 ; GFX10-NEXT: v_add_co_u32 v6, s0, v1, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v5, s0, v0, 1 +; GFX10-NEXT: v_add_co_u32 v15, s0, v0, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s0, 0, v2, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v15 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v13 ; GFX10-NEXT: v_add3_u32 v3, v4, v1, v3 ; GFX10-NEXT: v_mul_hi_u32 v18, s14, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v13, v8, s0 -; GFX10-NEXT: v_mul_lo_u32 v13, s15, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 
v14, v10, s0 +; GFX10-NEXT: v_mul_lo_u32 v14, s15, v6 ; GFX10-NEXT: v_mul_lo_u32 v17, s14, v3 -; GFX10-NEXT: v_add_co_u32 v1, s0, v5, 1 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX10-NEXT: v_add_co_u32 v1, s0, v15, 1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 ; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, 0, v16, s0 -; GFX10-NEXT: v_sub_co_u32 v19, s0, v14, s12 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX10-NEXT: v_mul_lo_u32 v5, s14, v6 +; GFX10-NEXT: v_sub_co_u32 v19, s0, v11, s12 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v15, v1, vcc_lo +; GFX10-NEXT: v_mul_lo_u32 v15, s14, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc_lo -; GFX10-NEXT: v_add3_u32 v13, v13, v17, v18 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v7, s0, 0, v7, s0 +; GFX10-NEXT: v_add3_u32 v14, v14, v17, v18 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v8, s0, 0, v8, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s11, v13 -; GFX10-NEXT: v_sub_co_u32 v11, s0, s10, v5 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v16, s1, s11, v13, s0 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s11, v14 +; GFX10-NEXT: v_sub_co_u32 v9, s0, s10, v15 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v15, s1, s11, v14, s0 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, s0, s15, v2, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v11 -; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v8 -; GFX10-NEXT: v_cmp_le_u32_e64 s2, s15, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, s0 -; GFX10-NEXT: v_sub_co_u32 v13, s0, v11, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v19, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, s2 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v14, s2, 0, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s15, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v10 +; GFX10-NEXT: 
v_cmp_le_u32_e64 s0, s14, v9 +; GFX10-NEXT: v_cmp_le_u32_e64 s2, s15, v15 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v11, v19, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 +; GFX10-NEXT: v_sub_co_u32 v14, s0, v9, s14 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s2 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s2, 0, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v13, v8, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s15, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, s0, s15, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v8, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s15, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s14, v13 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v11, s1 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s15, v16 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s1 -; GFX10-NEXT: v_add_co_u32 v15, s1, v6, 1 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s14, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s1 +; GFX10-NEXT: v_add_co_u32 v13, s1, v6, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v17, s1, 0, v3, s1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s15, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v10, s1 -; GFX10-NEXT: v_add_co_u32 v10, s1, v15, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s15, v16 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v11, s1 +; GFX10-NEXT: v_add_co_u32 v11, s1, v13, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v18, s1, 0, v17, s1 -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8 -; GFX10-NEXT: v_sub_co_u32 v8, s1, v13, s14 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v10 +; GFX10-NEXT: v_sub_co_u32 v10, s1, v14, s14 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, s1, 0, v2, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v15, v10, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v17, v18, s0 -; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v13, v8, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v14, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v7, vcc_lo -; 
GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v10, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v15, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v11, v8, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v13, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v13, v11, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v13, v17, v18, s0 +; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v14, v10, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v16, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v11, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v7, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v15, v10, s1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v12, v[0:3], s[4:5] ; GFX10-NEXT: global_store_dwordx4 v12, v[4:7], s[6:7] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -191,14 +191,11 @@ ; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s3 ; CHECK-NEXT: s_sub_u32 s4, 0, s2 -; CHECK-NEXT: s_cselect_b32 s5, 1, 0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v2 -; CHECK-NEXT: s_and_b32 s5, s5, 1 +; CHECK-NEXT: s_subb_u32 s5, 0, s3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; CHECK-NEXT: s_cmp_lg_u32 s5, 0 -; CHECK-NEXT: s_subb_u32 s5, 0, s3 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 ; CHECK-NEXT: v_trunc_f32_e32 v2, v2 ; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 @@ -1103,226 +1100,220 @@ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b32 s8, 0x12d8fb ; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s8 ; GISEL-NEXT: s_sub_u32 s6, 0, s8 -; GISEL-NEXT: s_cselect_b32 s4, 1, 0 -; GISEL-NEXT: v_madmk_f32 v6, v4, 0x4f800000, v5 -; GISEL-NEXT: 
s_and_b32 s4, s4, 1 -; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v6 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v4 -; GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GISEL-NEXT: v_madmk_f32 v5, v4, 0x4f800000, v6 ; GISEL-NEXT: s_subb_u32 s7, 0, 0 ; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000 ; GISEL-NEXT: s_bfe_i32 s5, -1, 0x10000 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v5 -; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 +; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v4 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GISEL-NEXT: v_mov_b32_e32 v5, s4 ; GISEL-NEXT: v_mov_b32_e32 v4, s5 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v7 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GISEL-NEXT: s_sub_u32 s9, 0, s8 -; GISEL-NEXT: s_cselect_b32 s4, 1, 0 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; GISEL-NEXT: s_subb_u32 s10, 0, 0 +; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s11, -1, 0x10000 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6 +; GISEL-NEXT: v_mov_b32_e32 v10, s4 ; GISEL-NEXT: v_trunc_f32_e32 v8, v8 ; GISEL-NEXT: v_trunc_f32_e32 v9, v9 -; GISEL-NEXT: s_and_b32 s4, s4, 1 -; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8 +; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v9 +; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9 ; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_lo_u32 v10, s6, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GISEL-NEXT: s_subb_u32 s10, 0, 0 -; GISEL-NEXT: v_mul_lo_u32 v11, s9, v9 -; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s11, -1, 0x10000 -; GISEL-NEXT: v_mul_lo_u32 v12, s6, v6 -; GISEL-NEXT: v_mul_lo_u32 v13, s7, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, s6, v6 -; GISEL-NEXT: 
v_mul_lo_u32 v15, s9, v7 -; GISEL-NEXT: v_mul_lo_u32 v16, s10, v7 -; GISEL-NEXT: v_mul_hi_u32 v17, s9, v7 -; GISEL-NEXT: v_mov_b32_e32 v18, s4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v13, v8, v12 -; GISEL-NEXT: v_mul_hi_u32 v19, v6, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; GISEL-NEXT: v_mul_lo_u32 v16, v9, v15 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v15 -; GISEL-NEXT: v_mul_hi_u32 v15, v9, v15 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v17 -; GISEL-NEXT: v_mul_lo_u32 v17, v7, v11 +; GISEL-NEXT: v_mul_lo_u32 v11, s6, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, s9, v9 +; GISEL-NEXT: v_mul_lo_u32 v13, s6, v7 +; GISEL-NEXT: v_mul_lo_u32 v14, s7, v7 +; GISEL-NEXT: v_mul_hi_u32 v15, s6, v7 +; GISEL-NEXT: v_mul_lo_u32 v16, s9, v6 +; GISEL-NEXT: v_mul_lo_u32 v17, s10, v6 +; GISEL-NEXT: v_mul_hi_u32 v18, s9, v6 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 +; GISEL-NEXT: v_mul_lo_u32 v14, v8, v13 +; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v16 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 +; GISEL-NEXT: v_mul_hi_u32 v15, v6, v16 +; GISEL-NEXT: v_mul_hi_u32 v16, v9, v16 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v6, v12 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; GISEL-NEXT: v_mul_lo_u32 v15, v7, v11 +; GISEL-NEXT: v_mul_lo_u32 v17, v8, v11 +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: 
v_add_i32_e64 v15, s[4:5], v15, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v9, v12 +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v17, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v17, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v17 +; GISEL-NEXT: v_mul_hi_u32 v18, v6, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v17 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v11, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, s6, v7 +; GISEL-NEXT: v_mul_lo_u32 v13, s7, v7 +; GISEL-NEXT: v_mul_hi_u32 v14, s6, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v16 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v12, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, s9, v6 +; GISEL-NEXT: v_mul_lo_u32 v15, s10, v6 +; GISEL-NEXT: v_mul_hi_u32 v16, s9, v6 +; GISEL-NEXT: v_mul_lo_u32 v17, s6, v8 +; GISEL-NEXT: v_mul_lo_u32 v18, v8, v11 +; GISEL-NEXT: v_mul_hi_u32 v19, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 +; GISEL-NEXT: v_mul_lo_u32 v17, s9, v9 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v12 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v12 
+; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v6, v15 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; GISEL-NEXT: v_mul_lo_u32 v14, v6, v10 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v10 -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v19 -; GISEL-NEXT: v_mul_hi_u32 v13, v6, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v8, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v14, v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13 +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v18, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v16, v12 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v9, v15 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v16, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v16, v13 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v7, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v19, v15 +; GISEL-NEXT: v_mul_hi_u32 v17, v6, v15 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v19, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17 +; GISEL-NEXT: v_add_i32_e32 v12, 
vcc, v12, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v16 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, s6, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, s7, v6 -; GISEL-NEXT: v_mul_hi_u32 v13, s6, v6 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v15 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, s9, v7 -; GISEL-NEXT: v_mul_lo_u32 v14, s10, v7 -; GISEL-NEXT: v_mul_hi_u32 v15, s9, v7 -; GISEL-NEXT: v_mul_lo_u32 v16, s6, v8 -; GISEL-NEXT: v_mul_lo_u32 v17, v8, v10 -; GISEL-NEXT: v_mul_hi_u32 v19, v6, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v8, v10 +; GISEL-NEXT: v_mov_b32_e32 v19, s11 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v9, v15 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, s9, v9 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; GISEL-NEXT: v_mul_lo_u32 v15, v7, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v6, v12 -; GISEL-NEXT: v_mul_lo_u32 v15, v8, v12 -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v17, v13 -; GISEL-NEXT: 
v_cndmask_b32_e64 v17, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v19 -; GISEL-NEXT: v_mul_hi_u32 v13, v6, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v17, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v9, v14 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v15, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; GISEL-NEXT: v_mul_hi_u32 v16, v7, v14 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v19, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v16 -; GISEL-NEXT: v_mov_b32_e32 v19, s11 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 -; GISEL-NEXT: v_mul_hi_u32 v14, v9, v14 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v15 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v1, v6 -; GISEL-NEXT: v_mul_hi_u32 v12, v0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v16 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v3, v7 -; GISEL-NEXT: v_mul_hi_u32 v13, v2, v7 -; 
GISEL-NEXT: v_mul_hi_u32 v7, v3, v7 -; GISEL-NEXT: v_mul_lo_u32 v14, v0, v8 -; GISEL-NEXT: v_mul_lo_u32 v15, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v16, v0, v8 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v0, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v3, v6 +; GISEL-NEXT: v_mul_hi_u32 v14, v2, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 +; GISEL-NEXT: v_mul_lo_u32 v15, v0, v8 +; GISEL-NEXT: v_mul_lo_u32 v16, v1, v8 +; GISEL-NEXT: v_mul_hi_u32 v17, v0, v8 ; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 -; GISEL-NEXT: v_mul_lo_u32 v17, v2, v9 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_mul_lo_u32 v11, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v13, v2, v9 +; GISEL-NEXT: v_mul_lo_u32 v18, v2, v9 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_mul_lo_u32 v12, v3, v9 +; GISEL-NEXT: v_mul_hi_u32 v14, v2, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v15, v6 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v16 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v16, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: 
v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v16 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_mul_lo_u32 v12, s8, v6 -; GISEL-NEXT: v_mul_lo_u32 v14, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, s8, v6 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v17 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_mul_lo_u32 v13, s8, v7 ; GISEL-NEXT: v_mul_lo_u32 v15, 0, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, s8, v7 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_mul_lo_u32 v14, s8, v6 +; GISEL-NEXT: v_mul_lo_u32 v16, 0, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, s8, v6 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; GISEL-NEXT: v_mul_lo_u32 v8, s8, v8 ; 
GISEL-NEXT: v_mul_lo_u32 v9, s8, v9 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v15, v9 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 -; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v1, v6, vcc -; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v6 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v15, v8 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v13 +; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v1, v7, vcc +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v7 ; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v13 -; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], v3, v7, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v14 +; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], v3, v6, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v6 ; GISEL-NEXT: v_cmp_le_u32_e64 s[6:7], s8, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[6:7] ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v18, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc ; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5] ; GISEL-NEXT: v_subrev_i32_e32 v7, vcc, s8, v0 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc Index: llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ 
llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -2460,11 +2460,8 @@ define amdgpu_ps i64 @s_usubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX6-LABEL: s_usubsat_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_sub_u32 s4, s0, s2 -; GFX6-NEXT: s_cselect_b32 s5, 1, 0 -; GFX6-NEXT: s_and_b32 s5, s5, 1 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: s_cmp_lg_u32 s5, 0 +; GFX6-NEXT: s_sub_u32 s4, s0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_subb_u32 s5, s1, s3 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] @@ -2478,11 +2475,8 @@ ; ; GFX8-LABEL: s_usubsat_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sub_u32 s4, s0, s2 -; GFX8-NEXT: s_cselect_b32 s5, 1, 0 -; GFX8-NEXT: s_and_b32 s5, s5, 1 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: s_cmp_lg_u32 s5, 0 +; GFX8-NEXT: s_sub_u32 s4, s0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_subb_u32 s5, s1, s3 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] @@ -2496,11 +2490,8 @@ ; ; GFX9-LABEL: s_usubsat_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_sub_u32 s4, s0, s2 -; GFX9-NEXT: s_cselect_b32 s5, 1, 0 -; GFX9-NEXT: s_and_b32 s5, s5, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_cmp_lg_u32 s5, 0 +; GFX9-NEXT: s_sub_u32 s4, s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_subb_u32 s5, s1, s3 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] @@ -2515,10 +2506,7 @@ ; GFX10-LABEL: s_usubsat_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_sub_u32 s4, s0, s2 -; GFX10-NEXT: s_cselect_b32 s5, 1, 0 ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[2:3] -; GFX10-NEXT: s_and_b32 s5, s5, 1 -; GFX10-NEXT: s_cmp_lg_u32 s5, 0 ; GFX10-NEXT: s_subb_u32 s1, s1, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v0, s4, 0, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, 0, s0 @@ -2685,21 +2673,15 @@ define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs) { ; GFX6-LABEL: s_usubsat_v2i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_sub_u32 s8, s0, s4 -; GFX6-NEXT: s_cselect_b32 s9, 1, 0 -; GFX6-NEXT: s_and_b32 s9, s9, 1 -; GFX6-NEXT: s_cmp_lg_u32 s9, 
0 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: s_subb_u32 s9, s1, s5 +; GFX6-NEXT: s_sub_u32 s8, s0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_subb_u32 s9, s1, s5 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX6-NEXT: s_sub_u32 s0, s2, s6 -; GFX6-NEXT: s_cselect_b32 s1, 1, 0 -; GFX6-NEXT: s_and_b32 s1, s1, 1 ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 -; GFX6-NEXT: s_cmp_lg_u32 s1, 0 +; GFX6-NEXT: s_sub_u32 s0, s2, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc @@ -2717,21 +2699,15 @@ ; ; GFX8-LABEL: s_usubsat_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sub_u32 s8, s0, s4 -; GFX8-NEXT: s_cselect_b32 s9, 1, 0 -; GFX8-NEXT: s_and_b32 s9, s9, 1 -; GFX8-NEXT: s_cmp_lg_u32 s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: s_subb_u32 s9, s1, s5 +; GFX8-NEXT: s_sub_u32 s8, s0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_subb_u32 s9, s1, s5 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX8-NEXT: s_sub_u32 s0, s2, s6 -; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_and_b32 s1, s1, 1 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: s_sub_u32 s0, s2, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc @@ -2749,21 +2725,15 @@ ; ; GFX9-LABEL: s_usubsat_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_sub_u32 s8, s0, s4 -; GFX9-NEXT: s_cselect_b32 s9, 1, 0 -; GFX9-NEXT: s_and_b32 s9, s9, 1 -; GFX9-NEXT: s_cmp_lg_u32 s9, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: s_subb_u32 s9, s1, s5 +; GFX9-NEXT: s_sub_u32 s8, s0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_subb_u32 s9, s1, s5 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: s_sub_u32 s0, s2, s6 -; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: 
s_and_b32 s1, s1, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: s_sub_u32 s0, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc @@ -2782,23 +2752,17 @@ ; GFX10-LABEL: s_usubsat_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_sub_u32 s8, s0, s4 -; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: s_and_b32 s9, s9, 1 -; GFX10-NEXT: s_cmp_lg_u32 s9, 0 -; GFX10-NEXT: s_subb_u32 s9, s1, s5 -; GFX10-NEXT: v_cmp_lt_u64_e64 s1, s[0:1], s[4:5] +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] +; GFX10-NEXT: s_subb_u32 s1, s1, s5 ; GFX10-NEXT: s_sub_u32 s0, s2, s6 -; GFX10-NEXT: s_cselect_b32 s4, 1, 0 ; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[2:3], s[6:7] -; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, s8, 0, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, s9, 0, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, 0, s4 ; GFX10-NEXT: s_subb_u32 s1, s3, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v0, s8, 0, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v2, s0, 0, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v3, s1, 0, s2 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: ; return to shader part epilog @@ -2809,28 +2773,19 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX6-LABEL: s_usubsat_i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_sub_u32 s8, s0, s4 -; GFX6-NEXT: s_cselect_b32 s9, 1, 0 -; GFX6-NEXT: s_and_b32 s9, s9, 1 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_cmp_lg_u32 s9, 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NEXT: s_subb_u32 s9, s1, s5 ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] -; GFX6-NEXT: s_cselect_b32 s10, 1, 0 ; GFX6-NEXT: 
v_mov_b32_e32 v1, s7 -; GFX6-NEXT: s_and_b32 s10, s10, 1 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 -; GFX6-NEXT: s_subb_u32 s10, s2, s6 +; GFX6-NEXT: s_sub_u32 s8, s0, s4 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1] -; GFX6-NEXT: s_cselect_b32 s11, 1, 0 -; GFX6-NEXT: s_and_b32 s11, s11, 1 +; GFX6-NEXT: s_subb_u32 s9, s1, s5 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX6-NEXT: s_cmp_lg_u32 s11, 0 +; GFX6-NEXT: s_subb_u32 s10, s2, s6 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_subb_u32 s11, s3, s7 ; GFX6-NEXT: v_mov_b32_e32 v1, s8 @@ -2851,18 +2806,9 @@ ; GFX8-LABEL: s_usubsat_i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_u32 s8, s0, s4 -; GFX8-NEXT: s_cselect_b32 s9, 1, 0 -; GFX8-NEXT: s_and_b32 s9, s9, 1 -; GFX8-NEXT: s_cmp_lg_u32 s9, 0 ; GFX8-NEXT: s_subb_u32 s9, s1, s5 -; GFX8-NEXT: s_cselect_b32 s10, 1, 0 -; GFX8-NEXT: s_and_b32 s10, s10, 1 -; GFX8-NEXT: s_cmp_lg_u32 s10, 0 -; GFX8-NEXT: s_subb_u32 s10, s2, s6 -; GFX8-NEXT: s_cselect_b32 s11, 1, 0 -; GFX8-NEXT: s_and_b32 s11, s11, 1 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_cmp_lg_u32 s11, 0 +; GFX8-NEXT: s_subb_u32 s10, s2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_subb_u32 s11, s3, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -2895,18 +2841,9 @@ ; GFX9-LABEL: s_usubsat_i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s8, s0, s4 -; GFX9-NEXT: s_cselect_b32 s9, 1, 0 -; GFX9-NEXT: s_and_b32 s9, s9, 1 -; GFX9-NEXT: s_cmp_lg_u32 s9, 0 ; GFX9-NEXT: s_subb_u32 s9, s1, s5 -; GFX9-NEXT: s_cselect_b32 s10, 1, 0 -; GFX9-NEXT: s_and_b32 s10, s10, 1 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 -; GFX9-NEXT: s_subb_u32 s10, s2, s6 -; GFX9-NEXT: s_cselect_b32 s11, 1, 0 -; GFX9-NEXT: s_and_b32 s11, s11, 1 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: s_cmp_lg_u32 s11, 0 +; GFX9-NEXT: s_subb_u32 s10, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: s_subb_u32 s11, s3, s7 ; 
GFX9-NEXT: v_mov_b32_e32 v0, s6 @@ -2939,33 +2876,24 @@ ; GFX10-LABEL: s_usubsat_i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_sub_u32 s8, s0, s4 -; GFX10-NEXT: s_cselect_b32 s9, 1, 0 ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[4:5] -; GFX10-NEXT: s_and_b32 s9, s9, 1 -; GFX10-NEXT: s_cmp_lg_u32 s9, 0 ; GFX10-NEXT: s_subb_u32 s9, s1, s5 -; GFX10-NEXT: s_cselect_b32 s10, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX10-NEXT: s_and_b32 s10, s10, 1 -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 ; GFX10-NEXT: s_subb_u32 s10, s2, s6 -; GFX10-NEXT: s_cselect_b32 s11, 1, 0 -; GFX10-NEXT: s_and_b32 s11, s11, 1 -; GFX10-NEXT: s_cmp_lg_u32 s11, 0 -; GFX10-NEXT: s_subb_u32 s1, s3, s7 +; GFX10-NEXT: s_subb_u32 s11, s3, s7 ; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] -; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[2:3], s[6:7] -; GFX10-NEXT: s_cselect_b32 s0, 1, 0 -; GFX10-NEXT: s_and_b32 s0, 1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], s[6:7] +; GFX10-NEXT: s_cselect_b32 s12, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX10-NEXT: s_and_b32 s0, 1, s12 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, s8, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, s9, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, s10, 0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, s1, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, s11, 0, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 @@ -3319,61 +3247,43 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs) { ; GFX6-LABEL: s_usubsat_v2i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_sub_u32 s16, s0, s8 -; GFX6-NEXT: s_cselect_b32 s17, 1, 0 -; GFX6-NEXT: s_and_b32 s17, s17, 1 -; GFX6-NEXT: 
s_cmp_lg_u32 s17, 0 -; GFX6-NEXT: s_subb_u32 s17, s1, s9 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: s_cselect_b32 s18, 1, 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 -; GFX6-NEXT: s_and_b32 s18, s18, 1 ; GFX6-NEXT: v_mov_b32_e32 v0, s10 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] -; GFX6-NEXT: s_cmp_lg_u32 s18, 0 ; GFX6-NEXT: v_mov_b32_e32 v1, s11 -; GFX6-NEXT: s_subb_u32 s18, s2, s10 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX6-NEXT: s_cselect_b32 s19, 1, 0 -; GFX6-NEXT: s_and_b32 s19, s19, 1 +; GFX6-NEXT: s_sub_u32 s16, s0, s8 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1] -; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_subb_u32 s19, s3, s11 +; GFX6-NEXT: s_subb_u32 s17, s1, s9 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_sub_u32 s0, s4, s12 +; GFX6-NEXT: s_subb_u32 s18, s2, s10 ; GFX6-NEXT: v_mov_b32_e32 v2, s17 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: s_cselect_b32 s1, 1, 0 +; GFX6-NEXT: s_subb_u32 s19, s3, s11 ; GFX6-NEXT: v_mov_b32_e32 v1, s16 ; GFX6-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc -; GFX6-NEXT: s_and_b32 s1, s1, 1 ; GFX6-NEXT: v_mov_b32_e32 v2, s12 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc ; GFX6-NEXT: v_mov_b32_e32 v0, s18 ; GFX6-NEXT: v_mov_b32_e32 v1, s19 -; GFX6-NEXT: s_cmp_lg_u32 s1, 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s13 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v0, 0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v7, v1, 0, vcc -; GFX6-NEXT: s_subb_u32 s1, s5, s13 ; GFX6-NEXT: v_mov_b32_e32 v0, s14 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] -; GFX6-NEXT: s_cselect_b32 s2, 1, 0 ; GFX6-NEXT: v_mov_b32_e32 v1, s15 -; GFX6-NEXT: s_and_b32 s2, s2, 1 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_subb_u32 s2, s6, s14 +; GFX6-NEXT: s_sub_u32 s0, s4, s12 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 
0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1] -; GFX6-NEXT: s_cselect_b32 s3, 1, 0 -; GFX6-NEXT: s_and_b32 s3, s3, 1 +; GFX6-NEXT: s_subb_u32 s1, s5, s13 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX6-NEXT: s_cmp_lg_u32 s3, 0 +; GFX6-NEXT: s_subb_u32 s2, s6, s14 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_subb_u32 s3, s7, s15 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 @@ -3398,18 +3308,9 @@ ; GFX8-LABEL: s_usubsat_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_u32 s16, s0, s8 -; GFX8-NEXT: s_cselect_b32 s17, 1, 0 -; GFX8-NEXT: s_and_b32 s17, s17, 1 -; GFX8-NEXT: s_cmp_lg_u32 s17, 0 ; GFX8-NEXT: s_subb_u32 s17, s1, s9 -; GFX8-NEXT: s_cselect_b32 s18, 1, 0 -; GFX8-NEXT: s_and_b32 s18, s18, 1 -; GFX8-NEXT: s_cmp_lg_u32 s18, 0 -; GFX8-NEXT: s_subb_u32 s18, s2, s10 -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 -; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 -; GFX8-NEXT: s_cmp_lg_u32 s19, 0 +; GFX8-NEXT: s_subb_u32 s18, s2, s10 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: s_subb_u32 s19, s3, s11 ; GFX8-NEXT: v_mov_b32_e32 v0, s10 @@ -3422,28 +3323,19 @@ ; GFX8-NEXT: s_and_b32 s0, 1, s10 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_sub_u32 s0, s4, s12 -; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_and_b32 s1, s1, 1 -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 -; GFX8-NEXT: s_subb_u32 s1, s5, s13 -; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: s_and_b32 s2, s2, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_subb_u32 s2, s6, s14 ; GFX8-NEXT: v_mov_b32_e32 v2, s17 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_cselect_b32 s3, 1, 0 +; GFX8-NEXT: s_sub_u32 s0, s4, s12 ; GFX8-NEXT: v_mov_b32_e32 v1, s16 ; GFX8-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc -; GFX8-NEXT: s_and_b32 s3, s3, 1 +; GFX8-NEXT: s_subb_u32 s1, s5, s13 ; GFX8-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, 0, 
vcc ; GFX8-NEXT: v_mov_b32_e32 v0, s18 ; GFX8-NEXT: v_mov_b32_e32 v1, s19 -; GFX8-NEXT: s_cmp_lg_u32 s3, 0 +; GFX8-NEXT: s_subb_u32 s2, s6, s14 ; GFX8-NEXT: v_mov_b32_e32 v3, s13 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, 0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v7, v1, 0, vcc @@ -3482,18 +3374,9 @@ ; GFX9-LABEL: s_usubsat_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s16, s0, s8 -; GFX9-NEXT: s_cselect_b32 s17, 1, 0 -; GFX9-NEXT: s_and_b32 s17, s17, 1 -; GFX9-NEXT: s_cmp_lg_u32 s17, 0 ; GFX9-NEXT: s_subb_u32 s17, s1, s9 -; GFX9-NEXT: s_cselect_b32 s18, 1, 0 -; GFX9-NEXT: s_and_b32 s18, s18, 1 -; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_subb_u32 s18, s2, s10 -; GFX9-NEXT: s_cselect_b32 s19, 1, 0 -; GFX9-NEXT: s_and_b32 s19, s19, 1 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_subb_u32 s18, s2, s10 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: s_subb_u32 s19, s3, s11 ; GFX9-NEXT: v_mov_b32_e32 v0, s10 @@ -3506,28 +3389,19 @@ ; GFX9-NEXT: s_and_b32 s0, 1, s10 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_sub_u32 s0, s4, s12 -; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: s_and_b32 s1, s1, 1 -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 -; GFX9-NEXT: s_subb_u32 s1, s5, s13 -; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: s_and_b32 s2, s2, 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_subb_u32 s2, s6, s14 ; GFX9-NEXT: v_mov_b32_e32 v2, s17 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_cselect_b32 s3, 1, 0 +; GFX9-NEXT: s_sub_u32 s0, s4, s12 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc -; GFX9-NEXT: s_and_b32 s3, s3, 1 +; GFX9-NEXT: s_subb_u32 s1, s5, s13 ; GFX9-NEXT: v_mov_b32_e32 v2, s12 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc ; GFX9-NEXT: v_mov_b32_e32 v0, s18 ; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: s_cmp_lg_u32 s3, 0 +; GFX9-NEXT: 
s_subb_u32 s2, s6, s14 ; GFX9-NEXT: v_mov_b32_e32 v3, s13 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v7, v1, 0, vcc @@ -3566,69 +3440,51 @@ ; GFX10-LABEL: s_usubsat_v2i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_sub_u32 s16, s0, s8 -; GFX10-NEXT: s_cselect_b32 s17, 1, 0 ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[8:9] -; GFX10-NEXT: s_and_b32 s17, s17, 1 -; GFX10-NEXT: s_cmp_lg_u32 s17, 0 ; GFX10-NEXT: s_subb_u32 s17, s1, s9 -; GFX10-NEXT: s_cselect_b32 s18, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX10-NEXT: s_and_b32 s18, s18, 1 -; GFX10-NEXT: s_cmp_lg_u32 s18, 0 ; GFX10-NEXT: s_subb_u32 s18, s2, s10 -; GFX10-NEXT: s_cselect_b32 s19, 1, 0 -; GFX10-NEXT: s_and_b32 s19, s19, 1 -; GFX10-NEXT: s_cmp_lg_u32 s19, 0 ; GFX10-NEXT: s_subb_u32 s19, s3, s11 ; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] -; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[2:3], s[10:11] +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], s[10:11] ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s0, 1, s20 -; GFX10-NEXT: s_sub_u32 s8, s4, s12 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 +; GFX10-NEXT: s_sub_u32 s2, s4, s12 +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[4:5], s[12:13] ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 -; GFX10-NEXT: s_subb_u32 s3, s5, s13 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s1, s[4:5], s[12:13] -; GFX10-NEXT: s_subb_u32 s10, s6, s14 -; GFX10-NEXT: s_cselect_b32 s0, 1, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: s_and_b32 s0, s0, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 -; GFX10-NEXT: s_cmp_lg_u32 s0, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s1, s[6:7], s[14:15] -; GFX10-NEXT: 
s_subb_u32 s9, s7, s15 +; GFX10-NEXT: s_subb_u32 s1, s5, s13 +; GFX10-NEXT: s_subb_u32 s8, s6, s14 +; GFX10-NEXT: s_subb_u32 s3, s7, s15 ; GFX10-NEXT: s_cmp_eq_u64 s[6:7], s[14:15] +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[14:15] ; GFX10-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10-NEXT: s_and_b32 s0, 1, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, s16, 0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, s17, 0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, s18, 0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, s19, 0, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v1 -; GFX10-NEXT: v_readfirstlane_b32 s1, v2 -; GFX10-NEXT: v_readfirstlane_b32 s2, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v0, s8, 0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, s3, 0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, s10, 0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, s9, 0, vcc_lo -; GFX10-NEXT: v_readfirstlane_b32 s3, v4 -; GFX10-NEXT: v_readfirstlane_b32 s4, v0 -; GFX10-NEXT: v_readfirstlane_b32 s5, v1 -; GFX10-NEXT: v_readfirstlane_b32 s6, v2 -; GFX10-NEXT: v_readfirstlane_b32 s7, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, s16, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, s18, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, s19, 0, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, s17, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, s2, 0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, s1, 0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, s8, 0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, s3, 0, s0 +; 
GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-NEXT: v_readfirstlane_b32 s6, v6 +; GFX10-NEXT: v_readfirstlane_b32 s7, v7 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) ret <2 x i128> %result Index: llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll @@ -190,9 +190,6 @@ ; GCN-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3] ; GCN-NEXT: s_not_b64 s[4:5], s[2:3] ; GCN-NEXT: s_add_u32 s2, s2, s0 -; GCN-NEXT: s_cselect_b32 s0, 1, 0 -; GCN-NEXT: s_and_b32 s0, s0, 1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s3, s3, s1 ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 @@ -203,11 +200,8 @@ ; GFX10-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3] ; GFX10-NEXT: s_not_b64 s[4:5], s[2:3] ; GFX10-NEXT: s_add_u32 s2, s2, s0 -; GFX10-NEXT: s_cselect_b32 s0, 1, 0 -; GFX10-NEXT: s_and_b32 s0, s0, 1 -; GFX10-NEXT: s_cmp_lg_u32 s0, 0 -; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: s_addc_u32 s3, s3, s1 +; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: s_mov_b32 s1, s5 ; GFX10-NEXT: ; return to shader part epilog %xor = xor i64 %a, %b Index: llvm/test/CodeGen/AMDGPU/bfi_int.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/bfi_int.ll +++ llvm/test/CodeGen/AMDGPU/bfi_int.ll @@ -1616,9 +1616,6 @@ ; GFX8-GISEL-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10 -; GFX8-GISEL-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-GISEL-NEXT: s_and_b32 s2, s2, 1 -; GFX8-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; 
GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -1635,9 +1632,6 @@ ; GFX10-GISEL-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10 -; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0 -; GFX10-GISEL-NEXT: s_and_b32 s2, s2, 1 -; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -1710,9 +1704,6 @@ ; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] ; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] ; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10 -; GFX8-GISEL-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-GISEL-NEXT: s_and_b32 s2, s2, 1 -; GFX8-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -1729,9 +1720,6 @@ ; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] ; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10 -; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0 -; GFX10-GISEL-NEXT: s_and_b32 s2, s2, 1 -; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -1804,9 +1792,6 @@ ; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] ; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] ; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10 -; GFX8-GISEL-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-GISEL-NEXT: s_and_b32 s2, s2, 1 -; GFX8-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -1823,9 +1808,6 @@ ; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] ; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10 -; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0 -; GFX10-GISEL-NEXT: s_and_b32 s2, s2, 1 -; 
GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -1902,9 +1884,6 @@ ; GFX8-GISEL-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] ; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10 -; GFX8-GISEL-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-GISEL-NEXT: s_and_b32 s2, s2, 1 -; GFX8-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -1922,9 +1901,6 @@ ; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3] ; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10 -; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0 -; GFX10-GISEL-NEXT: s_and_b32 s2, s2, 1 -; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 Index: llvm/test/CodeGen/AMDGPU/constrained-shift.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/constrained-shift.ll +++ llvm/test/CodeGen/AMDGPU/constrained-shift.ll @@ -283,14 +283,8 @@ ; GISEL-NEXT: s_lshr_b64 s[6:7], s[0:1], s2 ; GISEL-NEXT: s_ashr_i64 s[0:1], s[0:1], s2 ; GISEL-NEXT: s_add_u32 s2, s4, s6 -; GISEL-NEXT: s_cselect_b32 s3, 1, 0 -; GISEL-NEXT: s_and_b32 s3, s3, 1 -; GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GISEL-NEXT: s_addc_u32 s3, s5, s7 ; GISEL-NEXT: s_add_u32 s0, s2, s0 -; GISEL-NEXT: s_cselect_b32 s2, 1, 0 -; GISEL-NEXT: s_and_b32 s2, s2, 1 -; GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GISEL-NEXT: s_addc_u32 s1, s3, s1 ; GISEL-NEXT: ; return to shader part epilog %and = and i64 %b, 63 @@ -322,14 +316,8 @@ ; GISEL-NEXT: s_lshr_b64 s[6:7], s[0:1], s2 ; GISEL-NEXT: s_ashr_i64 s[0:1], s[0:1], s2 ; GISEL-NEXT: s_add_u32 s2, s4, s6 -; GISEL-NEXT: s_cselect_b32 s3, 1, 0 -; GISEL-NEXT: s_and_b32 s3, s3, 1 -; GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; 
GISEL-NEXT: s_addc_u32 s3, s5, s7 ; GISEL-NEXT: s_add_u32 s0, s2, s0 -; GISEL-NEXT: s_cselect_b32 s2, 1, 0 -; GISEL-NEXT: s_and_b32 s2, s2, 1 -; GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GISEL-NEXT: s_addc_u32 s1, s3, s1 ; GISEL-NEXT: ; return to shader part epilog %and = and i64 %b, 255 Index: llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp =================================================================== --- llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp +++ llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp @@ -1972,3 +1972,24 @@ CheckBits(30, Copies.size() - 2); CheckBits(5, Copies.size() - 1); } + +TEST_F(AArch64GISelMITest, TestKnownBitsUADDO) { + StringRef MIRString = R"( + %ptr:_(p0) = G_IMPLICIT_DEF + %ld0:_(s32) = G_LOAD %ptr(p0) :: (load (s16)) + %ld1:_(s32) = G_LOAD %ptr(p0) :: (load (s16)) + + %add:_(s32), %overflow:_(s32) = G_UADDO %ld0, %ld1 + %copy_overflow:_(s32) = COPY %overflow +)"; + + setUp(MIRString); + if (!TM) + return; + + Register CopyOverflow = Copies[Copies.size() - 1]; + GISelKnownBits Info(*MF); + KnownBits Res = Info.getKnownBits(CopyOverflow); + EXPECT_EQ(0u, Res.One.getZExtValue()); + EXPECT_EQ(31u, Res.Zero.countLeadingOnes()); +} Index: llvm/unittests/CodeGen/GlobalISel/KnownBitsVectorTest.cpp =================================================================== --- llvm/unittests/CodeGen/GlobalISel/KnownBitsVectorTest.cpp +++ llvm/unittests/CodeGen/GlobalISel/KnownBitsVectorTest.cpp @@ -1527,3 +1527,24 @@ EXPECT_EQ(0u, Res.One.getZExtValue()); EXPECT_EQ(0xFFFFFFFFFFFFFFF8u, Res.Zero.getZExtValue()); } + +TEST_F(AArch64GISelMITest, TestNumSignBitsUAddoOverflow) { + StringRef MIRString = R"( + %copy_x0:_(s64) = COPY $x0 + %copy_x1:_(s64) = COPY $x1 + %x0_x1:_(<2 x s64>) = G_BUILD_VECTOR %copy_x0, %copy_x1 + %uaddo:_(<2 x s64>), %overflow:_(<2 x s32>) = G_UADDO %x0_x1, %x0_x1 + %result:_(<2 x s32>) = COPY %overflow +)"; + + setUp(MIRString); + if (!TM) + return; + + Register CopyOverflow = Copies[Copies.size() - 1]; + + 
GISelKnownBits Info(*MF); + + // Assert sign-extension from vector boolean + EXPECT_EQ(32u, Info.computeNumSignBits(CopyOverflow)); +}