Index: llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -816,7 +816,7 @@ // Shift cannot use a min/max expansion, we can't detect overflow if all of // the bits have been shifted out. - if (IsShift || TLI.isOperationLegalOrCustom(Opcode, PromotedType)) { + if (IsShift || TLI.isOperationLegal(Opcode, PromotedType)) { unsigned ShiftOp; switch (Opcode) { case ISD::SADDSAT: Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -8103,14 +8103,12 @@ return DAG.getSelect(dl, VT, Overflow, Zero, SumDiff); } - // SatMax -> Overflow && SumDiff < 0 - // SatMin -> Overflow && SumDiff >= 0 + // Overflow ? (SumDiff >> BW) ^ MinVal : SumDiff APInt MinVal = APInt::getSignedMinValue(BitWidth); - APInt MaxVal = APInt::getSignedMaxValue(BitWidth); SDValue SatMin = DAG.getConstant(MinVal, dl, VT); - SDValue SatMax = DAG.getConstant(MaxVal, dl, VT); - SDValue SumNeg = DAG.getSetCC(dl, BoolVT, SumDiff, Zero, ISD::SETLT); - Result = DAG.getSelect(dl, VT, SumNeg, SatMax, SatMin); + SDValue Shift = DAG.getNode(ISD::SRA, dl, VT, SumDiff, + DAG.getConstant(BitWidth - 1, dl, VT)); + Result = DAG.getNode(ISD::XOR, dl, VT, Shift, SatMin); return DAG.getSelect(dl, VT, Overflow, Result, SumDiff); } Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -207,6 +207,13 @@ setOperationAction(ISD::ABS , MVT::i64 , Custom); } + // Signed saturation subtraction. + setOperationAction(ISD::SSUBSAT , MVT::i8 , Custom); + setOperationAction(ISD::SSUBSAT , MVT::i16 , Custom); + setOperationAction(ISD::SSUBSAT , MVT::i32 , Custom); + if (Subtarget.is64Bit()) + setOperationAction(ISD::SSUBSAT , MVT::i64 , Custom); + // Funnel shifts. for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) { // For slow shld targets we only lower for code size. @@ -27958,6 +27965,23 @@ return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT)); } + if (Opcode == ISD::SSUBSAT && !VT.isVector()) { + unsigned BitWidth = VT.getScalarSizeInBits(); + APInt MinVal = APInt::getSignedMinValue(BitWidth); + APInt MaxVal = APInt::getSignedMaxValue(BitWidth); + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue Result = + DAG.getNode(ISD::SSUBO, DL, DAG.getVTList(VT, SetCCResultType), X, Y); + SDValue SumDiff = Result.getValue(0); + SDValue Overflow = Result.getValue(1); + SDValue SatMin = DAG.getConstant(MinVal, DL, VT); + SDValue SatMax = DAG.getConstant(MaxVal, DL, VT); + SDValue SumNeg = + DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT); + Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin); + return DAG.getSelect(DL, VT, Overflow, Result, SumDiff); + } + // Use default expansion. 
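  // For the remaining cases the generic expansion in TargetLowering derives
  // the saturation constant from the sign of the wrapped result: on signed
  // overflow the true result and SumDiff have opposite signs, so
  //   Sat = (SumDiff >>s (BitWidth - 1)) ^ MinVal
  // yields MaxVal when SumDiff is negative and MinVal otherwise, saving the
  // MaxVal materialization and the extra compare. A minimal scalar sketch of
  // the same trick (hypothetical helper, for illustration only; it relies on
  // the GCC/Clang __builtin_sub_overflow builtin):
  //
  //   int32_t ssub_sat_i32(int32_t x, int32_t y) {
  //     int32_t d;
  //     if (!__builtin_sub_overflow(x, y, &d))
  //       return d;                    // no overflow: plain difference
  //     // d >> 31 is 0 or -1; xor with INT32_MIN turns that into
  //     // INT32_MIN or INT32_MAX, i.e. the saturation value.
  //     return (d >> 31) ^ INT32_MIN;
  //   }
  //
  // The i8/i16/i32/i64 SSUBSAT lowering above keeps the previous two-select
  // form instead, which maps naturally onto CMOV on X86.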
return SDValue(); } Index: llvm/test/CodeGen/AArch64/sadd_sat.ll =================================================================== --- llvm/test/CodeGen/AArch64/sadd_sat.ll +++ llvm/test/CodeGen/AArch64/sadd_sat.ll @@ -12,11 +12,9 @@ ; CHECK-LABEL: func: ; CHECK: // %bb.0: ; CHECK-NEXT: adds w8, w0, w1 -; CHECK-NEXT: mov w9, #2147483647 -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: cinv w8, w9, ge -; CHECK-NEXT: adds w9, w0, w1 -; CHECK-NEXT: csel w0, w8, w9, vs +; CHECK-NEXT: asr w9, w8, #31 +; CHECK-NEXT: eor w9, w9, #0x80000000 +; CHECK-NEXT: csel w0, w9, w8, vs ; CHECK-NEXT: ret %tmp = call i32 @llvm.sadd.sat.i32(i32 %x, i32 %y); ret i32 %tmp; @@ -26,11 +24,9 @@ ; CHECK-LABEL: func2: ; CHECK: // %bb.0: ; CHECK-NEXT: adds x8, x0, x1 -; CHECK-NEXT: mov x9, #9223372036854775807 -; CHECK-NEXT: cmp x8, #0 -; CHECK-NEXT: cinv x8, x9, ge -; CHECK-NEXT: adds x9, x0, x1 -; CHECK-NEXT: csel x0, x8, x9, vs +; CHECK-NEXT: asr x9, x8, #63 +; CHECK-NEXT: eor x9, x9, #0x8000000000000000 +; CHECK-NEXT: csel x0, x9, x8, vs ; CHECK-NEXT: ret %tmp = call i64 @llvm.sadd.sat.i64(i64 %x, i64 %y); ret i64 %tmp; Index: llvm/test/CodeGen/AArch64/sadd_sat_plus.ll =================================================================== --- llvm/test/CodeGen/AArch64/sadd_sat_plus.ll +++ llvm/test/CodeGen/AArch64/sadd_sat_plus.ll @@ -11,11 +11,9 @@ ; CHECK-LABEL: func32: ; CHECK: // %bb.0: ; CHECK-NEXT: mul w8, w1, w2 -; CHECK-NEXT: adds w10, w0, w8 -; CHECK-NEXT: mov w9, #2147483647 -; CHECK-NEXT: cmp w10, #0 -; CHECK-NEXT: cinv w9, w9, ge ; CHECK-NEXT: adds w8, w0, w8 +; CHECK-NEXT: asr w9, w8, #31 +; CHECK-NEXT: eor w9, w9, #0x80000000 ; CHECK-NEXT: csel w0, w9, w8, vs ; CHECK-NEXT: ret %a = mul i32 %y, %z @@ -27,11 +25,9 @@ ; CHECK-LABEL: func64: ; CHECK: // %bb.0: ; CHECK-NEXT: adds x8, x0, x2 -; CHECK-NEXT: mov x9, #9223372036854775807 -; CHECK-NEXT: cmp x8, #0 -; CHECK-NEXT: cinv x8, x9, ge -; CHECK-NEXT: adds x9, x0, x2 -; CHECK-NEXT: csel x0, x8, x9, vs +; CHECK-NEXT: asr x9, x8, #63 +; CHECK-NEXT: eor x9, x9, #0x8000000000000000 +; CHECK-NEXT: csel x0, x9, x8, vs ; CHECK-NEXT: ret %a = mul i64 %y, %z %tmp = call i64 @llvm.sadd.sat.i64(i64 %x, i64 %z) Index: llvm/test/CodeGen/AArch64/sadd_sat_vec.ll =================================================================== --- llvm/test/CodeGen/AArch64/sadd_sat_vec.ll +++ llvm/test/CodeGen/AArch64/sadd_sat_vec.ll @@ -351,26 +351,23 @@ ; CHECK-LABEL: v2i128: ; CHECK: // %bb.0: ; CHECK-NEXT: adds x8, x2, x6 -; CHECK-NEXT: adcs x12, x3, x7 -; CHECK-NEXT: mov x9, #9223372036854775807 -; CHECK-NEXT: eor x10, x3, x7 -; CHECK-NEXT: cmp x12, #0 -; CHECK-NEXT: eor x13, x3, x12 -; CHECK-NEXT: cinv x14, x9, ge -; CHECK-NEXT: bics xzr, x13, x10 -; CHECK-NEXT: asr x10, x12, #63 -; CHECK-NEXT: csel x2, x10, x8, lt -; CHECK-NEXT: csel x3, x14, x12, lt +; CHECK-NEXT: adcs x11, x3, x7 +; CHECK-NEXT: eor x9, x3, x7 +; CHECK-NEXT: eor x12, x3, x11 +; CHECK-NEXT: bics xzr, x12, x9 +; CHECK-NEXT: asr x9, x11, #63 +; CHECK-NEXT: eor x12, x9, #0x8000000000000000 +; CHECK-NEXT: csel x2, x9, x8, lt +; CHECK-NEXT: csel x3, x12, x11, lt ; CHECK-NEXT: adds x8, x0, x4 -; CHECK-NEXT: adcs x10, x1, x5 -; CHECK-NEXT: eor x11, x1, x5 -; CHECK-NEXT: cmp x10, #0 -; CHECK-NEXT: eor x12, x1, x10 -; CHECK-NEXT: cinv x9, x9, ge -; CHECK-NEXT: bics xzr, x12, x11 -; CHECK-NEXT: asr x11, x10, #63 +; CHECK-NEXT: adcs x9, x1, x5 +; CHECK-NEXT: eor x10, x1, x5 +; CHECK-NEXT: eor x12, x1, x9 +; CHECK-NEXT: asr x11, x9, #63 +; CHECK-NEXT: bics xzr, x12, x10 +; CHECK-NEXT: eor x13, x11, #0x8000000000000000 ; 
CHECK-NEXT: csel x8, x11, x8, lt -; CHECK-NEXT: csel x1, x9, x10, lt +; CHECK-NEXT: csel x1, x13, x9, lt ; CHECK-NEXT: fmov d0, x8 ; CHECK-NEXT: mov v0.d[1], x1 ; CHECK-NEXT: fmov x0, d0 Index: llvm/test/CodeGen/AArch64/ssub_sat.ll =================================================================== --- llvm/test/CodeGen/AArch64/ssub_sat.ll +++ llvm/test/CodeGen/AArch64/ssub_sat.ll @@ -12,11 +12,9 @@ ; CHECK-LABEL: func: ; CHECK: // %bb.0: ; CHECK-NEXT: subs w8, w0, w1 -; CHECK-NEXT: mov w9, #2147483647 -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: cinv w8, w9, ge -; CHECK-NEXT: subs w9, w0, w1 -; CHECK-NEXT: csel w0, w8, w9, vs +; CHECK-NEXT: asr w9, w8, #31 +; CHECK-NEXT: eor w9, w9, #0x80000000 +; CHECK-NEXT: csel w0, w9, w8, vs ; CHECK-NEXT: ret %tmp = call i32 @llvm.ssub.sat.i32(i32 %x, i32 %y); ret i32 %tmp; @@ -26,11 +24,9 @@ ; CHECK-LABEL: func2: ; CHECK: // %bb.0: ; CHECK-NEXT: subs x8, x0, x1 -; CHECK-NEXT: mov x9, #9223372036854775807 -; CHECK-NEXT: cmp x8, #0 -; CHECK-NEXT: cinv x8, x9, ge -; CHECK-NEXT: subs x9, x0, x1 -; CHECK-NEXT: csel x0, x8, x9, vs +; CHECK-NEXT: asr x9, x8, #63 +; CHECK-NEXT: eor x9, x9, #0x8000000000000000 +; CHECK-NEXT: csel x0, x9, x8, vs ; CHECK-NEXT: ret %tmp = call i64 @llvm.ssub.sat.i64(i64 %x, i64 %y); ret i64 %tmp; Index: llvm/test/CodeGen/AArch64/ssub_sat_plus.ll =================================================================== --- llvm/test/CodeGen/AArch64/ssub_sat_plus.ll +++ llvm/test/CodeGen/AArch64/ssub_sat_plus.ll @@ -11,11 +11,9 @@ ; CHECK-LABEL: func32: ; CHECK: // %bb.0: ; CHECK-NEXT: mul w8, w1, w2 -; CHECK-NEXT: subs w10, w0, w8 -; CHECK-NEXT: mov w9, #2147483647 -; CHECK-NEXT: cmp w10, #0 -; CHECK-NEXT: cinv w9, w9, ge ; CHECK-NEXT: subs w8, w0, w8 +; CHECK-NEXT: asr w9, w8, #31 +; CHECK-NEXT: eor w9, w9, #0x80000000 ; CHECK-NEXT: csel w0, w9, w8, vs ; CHECK-NEXT: ret %a = mul i32 %y, %z @@ -27,11 +25,9 @@ ; CHECK-LABEL: func64: ; CHECK: // %bb.0: ; CHECK-NEXT: subs x8, x0, x2 -; CHECK-NEXT: mov x9, #9223372036854775807 -; CHECK-NEXT: cmp x8, #0 -; CHECK-NEXT: cinv x8, x9, ge -; CHECK-NEXT: subs x9, x0, x2 -; CHECK-NEXT: csel x0, x8, x9, vs +; CHECK-NEXT: asr x9, x8, #63 +; CHECK-NEXT: eor x9, x9, #0x8000000000000000 +; CHECK-NEXT: csel x0, x9, x8, vs ; CHECK-NEXT: ret %a = mul i64 %y, %z %tmp = call i64 @llvm.ssub.sat.i64(i64 %x, i64 %z) Index: llvm/test/CodeGen/AArch64/ssub_sat_vec.ll =================================================================== --- llvm/test/CodeGen/AArch64/ssub_sat_vec.ll +++ llvm/test/CodeGen/AArch64/ssub_sat_vec.ll @@ -354,26 +354,23 @@ ; CHECK-LABEL: v2i128: ; CHECK: // %bb.0: ; CHECK-NEXT: subs x8, x2, x6 -; CHECK-NEXT: sbcs x12, x3, x7 -; CHECK-NEXT: mov x9, #9223372036854775807 -; CHECK-NEXT: eor x10, x3, x7 -; CHECK-NEXT: cmp x12, #0 -; CHECK-NEXT: eor x13, x3, x12 -; CHECK-NEXT: cinv x14, x9, ge -; CHECK-NEXT: tst x10, x13 -; CHECK-NEXT: asr x10, x12, #63 -; CHECK-NEXT: csel x2, x10, x8, lt -; CHECK-NEXT: csel x3, x14, x12, lt +; CHECK-NEXT: sbcs x11, x3, x7 +; CHECK-NEXT: eor x9, x3, x7 +; CHECK-NEXT: eor x12, x3, x11 +; CHECK-NEXT: tst x9, x12 +; CHECK-NEXT: asr x9, x11, #63 +; CHECK-NEXT: eor x12, x9, #0x8000000000000000 +; CHECK-NEXT: csel x2, x9, x8, lt +; CHECK-NEXT: csel x3, x12, x11, lt ; CHECK-NEXT: subs x8, x0, x4 -; CHECK-NEXT: sbcs x10, x1, x5 -; CHECK-NEXT: eor x11, x1, x5 -; CHECK-NEXT: cmp x10, #0 -; CHECK-NEXT: eor x12, x1, x10 -; CHECK-NEXT: cinv x9, x9, ge -; CHECK-NEXT: tst x11, x12 -; CHECK-NEXT: asr x11, x10, #63 +; CHECK-NEXT: sbcs x9, x1, x5 +; CHECK-NEXT: eor x10, x1, x5 +; 
CHECK-NEXT: eor x12, x1, x9 +; CHECK-NEXT: asr x11, x9, #63 +; CHECK-NEXT: tst x10, x12 +; CHECK-NEXT: eor x13, x11, #0x8000000000000000 ; CHECK-NEXT: csel x8, x11, x8, lt -; CHECK-NEXT: csel x1, x9, x10, lt +; CHECK-NEXT: csel x1, x13, x9, lt ; CHECK-NEXT: fmov d0, x8 ; CHECK-NEXT: mov v0.d[1], x1 ; CHECK-NEXT: fmov x0, d0 Index: llvm/test/CodeGen/AMDGPU/saddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/saddsat.ll +++ llvm/test/CodeGen/AMDGPU/saddsat.ll @@ -62,10 +62,8 @@ ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v1 ; GFX8-NEXT: v_add_u16_e32 v1, v0, v1 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff8000 -; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fff -; GFX8-NEXT: v_cmp_gt_i16_e64 s[6:7], 0, v1 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[6:7] +; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v1 +; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -93,10 +91,8 @@ ; GFX6-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 ; GFX6-NEXT: v_add_i32_e64 v1, s[4:5], v0, v1 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0 -; GFX6-NEXT: v_bfrev_b32_e32 v0, 1 -; GFX6-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v1 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[6:7] +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v1 +; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -107,10 +103,8 @@ ; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 ; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v0, v1 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0 -; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 -; GFX8-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v1 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[6:7] +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v1 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -159,19 +153,18 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX8-NEXT: v_add_u16_e32 v4, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, 0xffff8000 -; GFX8-NEXT: v_mov_b32_e32 v6, 0x7fff -; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v5, v6, vcc -; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v4, v3 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2 +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v4, v3 +; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v4 +; GFX8-NEXT: s_movk_i32 s6, 0x8000 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v1 ; GFX8-NEXT: v_add_u16_e32 v1, v0, v1 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0 -; GFX8-NEXT: v_cmp_gt_i16_e64 s[6:7], 0, v1 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[6:7] +; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v1 +; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc @@ -228,26 +221,25 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: v_add_u16_e32 v6, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, 0xffff8000 -; GFX8-NEXT: v_mov_b32_e32 v8, 0x7fff -; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v7, 
v8, vcc -; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4 +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 +; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 +; GFX8-NEXT: s_movk_i32 s6, 0x8000 +; GFX8-NEXT: v_xor_b32_e32 v4, s6, v4 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v3 ; GFX8-NEXT: v_add_u16_e32 v3, v1, v3 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 -; GFX8-NEXT: v_cmp_gt_i16_e64 s[6:7], 0, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, v8, s[6:7] +; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3 +; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v2 ; GFX8-NEXT: v_add_u16_e32 v2, v0, v2 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0 -; GFX8-NEXT: v_cmp_gt_i16_e64 s[6:7], 0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[6:7] +; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2 +; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 @@ -313,19 +305,18 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: v_add_u16_e32 v6, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, 0xffff8000 -; GFX8-NEXT: v_mov_b32_e32 v8, 0x7fff -; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v7, v8, vcc -; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4 +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 +; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 +; GFX8-NEXT: s_movk_i32 s6, 0x8000 +; GFX8-NEXT: v_xor_b32_e32 v4, s6, v4 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v2 ; GFX8-NEXT: v_add_u16_e32 v2, v0, v2 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0 -; GFX8-NEXT: v_cmp_gt_i16_e64 s[6:7], 0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[6:7] +; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2 +; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -333,17 +324,17 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX8-NEXT: v_add_u16_e32 v5, v4, v2 -; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2 +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4 +; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v5 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v3 ; GFX8-NEXT: v_add_u16_e32 v3, v1, v3 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 -; GFX8-NEXT: v_cmp_gt_i16_e64 s[6:7], 0, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, v8, s[6:7] +; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3 +; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -376,17 +367,16 @@ ; GFX6-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 ; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v0, v2 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 -; GFX6-NEXT: v_bfrev_b32_e32 v4, 1 -; 
GFX6-NEXT: v_bfrev_b32_e32 v5, -2 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v2 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v4, v5, s[6:7] +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v2 +; GFX6-NEXT: s_brev_b32 s6, 1 +; GFX6-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v1, v3 -; GFX6-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v2 -; GFX6-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[6:7] +; GFX6-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v2 +; GFX6-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -397,17 +387,16 @@ ; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 ; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v0, v2 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 -; GFX8-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX8-NEXT: v_bfrev_b32_e32 v5, -2 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v5, s[6:7] +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v2 +; GFX8-NEXT: s_brev_b32 s6, 1 +; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v1, v3 -; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[6:7] +; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v2 +; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -438,13 +427,10 @@ ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] -; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] -; GFX6-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -455,13 +441,10 @@ ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] -; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] -; GFX8-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -472,13 +455,10 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] -; GFX9-NEXT: 
v_bfrev_b32_e32 v2, -2 -; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -487,15 +467,13 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 -; GFX10-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3] -; GFX10-NEXT: v_cmp_gt_i64_e64 s5, 0, v[4:5] +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v6, s5 +; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) Index: llvm/test/CodeGen/AMDGPU/ssubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -62,10 +62,8 @@ ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v1 ; GFX8-NEXT: v_sub_u16_e32 v1, v0, v1 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff8000 -; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fff -; GFX8-NEXT: v_cmp_gt_i16_e64 s[6:7], 0, v1 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[6:7] +; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v1 +; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -93,10 +91,8 @@ ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v1 ; GFX6-NEXT: v_sub_i32_e64 v1, s[4:5], v0, v1 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0 -; GFX6-NEXT: v_bfrev_b32_e32 v0, 1 -; GFX6-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v1 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[6:7] +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v1 +; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -107,10 +103,8 @@ ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v1 ; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v0, v1 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0 -; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 -; GFX8-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v1 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[6:7] +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v1 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -159,19 +153,18 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX8-NEXT: v_sub_u16_e32 v4, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, 0xffff8000 -; GFX8-NEXT: v_mov_b32_e32 v6, 0x7fff -; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v5, v6, vcc -; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v4, v3 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v2 +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v4, v3 +; GFX8-NEXT: 
v_ashrrev_i16_e32 v2, 15, v4 +; GFX8-NEXT: s_movk_i32 s6, 0x8000 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v1 ; GFX8-NEXT: v_sub_u16_e32 v1, v0, v1 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0 -; GFX8-NEXT: v_cmp_gt_i16_e64 s[6:7], 0, v1 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[6:7] +; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v1 +; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc @@ -229,26 +222,25 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: v_sub_u16_e32 v6, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, 0xffff8000 -; GFX8-NEXT: v_mov_b32_e32 v8, 0x7fff -; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v7, v8, vcc -; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v4 +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 +; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 +; GFX8-NEXT: s_movk_i32 s6, 0x8000 +; GFX8-NEXT: v_xor_b32_e32 v4, s6, v4 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v3 ; GFX8-NEXT: v_sub_u16_e32 v3, v1, v3 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 -; GFX8-NEXT: v_cmp_gt_i16_e64 s[6:7], 0, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, v8, s[6:7] +; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3 +; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v2 ; GFX8-NEXT: v_sub_u16_e32 v2, v0, v2 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0 -; GFX8-NEXT: v_cmp_gt_i16_e64 s[6:7], 0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[6:7] +; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2 +; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 @@ -314,19 +306,18 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: v_sub_u16_e32 v6, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, 0xffff8000 -; GFX8-NEXT: v_mov_b32_e32 v8, 0x7fff -; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v7, v8, vcc -; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v4 +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 +; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 +; GFX8-NEXT: s_movk_i32 s6, 0x8000 +; GFX8-NEXT: v_xor_b32_e32 v4, s6, v4 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v2 ; GFX8-NEXT: v_sub_u16_e32 v2, v0, v2 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0 -; GFX8-NEXT: v_cmp_gt_i16_e64 s[6:7], 0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[6:7] +; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2 +; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -334,17 +325,17 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX8-NEXT: v_sub_u16_e32 v5, v4, v2 -; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, 
vcc -; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v2 +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4 +; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v5 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v3 ; GFX8-NEXT: v_sub_u16_e32 v3, v1, v3 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 -; GFX8-NEXT: v_cmp_gt_i16_e64 s[6:7], 0, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, v8, s[6:7] +; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3 +; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -377,17 +368,16 @@ ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v2 ; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v0, v2 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 -; GFX6-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX6-NEXT: v_bfrev_b32_e32 v5, -2 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v2 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v4, v5, s[6:7] +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v2 +; GFX6-NEXT: s_brev_b32 s6, 1 +; GFX6-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v3 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v2 -; GFX6-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[6:7] +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v2 +; GFX6-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -398,17 +388,16 @@ ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v2 ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v0, v2 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 -; GFX8-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX8-NEXT: v_bfrev_b32_e32 v5, -2 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v5, s[6:7] +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v2 +; GFX8-NEXT: s_brev_b32 s6, 1 +; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v1, v3 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v2 +; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -438,24 +427,23 @@ ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 ; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v3 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0 -; GFX6-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX6-NEXT: v_bfrev_b32_e32 v7, -2 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[6:7] +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; GFX6-NEXT: s_brev_b32 s6, 1 +; GFX6-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v4 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v1 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[6:7] +; GFX6-NEXT: 
v_cmp_lt_i32_e32 vcc, 0, v4 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v3 +; GFX6-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v2, v5 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v2 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[6:7] +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v3 +; GFX6-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -466,24 +454,23 @@ ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v0, v3 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0 -; GFX8-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX8-NEXT: v_bfrev_b32_e32 v7, -2 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[6:7] +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; GFX8-NEXT: s_brev_b32 s6, 1 +; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v1, v4 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v1 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v3 +; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v2, v5 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v2 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 +; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v3 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -515,31 +502,30 @@ ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v0, v4 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v0 -; GFX6-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v8, v9, s[6:7] +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v4 +; GFX6-NEXT: s_brev_b32 s6, 1 +; GFX6-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v1, v5 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v1 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v1, v8, v9, s[6:7] +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v4 +; GFX6-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v2, v6 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v2 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v8, v9, s[6:7] +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v4 +; GFX6-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v3, 
v7 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v3, v8, v9, s[6:7] +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7 +; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v4 +; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -550,31 +536,30 @@ ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v0, v4 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v0 -; GFX8-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX8-NEXT: v_bfrev_b32_e32 v9, -2 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v8, v9, s[6:7] +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v4 +; GFX8-NEXT: s_brev_b32 s6, 1 +; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v1, v5 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v1 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v8, v9, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v4 +; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v2, v6 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v2 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v9, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6 +; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v4 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v7 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v8, v9, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v4 +; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -608,59 +593,59 @@ ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v8 ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v8 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v0 -; GFX6-NEXT: v_bfrev_b32_e32 v16, 1 -; GFX6-NEXT: v_bfrev_b32_e32 v17, -2 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v16, v17, s[6:7] +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v8 +; GFX6-NEXT: s_brev_b32 s6, 1 +; GFX6-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v1, v9 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v9 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v1 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 -; GFX6-NEXT: v_cndmask_b32_e64 v1, v16, v17, s[6:7] +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v9 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v8 +; GFX6-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v2, v10 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v10 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v2 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v16, v17, s[6:7] +; GFX6-NEXT: 
v_cmp_lt_i32_e32 vcc, 0, v10 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v8 +; GFX6-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v3, v11 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 -; GFX6-NEXT: v_cndmask_b32_e64 v3, v16, v17, s[6:7] +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11 +; GFX6-NEXT: v_bfrev_b32_e32 v16, 1 +; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v8 +; GFX6-NEXT: v_xor_b32_e32 v3, v16, v3 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v4, v12 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 -; GFX6-NEXT: v_cndmask_b32_e64 v4, v16, v17, s[6:7] +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v8 +; GFX6-NEXT: v_xor_b32_e32 v4, v16, v4 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v5, v13 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 -; GFX6-NEXT: v_cndmask_b32_e64 v5, v16, v17, s[6:7] +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13 +; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v8 +; GFX6-NEXT: v_xor_b32_e32 v5, v16, v5 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v6, v14 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 -; GFX6-NEXT: v_cndmask_b32_e64 v6, v16, v17, s[6:7] +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14 +; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v8 +; GFX6-NEXT: v_xor_b32_e32 v6, v16, v6 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v7, v15 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 -; GFX6-NEXT: v_cndmask_b32_e64 v7, v16, v17, s[6:7] +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15 +; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v8 +; GFX6-NEXT: v_xor_b32_e32 v7, v16, v7 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -671,59 +656,59 @@ ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v8 ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v0, v8 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v0 -; GFX8-NEXT: v_bfrev_b32_e32 v16, 1 -; GFX8-NEXT: v_bfrev_b32_e32 v17, -2 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v16, v17, s[6:7] +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v8 +; GFX8-NEXT: s_brev_b32 s6, 1 +; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v1, v9 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v9 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v1 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v16, v17, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v9 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v8 +; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v2, v10 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, 
v10 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v2 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v16, v17, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v10 +; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v8 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v3, v11 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v16, v17, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11 +; GFX8-NEXT: v_bfrev_b32_e32 v16, 1 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v8 +; GFX8-NEXT: v_xor_b32_e32 v3, v16, v3 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v4, v12 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v16, v17, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12 +; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v8 +; GFX8-NEXT: v_xor_b32_e32 v4, v16, v4 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v5, v13 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v16, v17, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13 +; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v8 +; GFX8-NEXT: v_xor_b32_e32 v5, v16, v5 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v6, v14 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v16, v17, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14 +; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v8 +; GFX8-NEXT: v_xor_b32_e32 v6, v16, v6 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v7, v15 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v7, v16, v17, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15 +; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v8 +; GFX8-NEXT: v_xor_b32_e32 v7, v16, v7 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -765,115 +750,115 @@ ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16 ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v0, v16 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v0 -; GFX6-NEXT: v_bfrev_b32_e32 v32, 1 -; GFX6-NEXT: v_bfrev_b32_e32 v33, -2 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v32, v33, s[6:7] +; GFX6-NEXT: s_brev_b32 s6, 1 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v1, v17 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v17 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v1 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX6-NEXT: v_cndmask_b32_e64 v1, v32, v33, s[6:7] +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] 
; GFX6-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v2, v18 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v32, v33, s[6:7] +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v3, v19 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX6-NEXT: v_cndmask_b32_e64 v3, v32, v33, s[6:7] +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19 +; GFX6-NEXT: v_bfrev_b32_e32 v17, 1 +; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v3, v17, v3 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v4, v20 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX6-NEXT: v_cndmask_b32_e64 v4, v32, v33, s[6:7] +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v4, v17, v4 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v5, v21 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v5 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX6-NEXT: v_cndmask_b32_e64 v5, v32, v33, s[6:7] +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 +; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v5, v17, v5 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v6, v22 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v6 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX6-NEXT: v_cndmask_b32_e64 v6, v32, v33, s[6:7] +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 +; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v6, v17, v6 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v7, v23 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v7 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX6-NEXT: v_cndmask_b32_e64 v7, v32, v33, s[6:7] +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 +; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v7, v17, v7 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v8, v24 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v8 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX6-NEXT: v_cndmask_b32_e64 v8, v32, v33, s[6:7] +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 +; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v8, v17, v8 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v9, v25 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v9 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX6-NEXT: v_cndmask_b32_e64 v9, v32, v33, s[6:7] +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 +; GFX6-NEXT: v_ashrrev_i32_e32 
v9, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v9, v17, v9 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v10, v26 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v10 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX6-NEXT: v_cndmask_b32_e64 v10, v32, v33, s[6:7] +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 +; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v10, v17, v10 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v11, v27 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v11 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX6-NEXT: v_cndmask_b32_e64 v11, v32, v33, s[6:7] +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 +; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v11, v17, v11 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v12, v28 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v12 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX6-NEXT: v_cndmask_b32_e64 v12, v32, v33, s[6:7] +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28 +; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v12, v17, v12 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v13, v29 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v13 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX6-NEXT: v_cndmask_b32_e64 v13, v32, v33, s[6:7] +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29 +; GFX6-NEXT: v_ashrrev_i32_e32 v13, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v13, v17, v13 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v14, v30 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v14 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX6-NEXT: v_cndmask_b32_e64 v14, v32, v33, s[6:7] +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30 +; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v14, v17, v14 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v15, v31 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v31 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX6-NEXT: v_cndmask_b32_e64 v15, v32, v33, s[6:7] +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v31 +; GFX6-NEXT: v_ashrrev_i32_e32 v15, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v15, v17, v15 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -884,115 +869,115 @@ ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16 ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v0, v16 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v0 -; GFX8-NEXT: v_bfrev_b32_e32 v32, 1 -; GFX8-NEXT: v_bfrev_b32_e32 v33, -2 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v32, v33, s[6:7] +; GFX8-NEXT: s_brev_b32 s6, 1 +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v1, v17 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v17 
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v1 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v32, v33, s[6:7] +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v2, v18 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v32, v33, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18 +; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v3, v19 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v32, v33, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19 +; GFX8-NEXT: v_bfrev_b32_e32 v17, 1 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v3, v17, v3 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v4, v20 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v32, v33, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 +; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v4, v17, v4 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v5, v21 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v5 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v32, v33, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 +; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v5, v17, v5 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v6, v22 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v6 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v32, v33, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 +; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v6, v17, v6 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v7, v23 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v7 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v7, v32, v33, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 +; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v7, v17, v7 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v8, v24 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v8 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v8, v32, v33, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 +; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v8, v17, v8 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v9, 
v25 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v9 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v9, v32, v33, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 +; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v9, v17, v9 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v10, v26 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v10 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v10, v32, v33, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 +; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v10, v17, v10 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v11, v27 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v11 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v11, v32, v33, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 +; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v11, v17, v11 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v12, v28 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v12 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v12, v32, v33, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28 +; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v12, v17, v12 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v13, v29 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v13 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v13, v32, v33, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29 +; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v13, v17, v13 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v14, v30 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v14 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v14, v32, v33, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30 +; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v14, v17, v14 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v15, v31 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v31 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15 -; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v15, v32, v33, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v31 +; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v15, v17, v15 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1052,13 +1037,10 @@ ; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] -; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] -; GFX6-NEXT: 
v_bfrev_b32_e32 v2, -2 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1069,13 +1051,10 @@ ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] -; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] -; GFX8-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1086,13 +1065,10 @@ ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] -; GFX9-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1101,15 +1077,13 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 -; GFX10-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[2:3] -; GFX10-NEXT: v_cmp_gt_i64_e64 s5, 0, v[4:5] +; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v6, s5 +; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) Index: llvm/test/CodeGen/ARM/qdadd.ll =================================================================== --- llvm/test/CodeGen/ARM/qdadd.ll +++ llvm/test/CodeGen/ARM/qdadd.ll @@ -7,32 +7,14 @@ define i32 @qdadd(i32 %x, i32 %y) nounwind { ; CHECK-T2NODSP-LABEL: qdadd: ; CHECK-T2NODSP: @ %bb.0: -; CHECK-T2NODSP-NEXT: .save {r7, lr} -; CHECK-T2NODSP-NEXT: push {r7, lr} -; CHECK-T2NODSP-NEXT: movs r3, #0 -; CHECK-T2NODSP-NEXT: adds.w r12, r0, r0 -; CHECK-T2NODSP-NEXT: it mi -; CHECK-T2NODSP-NEXT: movmi r3, #1 -; CHECK-T2NODSP-NEXT: cmp r3, #0 -; CHECK-T2NODSP-NEXT: mov.w r3, #-2147483648 -; CHECK-T2NODSP-NEXT: mov.w lr, #0 -; CHECK-T2NODSP-NEXT: it ne -; CHECK-T2NODSP-NEXT: mvnne r3, #-2147483648 -; CHECK-T2NODSP-NEXT: cmp r12, r0 -; CHECK-T2NODSP-NEXT: it vc -; CHECK-T2NODSP-NEXT: movvc r3, r12 -; CHECK-T2NODSP-NEXT: adds r0, r3, r1 +; 
CHECK-T2NODSP-NEXT: adds r0, r0, r0 ; CHECK-T2NODSP-NEXT: mov.w r2, #-2147483648 -; CHECK-T2NODSP-NEXT: it mi -; CHECK-T2NODSP-NEXT: movmi.w lr, #1 -; CHECK-T2NODSP-NEXT: cmp.w lr, #0 -; CHECK-T2NODSP-NEXT: it ne -; CHECK-T2NODSP-NEXT: mvnne r2, #-2147483648 -; CHECK-T2NODSP-NEXT: cmp r0, r3 -; CHECK-T2NODSP-NEXT: it vc -; CHECK-T2NODSP-NEXT: movvc r2, r0 -; CHECK-T2NODSP-NEXT: mov r0, r2 -; CHECK-T2NODSP-NEXT: pop {r7, pc} +; CHECK-T2NODSP-NEXT: it vs +; CHECK-T2NODSP-NEXT: eorvs.w r0, r2, r0, asr #31 +; CHECK-T2NODSP-NEXT: adds r0, r0, r1 +; CHECK-T2NODSP-NEXT: it vs +; CHECK-T2NODSP-NEXT: eorvs.w r0, r2, r0, asr #31 +; CHECK-T2NODSP-NEXT: bx lr ; ; CHECK-T2DSP-LABEL: qdadd: ; CHECK-T2DSP: @ %bb.0: @@ -51,32 +33,14 @@ define i32 @qdadd_c(i32 %x, i32 %y) nounwind { ; CHECK-T2NODSP-LABEL: qdadd_c: ; CHECK-T2NODSP: @ %bb.0: -; CHECK-T2NODSP-NEXT: .save {r7, lr} -; CHECK-T2NODSP-NEXT: push {r7, lr} -; CHECK-T2NODSP-NEXT: movs r3, #0 -; CHECK-T2NODSP-NEXT: adds.w r12, r0, r0 -; CHECK-T2NODSP-NEXT: it mi -; CHECK-T2NODSP-NEXT: movmi r3, #1 -; CHECK-T2NODSP-NEXT: cmp r3, #0 -; CHECK-T2NODSP-NEXT: mov.w r3, #-2147483648 -; CHECK-T2NODSP-NEXT: mov.w lr, #0 -; CHECK-T2NODSP-NEXT: it ne -; CHECK-T2NODSP-NEXT: mvnne r3, #-2147483648 -; CHECK-T2NODSP-NEXT: cmp r12, r0 -; CHECK-T2NODSP-NEXT: it vc -; CHECK-T2NODSP-NEXT: movvc r3, r12 -; CHECK-T2NODSP-NEXT: adds r0, r1, r3 +; CHECK-T2NODSP-NEXT: adds r0, r0, r0 ; CHECK-T2NODSP-NEXT: mov.w r2, #-2147483648 -; CHECK-T2NODSP-NEXT: it mi -; CHECK-T2NODSP-NEXT: movmi.w lr, #1 -; CHECK-T2NODSP-NEXT: cmp.w lr, #0 -; CHECK-T2NODSP-NEXT: it ne -; CHECK-T2NODSP-NEXT: mvnne r2, #-2147483648 -; CHECK-T2NODSP-NEXT: cmp r0, r1 -; CHECK-T2NODSP-NEXT: it vc -; CHECK-T2NODSP-NEXT: movvc r2, r0 -; CHECK-T2NODSP-NEXT: mov r0, r2 -; CHECK-T2NODSP-NEXT: pop {r7, pc} +; CHECK-T2NODSP-NEXT: it vs +; CHECK-T2NODSP-NEXT: eorvs.w r0, r2, r0, asr #31 +; CHECK-T2NODSP-NEXT: adds r0, r0, r1 +; CHECK-T2NODSP-NEXT: it vs +; CHECK-T2NODSP-NEXT: eorvs.w r0, r2, r0, asr #31 +; CHECK-T2NODSP-NEXT: bx lr ; ; CHECK-T2DSP-LABEL: qdadd_c: ; CHECK-T2DSP: @ %bb.0: @@ -95,32 +59,14 @@ define i32 @qdsub(i32 %x, i32 %y) nounwind { ; CHECK-T2NODSP-LABEL: qdsub: ; CHECK-T2NODSP: @ %bb.0: -; CHECK-T2NODSP-NEXT: .save {r7, lr} -; CHECK-T2NODSP-NEXT: push {r7, lr} -; CHECK-T2NODSP-NEXT: movs r3, #0 -; CHECK-T2NODSP-NEXT: adds.w r12, r0, r0 -; CHECK-T2NODSP-NEXT: it mi -; CHECK-T2NODSP-NEXT: movmi r3, #1 -; CHECK-T2NODSP-NEXT: cmp r3, #0 -; CHECK-T2NODSP-NEXT: mov.w r3, #-2147483648 -; CHECK-T2NODSP-NEXT: mov.w lr, #0 -; CHECK-T2NODSP-NEXT: it ne -; CHECK-T2NODSP-NEXT: mvnne r3, #-2147483648 -; CHECK-T2NODSP-NEXT: cmp r12, r0 -; CHECK-T2NODSP-NEXT: it vc -; CHECK-T2NODSP-NEXT: movvc r3, r12 -; CHECK-T2NODSP-NEXT: subs r0, r1, r3 +; CHECK-T2NODSP-NEXT: adds r0, r0, r0 ; CHECK-T2NODSP-NEXT: mov.w r2, #-2147483648 -; CHECK-T2NODSP-NEXT: it mi -; CHECK-T2NODSP-NEXT: movmi.w lr, #1 -; CHECK-T2NODSP-NEXT: cmp.w lr, #0 -; CHECK-T2NODSP-NEXT: it ne -; CHECK-T2NODSP-NEXT: mvnne r2, #-2147483648 -; CHECK-T2NODSP-NEXT: cmp r1, r3 -; CHECK-T2NODSP-NEXT: it vc -; CHECK-T2NODSP-NEXT: movvc r2, r0 -; CHECK-T2NODSP-NEXT: mov r0, r2 -; CHECK-T2NODSP-NEXT: pop {r7, pc} +; CHECK-T2NODSP-NEXT: it vs +; CHECK-T2NODSP-NEXT: eorvs.w r0, r2, r0, asr #31 +; CHECK-T2NODSP-NEXT: subs r0, r1, r0 +; CHECK-T2NODSP-NEXT: it vs +; CHECK-T2NODSP-NEXT: eorvs.w r0, r2, r0, asr #31 +; CHECK-T2NODSP-NEXT: bx lr ; ; CHECK-T2DSP-LABEL: qdsub: ; CHECK-T2DSP: @ %bb.0: @@ -139,32 +85,14 @@ define i32 @qdsub_c(i32 %x, i32 %y) nounwind { ; 
CHECK-T2NODSP-LABEL: qdsub_c: ; CHECK-T2NODSP: @ %bb.0: -; CHECK-T2NODSP-NEXT: .save {r7, lr} -; CHECK-T2NODSP-NEXT: push {r7, lr} -; CHECK-T2NODSP-NEXT: movs r3, #0 -; CHECK-T2NODSP-NEXT: adds.w r12, r0, r0 -; CHECK-T2NODSP-NEXT: it mi -; CHECK-T2NODSP-NEXT: movmi r3, #1 -; CHECK-T2NODSP-NEXT: cmp r3, #0 -; CHECK-T2NODSP-NEXT: mov.w r3, #-2147483648 -; CHECK-T2NODSP-NEXT: mov.w lr, #0 -; CHECK-T2NODSP-NEXT: it ne -; CHECK-T2NODSP-NEXT: mvnne r3, #-2147483648 -; CHECK-T2NODSP-NEXT: cmp r12, r0 -; CHECK-T2NODSP-NEXT: it vc -; CHECK-T2NODSP-NEXT: movvc r3, r12 -; CHECK-T2NODSP-NEXT: subs r0, r3, r1 +; CHECK-T2NODSP-NEXT: adds r0, r0, r0 ; CHECK-T2NODSP-NEXT: mov.w r2, #-2147483648 -; CHECK-T2NODSP-NEXT: it mi -; CHECK-T2NODSP-NEXT: movmi.w lr, #1 -; CHECK-T2NODSP-NEXT: cmp.w lr, #0 -; CHECK-T2NODSP-NEXT: it ne -; CHECK-T2NODSP-NEXT: mvnne r2, #-2147483648 -; CHECK-T2NODSP-NEXT: cmp r3, r1 -; CHECK-T2NODSP-NEXT: it vc -; CHECK-T2NODSP-NEXT: movvc r2, r0 -; CHECK-T2NODSP-NEXT: mov r0, r2 -; CHECK-T2NODSP-NEXT: pop {r7, pc} +; CHECK-T2NODSP-NEXT: it vs +; CHECK-T2NODSP-NEXT: eorvs.w r0, r2, r0, asr #31 +; CHECK-T2NODSP-NEXT: subs r0, r0, r1 +; CHECK-T2NODSP-NEXT: it vs +; CHECK-T2NODSP-NEXT: eorvs.w r0, r2, r0, asr #31 +; CHECK-T2NODSP-NEXT: bx lr ; ; CHECK-T2DSP-LABEL: qdsub_c: ; CHECK-T2DSP: @ %bb.0: Index: llvm/test/CodeGen/ARM/sadd_sat.ll =================================================================== --- llvm/test/CodeGen/ARM/sadd_sat.ll +++ llvm/test/CodeGen/ARM/sadd_sat.ll @@ -16,48 +16,22 @@ define i32 @func(i32 %x, i32 %y) nounwind { ; CHECK-T1-LABEL: func: ; CHECK-T1: @ %bb.0: -; CHECK-T1-NEXT: mov r2, r0 -; CHECK-T1-NEXT: movs r3, #1 ; CHECK-T1-NEXT: adds r0, r0, r1 -; CHECK-T1-NEXT: mov r1, r3 -; CHECK-T1-NEXT: bmi .LBB0_2 +; CHECK-T1-NEXT: bvc .LBB0_2 ; CHECK-T1-NEXT: @ %bb.1: -; CHECK-T1-NEXT: movs r1, #0 +; CHECK-T1-NEXT: asrs r1, r0, #31 +; CHECK-T1-NEXT: movs r0, #1 +; CHECK-T1-NEXT: lsls r0, r0, #31 +; CHECK-T1-NEXT: eors r0, r1 ; CHECK-T1-NEXT: .LBB0_2: -; CHECK-T1-NEXT: cmp r1, #0 -; CHECK-T1-NEXT: bne .LBB0_4 -; CHECK-T1-NEXT: @ %bb.3: -; CHECK-T1-NEXT: lsls r1, r3, #31 -; CHECK-T1-NEXT: cmp r0, r2 -; CHECK-T1-NEXT: bvs .LBB0_5 -; CHECK-T1-NEXT: b .LBB0_6 -; CHECK-T1-NEXT: .LBB0_4: -; CHECK-T1-NEXT: ldr r1, .LCPI0_0 -; CHECK-T1-NEXT: cmp r0, r2 -; CHECK-T1-NEXT: bvc .LBB0_6 -; CHECK-T1-NEXT: .LBB0_5: -; CHECK-T1-NEXT: mov r0, r1 -; CHECK-T1-NEXT: .LBB0_6: ; CHECK-T1-NEXT: bx lr -; CHECK-T1-NEXT: .p2align 2 -; CHECK-T1-NEXT: @ %bb.7: -; CHECK-T1-NEXT: .LCPI0_0: -; CHECK-T1-NEXT: .long 2147483647 @ 0x7fffffff ; ; CHECK-T2NODSP-LABEL: func: ; CHECK-T2NODSP: @ %bb.0: -; CHECK-T2NODSP-NEXT: adds r2, r0, r1 -; CHECK-T2NODSP-NEXT: mov.w r3, #0 +; CHECK-T2NODSP-NEXT: adds r0, r0, r1 ; CHECK-T2NODSP-NEXT: mov.w r1, #-2147483648 -; CHECK-T2NODSP-NEXT: it mi -; CHECK-T2NODSP-NEXT: movmi r3, #1 -; CHECK-T2NODSP-NEXT: cmp r3, #0 -; CHECK-T2NODSP-NEXT: it ne -; CHECK-T2NODSP-NEXT: mvnne r1, #-2147483648 -; CHECK-T2NODSP-NEXT: cmp r2, r0 -; CHECK-T2NODSP-NEXT: it vc -; CHECK-T2NODSP-NEXT: movvc r1, r2 -; CHECK-T2NODSP-NEXT: mov r0, r1 +; CHECK-T2NODSP-NEXT: it vs +; CHECK-T2NODSP-NEXT: eorvs.w r0, r1, r0, asr #31 ; CHECK-T2NODSP-NEXT: bx lr ; ; CHECK-T2DSP-LABEL: func: @@ -67,15 +41,9 @@ ; ; CHECK-ARMNODPS-LABEL: func: ; CHECK-ARMNODPS: @ %bb.0: -; CHECK-ARMNODPS-NEXT: adds r2, r0, r1 -; CHECK-ARMNODPS-NEXT: mov r3, #0 -; CHECK-ARMNODPS-NEXT: movmi r3, #1 +; CHECK-ARMNODPS-NEXT: adds r0, r0, r1 ; CHECK-ARMNODPS-NEXT: mov r1, #-2147483648 -; CHECK-ARMNODPS-NEXT: cmp r3, #0 -; 
CHECK-ARMNODPS-NEXT: mvnne r1, #-2147483648 -; CHECK-ARMNODPS-NEXT: cmp r2, r0 -; CHECK-ARMNODPS-NEXT: movvc r1, r2 -; CHECK-ARMNODPS-NEXT: mov r0, r1 +; CHECK-ARMNODPS-NEXT: eorvs r0, r1, r0, asr #31 ; CHECK-ARMNODPS-NEXT: bx lr ; ; CHECK-ARMBASEDSP-LABEL: func: @@ -97,36 +65,28 @@ ; CHECK-T1-NEXT: .save {r4, lr} ; CHECK-T1-NEXT: push {r4, lr} ; CHECK-T1-NEXT: mov r4, r1 -; CHECK-T1-NEXT: eors r4, r3 -; CHECK-T1-NEXT: adds r0, r0, r2 -; CHECK-T1-NEXT: adcs r3, r1 ; CHECK-T1-NEXT: eors r1, r3 -; CHECK-T1-NEXT: bics r1, r4 -; CHECK-T1-NEXT: bpl .LBB1_2 +; CHECK-T1-NEXT: adds r2, r0, r2 +; CHECK-T1-NEXT: adcs r3, r4 +; CHECK-T1-NEXT: eors r4, r3 +; CHECK-T1-NEXT: bics r4, r1 +; CHECK-T1-NEXT: asrs r1, r3, #31 +; CHECK-T1-NEXT: cmp r4, #0 +; CHECK-T1-NEXT: mov r0, r1 +; CHECK-T1-NEXT: bmi .LBB1_2 ; CHECK-T1-NEXT: @ %bb.1: -; CHECK-T1-NEXT: asrs r0, r3, #31 +; CHECK-T1-NEXT: mov r0, r2 ; CHECK-T1-NEXT: .LBB1_2: -; CHECK-T1-NEXT: cmp r3, #0 +; CHECK-T1-NEXT: cmp r4, #0 ; CHECK-T1-NEXT: bmi .LBB1_4 ; CHECK-T1-NEXT: @ %bb.3: +; CHECK-T1-NEXT: mov r1, r3 +; CHECK-T1-NEXT: pop {r4, pc} +; CHECK-T1-NEXT: .LBB1_4: ; CHECK-T1-NEXT: movs r2, #1 ; CHECK-T1-NEXT: lsls r2, r2, #31 -; CHECK-T1-NEXT: cmp r1, #0 -; CHECK-T1-NEXT: bpl .LBB1_5 -; CHECK-T1-NEXT: b .LBB1_6 -; CHECK-T1-NEXT: .LBB1_4: -; CHECK-T1-NEXT: ldr r2, .LCPI1_0 -; CHECK-T1-NEXT: cmp r1, #0 -; CHECK-T1-NEXT: bmi .LBB1_6 -; CHECK-T1-NEXT: .LBB1_5: -; CHECK-T1-NEXT: mov r2, r3 -; CHECK-T1-NEXT: .LBB1_6: -; CHECK-T1-NEXT: mov r1, r2 +; CHECK-T1-NEXT: eors r1, r2 ; CHECK-T1-NEXT: pop {r4, pc} -; CHECK-T1-NEXT: .p2align 2 -; CHECK-T1-NEXT: @ %bb.7: -; CHECK-T1-NEXT: .LCPI1_0: -; CHECK-T1-NEXT: .long 2147483647 @ 0x7fffffff ; ; CHECK-T2-LABEL: func2: ; CHECK-T2: @ %bb.0: @@ -134,17 +94,14 @@ ; CHECK-T2-NEXT: eor.w r12, r1, r3 ; CHECK-T2-NEXT: adc.w r2, r1, r3 ; CHECK-T2-NEXT: eors r1, r2 -; CHECK-T2-NEXT: bic.w r3, r1, r12 +; CHECK-T2-NEXT: bic.w r1, r1, r12 +; CHECK-T2-NEXT: cmp r1, #0 ; CHECK-T2-NEXT: mov.w r1, #-2147483648 -; CHECK-T2-NEXT: cmp r3, #0 ; CHECK-T2-NEXT: it mi ; CHECK-T2-NEXT: asrmi r0, r2, #31 -; CHECK-T2-NEXT: cmp r2, #0 ; CHECK-T2-NEXT: it mi -; CHECK-T2-NEXT: mvnmi r1, #-2147483648 -; CHECK-T2-NEXT: cmp r3, #0 -; CHECK-T2-NEXT: it pl -; CHECK-T2-NEXT: movpl r1, r2 +; CHECK-T2-NEXT: eormi.w r2, r1, r2, asr #31 +; CHECK-T2-NEXT: mov r1, r2 ; CHECK-T2-NEXT: bx lr ; ; CHECK-ARM-LABEL: func2: @@ -153,14 +110,12 @@ ; CHECK-ARM-NEXT: eor r12, r1, r3 ; CHECK-ARM-NEXT: adc r2, r1, r3 ; CHECK-ARM-NEXT: eor r1, r1, r2 -; CHECK-ARM-NEXT: bic r3, r1, r12 +; CHECK-ARM-NEXT: bic r1, r1, r12 +; CHECK-ARM-NEXT: cmp r1, #0 ; CHECK-ARM-NEXT: mov r1, #-2147483648 -; CHECK-ARM-NEXT: cmp r3, #0 ; CHECK-ARM-NEXT: asrmi r0, r2, #31 -; CHECK-ARM-NEXT: cmp r2, #0 -; CHECK-ARM-NEXT: mvnmi r1, #-2147483648 -; CHECK-ARM-NEXT: cmp r3, #0 -; CHECK-ARM-NEXT: movpl r1, r2 +; CHECK-ARM-NEXT: eormi r2, r1, r2, asr #31 +; CHECK-ARM-NEXT: mov r1, r2 ; CHECK-ARM-NEXT: bx lr %tmp = call i64 @llvm.sadd.sat.i64(i64 %x, i64 %y) ret i64 %tmp Index: llvm/test/CodeGen/ARM/sadd_sat_plus.ll =================================================================== --- llvm/test/CodeGen/ARM/sadd_sat_plus.ll +++ llvm/test/CodeGen/ARM/sadd_sat_plus.ll @@ -13,49 +13,24 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind { ; CHECK-T1-LABEL: func32: ; CHECK-T1: @ %bb.0: -; CHECK-T1-NEXT: mov r3, r0 ; CHECK-T1-NEXT: muls r1, r2, r1 -; CHECK-T1-NEXT: movs r2, #1 ; CHECK-T1-NEXT: adds r0, r0, r1 -; CHECK-T1-NEXT: mov r1, r2 -; CHECK-T1-NEXT: bmi .LBB0_2 +; CHECK-T1-NEXT: bvc .LBB0_2 ; 
CHECK-T1-NEXT: @ %bb.1: -; CHECK-T1-NEXT: movs r1, #0 +; CHECK-T1-NEXT: asrs r1, r0, #31 +; CHECK-T1-NEXT: movs r0, #1 +; CHECK-T1-NEXT: lsls r0, r0, #31 +; CHECK-T1-NEXT: eors r0, r1 ; CHECK-T1-NEXT: .LBB0_2: -; CHECK-T1-NEXT: cmp r1, #0 -; CHECK-T1-NEXT: bne .LBB0_4 -; CHECK-T1-NEXT: @ %bb.3: -; CHECK-T1-NEXT: lsls r1, r2, #31 -; CHECK-T1-NEXT: cmp r0, r3 -; CHECK-T1-NEXT: bvs .LBB0_5 -; CHECK-T1-NEXT: b .LBB0_6 -; CHECK-T1-NEXT: .LBB0_4: -; CHECK-T1-NEXT: ldr r1, .LCPI0_0 -; CHECK-T1-NEXT: cmp r0, r3 -; CHECK-T1-NEXT: bvc .LBB0_6 -; CHECK-T1-NEXT: .LBB0_5: -; CHECK-T1-NEXT: mov r0, r1 -; CHECK-T1-NEXT: .LBB0_6: ; CHECK-T1-NEXT: bx lr -; CHECK-T1-NEXT: .p2align 2 -; CHECK-T1-NEXT: @ %bb.7: -; CHECK-T1-NEXT: .LCPI0_0: -; CHECK-T1-NEXT: .long 2147483647 @ 0x7fffffff ; ; CHECK-T2NODSP-LABEL: func32: ; CHECK-T2NODSP: @ %bb.0: -; CHECK-T2NODSP-NEXT: mla r2, r1, r2, r0 -; CHECK-T2NODSP-NEXT: movs r3, #0 -; CHECK-T2NODSP-NEXT: mov.w r1, #-2147483648 -; CHECK-T2NODSP-NEXT: cmp r2, #0 -; CHECK-T2NODSP-NEXT: it mi -; CHECK-T2NODSP-NEXT: movmi r3, #1 -; CHECK-T2NODSP-NEXT: cmp r3, #0 -; CHECK-T2NODSP-NEXT: it ne -; CHECK-T2NODSP-NEXT: mvnne r1, #-2147483648 -; CHECK-T2NODSP-NEXT: cmp r2, r0 -; CHECK-T2NODSP-NEXT: it vc -; CHECK-T2NODSP-NEXT: movvc r1, r2 +; CHECK-T2NODSP-NEXT: mla r1, r1, r2, r0 +; CHECK-T2NODSP-NEXT: mov.w r2, #-2147483648 +; CHECK-T2NODSP-NEXT: cmp r1, r0 +; CHECK-T2NODSP-NEXT: it vs +; CHECK-T2NODSP-NEXT: eorvs.w r1, r2, r1, asr #31 ; CHECK-T2NODSP-NEXT: mov r0, r1 ; CHECK-T2NODSP-NEXT: bx lr ; @@ -84,35 +59,28 @@ ; CHECK-T1-NEXT: mov r2, r1 ; CHECK-T1-NEXT: eors r2, r3 ; CHECK-T1-NEXT: ldr r4, [sp, #8] -; CHECK-T1-NEXT: adds r0, r0, r4 +; CHECK-T1-NEXT: adds r4, r0, r4 ; CHECK-T1-NEXT: adcs r3, r1 ; CHECK-T1-NEXT: eors r1, r3 ; CHECK-T1-NEXT: bics r1, r2 -; CHECK-T1-NEXT: bpl .LBB1_2 +; CHECK-T1-NEXT: asrs r2, r3, #31 +; CHECK-T1-NEXT: cmp r1, #0 +; CHECK-T1-NEXT: mov r0, r2 +; CHECK-T1-NEXT: bmi .LBB1_2 ; CHECK-T1-NEXT: @ %bb.1: -; CHECK-T1-NEXT: asrs r0, r3, #31 +; CHECK-T1-NEXT: mov r0, r4 ; CHECK-T1-NEXT: .LBB1_2: -; CHECK-T1-NEXT: cmp r3, #0 +; CHECK-T1-NEXT: cmp r1, #0 ; CHECK-T1-NEXT: bmi .LBB1_4 ; CHECK-T1-NEXT: @ %bb.3: -; CHECK-T1-NEXT: movs r2, #1 -; CHECK-T1-NEXT: lsls r2, r2, #31 -; CHECK-T1-NEXT: cmp r1, #0 -; CHECK-T1-NEXT: bpl .LBB1_5 -; CHECK-T1-NEXT: b .LBB1_6 +; CHECK-T1-NEXT: mov r1, r3 +; CHECK-T1-NEXT: pop {r4, pc} ; CHECK-T1-NEXT: .LBB1_4: -; CHECK-T1-NEXT: ldr r2, .LCPI1_0 -; CHECK-T1-NEXT: cmp r1, #0 -; CHECK-T1-NEXT: bmi .LBB1_6 -; CHECK-T1-NEXT: .LBB1_5: -; CHECK-T1-NEXT: mov r2, r3 -; CHECK-T1-NEXT: .LBB1_6: +; CHECK-T1-NEXT: movs r1, #1 +; CHECK-T1-NEXT: lsls r1, r1, #31 +; CHECK-T1-NEXT: eors r2, r1 ; CHECK-T1-NEXT: mov r1, r2 ; CHECK-T1-NEXT: pop {r4, pc} -; CHECK-T1-NEXT: .p2align 2 -; CHECK-T1-NEXT: @ %bb.7: -; CHECK-T1-NEXT: .LCPI1_0: -; CHECK-T1-NEXT: .long 2147483647 @ 0x7fffffff ; ; CHECK-T2-LABEL: func64: ; CHECK-T2: @ %bb.0: @@ -122,17 +90,14 @@ ; CHECK-T2-NEXT: adc.w r2, r1, r12 ; CHECK-T2-NEXT: eor.w r3, r1, r12 ; CHECK-T2-NEXT: eors r1, r2 -; CHECK-T2-NEXT: bic.w r3, r1, r3 +; CHECK-T2-NEXT: bics r1, r3 +; CHECK-T2-NEXT: cmp r1, #0 ; CHECK-T2-NEXT: mov.w r1, #-2147483648 -; CHECK-T2-NEXT: cmp r3, #0 ; CHECK-T2-NEXT: it mi ; CHECK-T2-NEXT: asrmi r0, r2, #31 -; CHECK-T2-NEXT: cmp r2, #0 ; CHECK-T2-NEXT: it mi -; CHECK-T2-NEXT: mvnmi r1, #-2147483648 -; CHECK-T2-NEXT: cmp r3, #0 -; CHECK-T2-NEXT: it pl -; CHECK-T2-NEXT: movpl r1, r2 +; CHECK-T2-NEXT: eormi.w r2, r1, r2, asr #31 +; CHECK-T2-NEXT: mov r1, r2 ; CHECK-T2-NEXT: bx lr ; ; 
CHECK-ARM-LABEL: func64: @@ -143,14 +108,12 @@ ; CHECK-ARM-NEXT: eor r3, r1, r2 ; CHECK-ARM-NEXT: adc r2, r1, r2 ; CHECK-ARM-NEXT: eor r1, r1, r2 -; CHECK-ARM-NEXT: bic r3, r1, r3 +; CHECK-ARM-NEXT: bic r1, r1, r3 +; CHECK-ARM-NEXT: cmp r1, #0 ; CHECK-ARM-NEXT: mov r1, #-2147483648 -; CHECK-ARM-NEXT: cmp r3, #0 ; CHECK-ARM-NEXT: asrmi r0, r2, #31 -; CHECK-ARM-NEXT: cmp r2, #0 -; CHECK-ARM-NEXT: mvnmi r1, #-2147483648 -; CHECK-ARM-NEXT: cmp r3, #0 -; CHECK-ARM-NEXT: movpl r1, r2 +; CHECK-ARM-NEXT: eormi r2, r1, r2, asr #31 +; CHECK-ARM-NEXT: mov r1, r2 ; CHECK-ARM-NEXT: bx lr %a = mul i64 %y, %z %tmp = call i64 @llvm.sadd.sat.i64(i64 %x, i64 %z) Index: llvm/test/CodeGen/ARM/ssub_sat.ll =================================================================== --- llvm/test/CodeGen/ARM/ssub_sat.ll +++ llvm/test/CodeGen/ARM/ssub_sat.ll @@ -16,50 +16,22 @@ define i32 @func(i32 %x, i32 %y) nounwind { ; CHECK-T1-LABEL: func: ; CHECK-T1: @ %bb.0: -; CHECK-T1-NEXT: .save {r4, lr} -; CHECK-T1-NEXT: push {r4, lr} -; CHECK-T1-NEXT: mov r2, r0 -; CHECK-T1-NEXT: movs r3, #1 ; CHECK-T1-NEXT: subs r0, r0, r1 -; CHECK-T1-NEXT: mov r4, r3 -; CHECK-T1-NEXT: bmi .LBB0_2 +; CHECK-T1-NEXT: bvc .LBB0_2 ; CHECK-T1-NEXT: @ %bb.1: -; CHECK-T1-NEXT: movs r4, #0 +; CHECK-T1-NEXT: asrs r1, r0, #31 +; CHECK-T1-NEXT: movs r0, #1 +; CHECK-T1-NEXT: lsls r0, r0, #31 +; CHECK-T1-NEXT: eors r0, r1 ; CHECK-T1-NEXT: .LBB0_2: -; CHECK-T1-NEXT: cmp r4, #0 -; CHECK-T1-NEXT: bne .LBB0_4 -; CHECK-T1-NEXT: @ %bb.3: -; CHECK-T1-NEXT: lsls r3, r3, #31 -; CHECK-T1-NEXT: cmp r2, r1 -; CHECK-T1-NEXT: bvs .LBB0_5 -; CHECK-T1-NEXT: b .LBB0_6 -; CHECK-T1-NEXT: .LBB0_4: -; CHECK-T1-NEXT: ldr r3, .LCPI0_0 -; CHECK-T1-NEXT: cmp r2, r1 -; CHECK-T1-NEXT: bvc .LBB0_6 -; CHECK-T1-NEXT: .LBB0_5: -; CHECK-T1-NEXT: mov r0, r3 -; CHECK-T1-NEXT: .LBB0_6: -; CHECK-T1-NEXT: pop {r4, pc} -; CHECK-T1-NEXT: .p2align 2 -; CHECK-T1-NEXT: @ %bb.7: -; CHECK-T1-NEXT: .LCPI0_0: -; CHECK-T1-NEXT: .long 2147483647 @ 0x7fffffff +; CHECK-T1-NEXT: bx lr ; ; CHECK-T2NODSP-LABEL: func: ; CHECK-T2NODSP: @ %bb.0: -; CHECK-T2NODSP-NEXT: subs.w r12, r0, r1 -; CHECK-T2NODSP-NEXT: mov.w r3, #0 -; CHECK-T2NODSP-NEXT: mov.w r2, #-2147483648 -; CHECK-T2NODSP-NEXT: it mi -; CHECK-T2NODSP-NEXT: movmi r3, #1 -; CHECK-T2NODSP-NEXT: cmp r3, #0 -; CHECK-T2NODSP-NEXT: it ne -; CHECK-T2NODSP-NEXT: mvnne r2, #-2147483648 -; CHECK-T2NODSP-NEXT: cmp r0, r1 -; CHECK-T2NODSP-NEXT: it vc -; CHECK-T2NODSP-NEXT: movvc r2, r12 -; CHECK-T2NODSP-NEXT: mov r0, r2 +; CHECK-T2NODSP-NEXT: subs r0, r0, r1 +; CHECK-T2NODSP-NEXT: mov.w r1, #-2147483648 +; CHECK-T2NODSP-NEXT: it vs +; CHECK-T2NODSP-NEXT: eorvs.w r0, r1, r0, asr #31 ; CHECK-T2NODSP-NEXT: bx lr ; ; CHECK-T2DSP-LABEL: func: @@ -69,15 +41,9 @@ ; ; CHECK-ARMNODPS-LABEL: func: ; CHECK-ARMNODPS: @ %bb.0: -; CHECK-ARMNODPS-NEXT: subs r12, r0, r1 -; CHECK-ARMNODPS-NEXT: mov r3, #0 -; CHECK-ARMNODPS-NEXT: movmi r3, #1 -; CHECK-ARMNODPS-NEXT: mov r2, #-2147483648 -; CHECK-ARMNODPS-NEXT: cmp r3, #0 -; CHECK-ARMNODPS-NEXT: mvnne r2, #-2147483648 -; CHECK-ARMNODPS-NEXT: cmp r0, r1 -; CHECK-ARMNODPS-NEXT: movvc r2, r12 -; CHECK-ARMNODPS-NEXT: mov r0, r2 +; CHECK-ARMNODPS-NEXT: subs r0, r0, r1 +; CHECK-ARMNODPS-NEXT: mov r1, #-2147483648 +; CHECK-ARMNODPS-NEXT: eorvs r0, r1, r0, asr #31 ; CHECK-ARMNODPS-NEXT: bx lr ; ; CHECK-ARMBASEDSP-LABEL: func: @@ -98,38 +64,30 @@ ; CHECK-T1: @ %bb.0: ; CHECK-T1-NEXT: .save {r4, r5, r7, lr} ; CHECK-T1-NEXT: push {r4, r5, r7, lr} -; CHECK-T1-NEXT: mov r5, r1 -; CHECK-T1-NEXT: eors r5, r3 -; CHECK-T1-NEXT: subs r0, r0, r2 ; 
CHECK-T1-NEXT: mov r4, r1 -; CHECK-T1-NEXT: sbcs r4, r3 -; CHECK-T1-NEXT: eors r1, r4 -; CHECK-T1-NEXT: ands r1, r5 -; CHECK-T1-NEXT: bpl .LBB1_2 +; CHECK-T1-NEXT: eors r1, r3 +; CHECK-T1-NEXT: subs r5, r0, r2 +; CHECK-T1-NEXT: mov r2, r4 +; CHECK-T1-NEXT: sbcs r2, r3 +; CHECK-T1-NEXT: eors r4, r2 +; CHECK-T1-NEXT: ands r4, r1 +; CHECK-T1-NEXT: asrs r1, r2, #31 +; CHECK-T1-NEXT: cmp r4, #0 +; CHECK-T1-NEXT: mov r0, r1 +; CHECK-T1-NEXT: bmi .LBB1_2 ; CHECK-T1-NEXT: @ %bb.1: -; CHECK-T1-NEXT: asrs r0, r4, #31 +; CHECK-T1-NEXT: mov r0, r5 ; CHECK-T1-NEXT: .LBB1_2: ; CHECK-T1-NEXT: cmp r4, #0 ; CHECK-T1-NEXT: bmi .LBB1_4 ; CHECK-T1-NEXT: @ %bb.3: +; CHECK-T1-NEXT: mov r1, r2 +; CHECK-T1-NEXT: pop {r4, r5, r7, pc} +; CHECK-T1-NEXT: .LBB1_4: ; CHECK-T1-NEXT: movs r2, #1 ; CHECK-T1-NEXT: lsls r2, r2, #31 -; CHECK-T1-NEXT: cmp r1, #0 -; CHECK-T1-NEXT: bpl .LBB1_5 -; CHECK-T1-NEXT: b .LBB1_6 -; CHECK-T1-NEXT: .LBB1_4: -; CHECK-T1-NEXT: ldr r2, .LCPI1_0 -; CHECK-T1-NEXT: cmp r1, #0 -; CHECK-T1-NEXT: bmi .LBB1_6 -; CHECK-T1-NEXT: .LBB1_5: -; CHECK-T1-NEXT: mov r2, r4 -; CHECK-T1-NEXT: .LBB1_6: -; CHECK-T1-NEXT: mov r1, r2 +; CHECK-T1-NEXT: eors r1, r2 ; CHECK-T1-NEXT: pop {r4, r5, r7, pc} -; CHECK-T1-NEXT: .p2align 2 -; CHECK-T1-NEXT: @ %bb.7: -; CHECK-T1-NEXT: .LCPI1_0: -; CHECK-T1-NEXT: .long 2147483647 @ 0x7fffffff ; ; CHECK-T2-LABEL: func2: ; CHECK-T2: @ %bb.0: @@ -137,16 +95,13 @@ ; CHECK-T2-NEXT: eor.w r12, r1, r3 ; CHECK-T2-NEXT: sbc.w r2, r1, r3 ; CHECK-T2-NEXT: eors r1, r2 -; CHECK-T2-NEXT: ands.w r3, r12, r1 -; CHECK-T2-NEXT: mov.w r1, #-2147483648 +; CHECK-T2-NEXT: ands.w r1, r1, r12 ; CHECK-T2-NEXT: it mi ; CHECK-T2-NEXT: asrmi r0, r2, #31 -; CHECK-T2-NEXT: cmp r2, #0 +; CHECK-T2-NEXT: mov.w r1, #-2147483648 ; CHECK-T2-NEXT: it mi -; CHECK-T2-NEXT: mvnmi r1, #-2147483648 -; CHECK-T2-NEXT: cmp r3, #0 -; CHECK-T2-NEXT: it pl -; CHECK-T2-NEXT: movpl r1, r2 +; CHECK-T2-NEXT: eormi.w r2, r1, r2, asr #31 +; CHECK-T2-NEXT: mov r1, r2 ; CHECK-T2-NEXT: bx lr ; ; CHECK-ARM-LABEL: func2: @@ -155,13 +110,11 @@ ; CHECK-ARM-NEXT: eor r12, r1, r3 ; CHECK-ARM-NEXT: sbc r2, r1, r3 ; CHECK-ARM-NEXT: eor r1, r1, r2 -; CHECK-ARM-NEXT: ands r3, r12, r1 -; CHECK-ARM-NEXT: mov r1, #-2147483648 +; CHECK-ARM-NEXT: ands r1, r12, r1 ; CHECK-ARM-NEXT: asrmi r0, r2, #31 -; CHECK-ARM-NEXT: cmp r2, #0 -; CHECK-ARM-NEXT: mvnmi r1, #-2147483648 -; CHECK-ARM-NEXT: cmp r3, #0 -; CHECK-ARM-NEXT: movpl r1, r2 +; CHECK-ARM-NEXT: mov r1, #-2147483648 +; CHECK-ARM-NEXT: eormi r2, r1, r2, asr #31 +; CHECK-ARM-NEXT: mov r1, r2 ; CHECK-ARM-NEXT: bx lr %tmp = call i64 @llvm.ssub.sat.i64(i64 %x, i64 %y) ret i64 %tmp @@ -373,165 +326,64 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; CHECK-T1-LABEL: vec: ; CHECK-T1: @ %bb.0: -; CHECK-T1-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-T1-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-T1-NEXT: .pad #12 -; CHECK-T1-NEXT: sub sp, #12 -; CHECK-T1-NEXT: str r3, [sp] @ 4-byte Spill -; CHECK-T1-NEXT: mov r4, r1 -; CHECK-T1-NEXT: mov r1, r0 -; CHECK-T1-NEXT: ldr r5, [sp, #32] -; CHECK-T1-NEXT: movs r7, #1 -; CHECK-T1-NEXT: movs r0, #0 -; CHECK-T1-NEXT: str r0, [sp, #8] @ 4-byte Spill -; CHECK-T1-NEXT: subs r0, r1, r5 -; CHECK-T1-NEXT: str r0, [sp, #4] @ 4-byte Spill -; CHECK-T1-NEXT: mov r6, r7 -; CHECK-T1-NEXT: bmi .LBB5_2 +; CHECK-T1-NEXT: .save {r4, r5, r6, lr} +; CHECK-T1-NEXT: push {r4, r5, r6, lr} +; CHECK-T1-NEXT: mov r4, r0 +; CHECK-T1-NEXT: ldr r6, [sp, #16] +; CHECK-T1-NEXT: subs r0, r0, r6 +; CHECK-T1-NEXT: movs r5, #1 +; CHECK-T1-NEXT: lsls r5, r5, #31 +; CHECK-T1-NEXT: cmp 
r4, r6 +; CHECK-T1-NEXT: bvc .LBB5_2 ; CHECK-T1-NEXT: @ %bb.1: -; CHECK-T1-NEXT: ldr r6, [sp, #8] @ 4-byte Reload +; CHECK-T1-NEXT: asrs r0, r0, #31 +; CHECK-T1-NEXT: eors r0, r5 ; CHECK-T1-NEXT: .LBB5_2: -; CHECK-T1-NEXT: lsls r3, r7, #31 -; CHECK-T1-NEXT: ldr r0, .LCPI5_0 -; CHECK-T1-NEXT: cmp r6, #0 -; CHECK-T1-NEXT: mov r6, r0 -; CHECK-T1-NEXT: bne .LBB5_4 +; CHECK-T1-NEXT: ldr r4, [sp, #20] +; CHECK-T1-NEXT: subs r1, r1, r4 +; CHECK-T1-NEXT: bvc .LBB5_4 ; CHECK-T1-NEXT: @ %bb.3: -; CHECK-T1-NEXT: mov r6, r3 +; CHECK-T1-NEXT: asrs r1, r1, #31 +; CHECK-T1-NEXT: eors r1, r5 ; CHECK-T1-NEXT: .LBB5_4: -; CHECK-T1-NEXT: cmp r1, r5 +; CHECK-T1-NEXT: ldr r4, [sp, #24] +; CHECK-T1-NEXT: subs r2, r2, r4 ; CHECK-T1-NEXT: bvc .LBB5_6 ; CHECK-T1-NEXT: @ %bb.5: -; CHECK-T1-NEXT: str r6, [sp, #4] @ 4-byte Spill +; CHECK-T1-NEXT: asrs r2, r2, #31 +; CHECK-T1-NEXT: eors r2, r5 ; CHECK-T1-NEXT: .LBB5_6: -; CHECK-T1-NEXT: ldr r5, [sp, #36] -; CHECK-T1-NEXT: subs r1, r4, r5 -; CHECK-T1-NEXT: mov r6, r7 -; CHECK-T1-NEXT: bmi .LBB5_8 +; CHECK-T1-NEXT: ldr r4, [sp, #28] +; CHECK-T1-NEXT: subs r3, r3, r4 +; CHECK-T1-NEXT: bvc .LBB5_8 ; CHECK-T1-NEXT: @ %bb.7: -; CHECK-T1-NEXT: ldr r6, [sp, #8] @ 4-byte Reload +; CHECK-T1-NEXT: asrs r3, r3, #31 +; CHECK-T1-NEXT: eors r3, r5 ; CHECK-T1-NEXT: .LBB5_8: -; CHECK-T1-NEXT: cmp r6, #0 -; CHECK-T1-NEXT: mov r6, r0 -; CHECK-T1-NEXT: bne .LBB5_10 -; CHECK-T1-NEXT: @ %bb.9: -; CHECK-T1-NEXT: mov r6, r3 -; CHECK-T1-NEXT: .LBB5_10: -; CHECK-T1-NEXT: cmp r4, r5 -; CHECK-T1-NEXT: bvc .LBB5_12 -; CHECK-T1-NEXT: @ %bb.11: -; CHECK-T1-NEXT: mov r1, r6 -; CHECK-T1-NEXT: .LBB5_12: -; CHECK-T1-NEXT: ldr r5, [sp, #40] -; CHECK-T1-NEXT: subs r4, r2, r5 -; CHECK-T1-NEXT: mov r6, r7 -; CHECK-T1-NEXT: bmi .LBB5_14 -; CHECK-T1-NEXT: @ %bb.13: -; CHECK-T1-NEXT: ldr r6, [sp, #8] @ 4-byte Reload -; CHECK-T1-NEXT: .LBB5_14: -; CHECK-T1-NEXT: cmp r6, #0 -; CHECK-T1-NEXT: mov r6, r0 -; CHECK-T1-NEXT: bne .LBB5_16 -; CHECK-T1-NEXT: @ %bb.15: -; CHECK-T1-NEXT: mov r6, r3 -; CHECK-T1-NEXT: .LBB5_16: -; CHECK-T1-NEXT: cmp r2, r5 -; CHECK-T1-NEXT: bvc .LBB5_18 -; CHECK-T1-NEXT: @ %bb.17: -; CHECK-T1-NEXT: mov r4, r6 -; CHECK-T1-NEXT: .LBB5_18: -; CHECK-T1-NEXT: ldr r2, [sp, #44] -; CHECK-T1-NEXT: ldr r6, [sp] @ 4-byte Reload -; CHECK-T1-NEXT: subs r5, r6, r2 -; CHECK-T1-NEXT: bpl .LBB5_23 -; CHECK-T1-NEXT: @ %bb.19: -; CHECK-T1-NEXT: cmp r7, #0 -; CHECK-T1-NEXT: beq .LBB5_24 -; CHECK-T1-NEXT: .LBB5_20: -; CHECK-T1-NEXT: cmp r6, r2 -; CHECK-T1-NEXT: bvc .LBB5_22 -; CHECK-T1-NEXT: .LBB5_21: -; CHECK-T1-NEXT: mov r5, r0 -; CHECK-T1-NEXT: .LBB5_22: -; CHECK-T1-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; CHECK-T1-NEXT: mov r2, r4 -; CHECK-T1-NEXT: mov r3, r5 -; CHECK-T1-NEXT: add sp, #12 -; CHECK-T1-NEXT: pop {r4, r5, r6, r7, pc} -; CHECK-T1-NEXT: .LBB5_23: -; CHECK-T1-NEXT: ldr r7, [sp, #8] @ 4-byte Reload -; CHECK-T1-NEXT: cmp r7, #0 -; CHECK-T1-NEXT: bne .LBB5_20 -; CHECK-T1-NEXT: .LBB5_24: -; CHECK-T1-NEXT: mov r0, r3 -; CHECK-T1-NEXT: cmp r6, r2 -; CHECK-T1-NEXT: bvs .LBB5_21 -; CHECK-T1-NEXT: b .LBB5_22 -; CHECK-T1-NEXT: .p2align 2 -; CHECK-T1-NEXT: @ %bb.25: -; CHECK-T1-NEXT: .LCPI5_0: -; CHECK-T1-NEXT: .long 2147483647 @ 0x7fffffff +; CHECK-T1-NEXT: pop {r4, r5, r6, pc} ; ; CHECK-T2NODSP-LABEL: vec: ; CHECK-T2NODSP: @ %bb.0: -; CHECK-T2NODSP-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-T2NODSP-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-T2NODSP-NEXT: .pad #4 -; CHECK-T2NODSP-NEXT: sub sp, #4 -; CHECK-T2NODSP-NEXT: ldr r4, [sp, #24] -; CHECK-T2NODSP-NEXT: mov lr, r0 -; CHECK-T2NODSP-NEXT: ldr r7, [sp, 
#28] -; CHECK-T2NODSP-NEXT: movs r5, #0 -; CHECK-T2NODSP-NEXT: subs r6, r0, r4 -; CHECK-T2NODSP-NEXT: mov.w r0, #0 -; CHECK-T2NODSP-NEXT: it mi -; CHECK-T2NODSP-NEXT: movmi r0, #1 -; CHECK-T2NODSP-NEXT: cmp r0, #0 -; CHECK-T2NODSP-NEXT: mov.w r0, #-2147483648 +; CHECK-T2NODSP-NEXT: .save {r7, lr} +; CHECK-T2NODSP-NEXT: push {r7, lr} +; CHECK-T2NODSP-NEXT: ldr.w r12, [sp, #8] +; CHECK-T2NODSP-NEXT: ldr.w lr, [sp, #12] +; CHECK-T2NODSP-NEXT: subs.w r0, r0, r12 ; CHECK-T2NODSP-NEXT: mov.w r12, #-2147483648 -; CHECK-T2NODSP-NEXT: it ne -; CHECK-T2NODSP-NEXT: mvnne r0, #-2147483648 -; CHECK-T2NODSP-NEXT: cmp lr, r4 -; CHECK-T2NODSP-NEXT: it vc -; CHECK-T2NODSP-NEXT: movvc r0, r6 -; CHECK-T2NODSP-NEXT: subs r6, r1, r7 -; CHECK-T2NODSP-NEXT: mov.w r4, #0 -; CHECK-T2NODSP-NEXT: mov.w lr, #-2147483648 -; CHECK-T2NODSP-NEXT: it mi -; CHECK-T2NODSP-NEXT: movmi r4, #1 -; CHECK-T2NODSP-NEXT: cmp r4, #0 -; CHECK-T2NODSP-NEXT: it ne -; CHECK-T2NODSP-NEXT: mvnne lr, #-2147483648 -; CHECK-T2NODSP-NEXT: cmp r1, r7 -; CHECK-T2NODSP-NEXT: ldr r1, [sp, #32] -; CHECK-T2NODSP-NEXT: mov.w r4, #0 -; CHECK-T2NODSP-NEXT: it vc -; CHECK-T2NODSP-NEXT: movvc lr, r6 -; CHECK-T2NODSP-NEXT: subs r6, r2, r1 -; CHECK-T2NODSP-NEXT: it mi -; CHECK-T2NODSP-NEXT: movmi r4, #1 -; CHECK-T2NODSP-NEXT: cmp r4, #0 -; CHECK-T2NODSP-NEXT: mov.w r4, #-2147483648 -; CHECK-T2NODSP-NEXT: it ne -; CHECK-T2NODSP-NEXT: mvnne r4, #-2147483648 -; CHECK-T2NODSP-NEXT: cmp r2, r1 -; CHECK-T2NODSP-NEXT: ldr r1, [sp, #36] -; CHECK-T2NODSP-NEXT: it vc -; CHECK-T2NODSP-NEXT: movvc r4, r6 -; CHECK-T2NODSP-NEXT: subs r2, r3, r1 -; CHECK-T2NODSP-NEXT: it mi -; CHECK-T2NODSP-NEXT: movmi r5, #1 -; CHECK-T2NODSP-NEXT: cmp r5, #0 -; CHECK-T2NODSP-NEXT: it ne -; CHECK-T2NODSP-NEXT: mvnne r12, #-2147483648 -; CHECK-T2NODSP-NEXT: cmp r3, r1 -; CHECK-T2NODSP-NEXT: it vc -; CHECK-T2NODSP-NEXT: movvc r12, r2 -; CHECK-T2NODSP-NEXT: mov r1, lr -; CHECK-T2NODSP-NEXT: mov r2, r4 -; CHECK-T2NODSP-NEXT: mov r3, r12 -; CHECK-T2NODSP-NEXT: add sp, #4 -; CHECK-T2NODSP-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-T2NODSP-NEXT: it vs +; CHECK-T2NODSP-NEXT: eorvs.w r0, r12, r0, asr #31 +; CHECK-T2NODSP-NEXT: subs.w r1, r1, lr +; CHECK-T2NODSP-NEXT: ldr.w lr, [sp, #16] +; CHECK-T2NODSP-NEXT: it vs +; CHECK-T2NODSP-NEXT: eorvs.w r1, r12, r1, asr #31 +; CHECK-T2NODSP-NEXT: subs.w r2, r2, lr +; CHECK-T2NODSP-NEXT: ldr.w lr, [sp, #20] +; CHECK-T2NODSP-NEXT: it vs +; CHECK-T2NODSP-NEXT: eorvs.w r2, r12, r2, asr #31 +; CHECK-T2NODSP-NEXT: subs.w r3, r3, lr +; CHECK-T2NODSP-NEXT: it vs +; CHECK-T2NODSP-NEXT: eorvs.w r3, r12, r3, asr #31 +; CHECK-T2NODSP-NEXT: pop {r7, pc} ; ; CHECK-T2DSP-LABEL: vec: ; CHECK-T2DSP: @ %bb.0: @@ -547,49 +399,22 @@ ; ; CHECK-ARMNODPS-LABEL: vec: ; CHECK-ARMNODPS: @ %bb.0: -; CHECK-ARMNODPS-NEXT: .save {r4, r5, r6, r7, r11, lr} -; CHECK-ARMNODPS-NEXT: push {r4, r5, r6, r7, r11, lr} -; CHECK-ARMNODPS-NEXT: ldr r4, [sp, #24] -; CHECK-ARMNODPS-NEXT: mov lr, r0 -; CHECK-ARMNODPS-NEXT: ldr r7, [sp, #28] -; CHECK-ARMNODPS-NEXT: mov r5, #0 -; CHECK-ARMNODPS-NEXT: subs r6, r0, r4 -; CHECK-ARMNODPS-NEXT: mov r0, #0 -; CHECK-ARMNODPS-NEXT: movmi r0, #1 -; CHECK-ARMNODPS-NEXT: cmp r0, #0 -; CHECK-ARMNODPS-NEXT: mov r0, #-2147483648 +; CHECK-ARMNODPS-NEXT: .save {r11, lr} +; CHECK-ARMNODPS-NEXT: push {r11, lr} +; CHECK-ARMNODPS-NEXT: ldr r12, [sp, #8] +; CHECK-ARMNODPS-NEXT: ldr lr, [sp, #12] +; CHECK-ARMNODPS-NEXT: subs r0, r0, r12 ; CHECK-ARMNODPS-NEXT: mov r12, #-2147483648 -; CHECK-ARMNODPS-NEXT: mvnne r0, #-2147483648 -; CHECK-ARMNODPS-NEXT: cmp lr, r4 -; 
CHECK-ARMNODPS-NEXT: movvc r0, r6 -; CHECK-ARMNODPS-NEXT: subs r6, r1, r7 -; CHECK-ARMNODPS-NEXT: mov r4, #0 -; CHECK-ARMNODPS-NEXT: mov lr, #-2147483648 -; CHECK-ARMNODPS-NEXT: movmi r4, #1 -; CHECK-ARMNODPS-NEXT: cmp r4, #0 -; CHECK-ARMNODPS-NEXT: mvnne lr, #-2147483648 -; CHECK-ARMNODPS-NEXT: cmp r1, r7 -; CHECK-ARMNODPS-NEXT: ldr r1, [sp, #32] -; CHECK-ARMNODPS-NEXT: movvc lr, r6 -; CHECK-ARMNODPS-NEXT: mov r4, #0 -; CHECK-ARMNODPS-NEXT: subs r6, r2, r1 -; CHECK-ARMNODPS-NEXT: movmi r4, #1 -; CHECK-ARMNODPS-NEXT: cmp r4, #0 -; CHECK-ARMNODPS-NEXT: mov r4, #-2147483648 -; CHECK-ARMNODPS-NEXT: mvnne r4, #-2147483648 -; CHECK-ARMNODPS-NEXT: cmp r2, r1 -; CHECK-ARMNODPS-NEXT: ldr r1, [sp, #36] -; CHECK-ARMNODPS-NEXT: movvc r4, r6 -; CHECK-ARMNODPS-NEXT: subs r2, r3, r1 -; CHECK-ARMNODPS-NEXT: movmi r5, #1 -; CHECK-ARMNODPS-NEXT: cmp r5, #0 -; CHECK-ARMNODPS-NEXT: mvnne r12, #-2147483648 -; CHECK-ARMNODPS-NEXT: cmp r3, r1 -; CHECK-ARMNODPS-NEXT: movvc r12, r2 -; CHECK-ARMNODPS-NEXT: mov r1, lr -; CHECK-ARMNODPS-NEXT: mov r2, r4 -; CHECK-ARMNODPS-NEXT: mov r3, r12 -; CHECK-ARMNODPS-NEXT: pop {r4, r5, r6, r7, r11, pc} +; CHECK-ARMNODPS-NEXT: eorvs r0, r12, r0, asr #31 +; CHECK-ARMNODPS-NEXT: subs r1, r1, lr +; CHECK-ARMNODPS-NEXT: ldr lr, [sp, #16] +; CHECK-ARMNODPS-NEXT: eorvs r1, r12, r1, asr #31 +; CHECK-ARMNODPS-NEXT: subs r2, r2, lr +; CHECK-ARMNODPS-NEXT: ldr lr, [sp, #20] +; CHECK-ARMNODPS-NEXT: eorvs r2, r12, r2, asr #31 +; CHECK-ARMNODPS-NEXT: subs r3, r3, lr +; CHECK-ARMNODPS-NEXT: eorvs r3, r12, r3, asr #31 +; CHECK-ARMNODPS-NEXT: pop {r11, pc} ; ; CHECK-ARMBASEDSP-LABEL: vec: ; CHECK-ARMBASEDSP: @ %bb.0: Index: llvm/test/CodeGen/ARM/ssub_sat_plus.ll =================================================================== --- llvm/test/CodeGen/ARM/ssub_sat_plus.ll +++ llvm/test/CodeGen/ARM/ssub_sat_plus.ll @@ -13,56 +13,27 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind { ; CHECK-T1-LABEL: func32: ; CHECK-T1: @ %bb.0: -; CHECK-T1-NEXT: .save {r4, lr} -; CHECK-T1-NEXT: push {r4, lr} -; CHECK-T1-NEXT: mov r3, r0 ; CHECK-T1-NEXT: muls r1, r2, r1 -; CHECK-T1-NEXT: movs r2, #1 ; CHECK-T1-NEXT: subs r0, r0, r1 -; CHECK-T1-NEXT: mov r4, r2 -; CHECK-T1-NEXT: bmi .LBB0_2 +; CHECK-T1-NEXT: bvc .LBB0_2 ; CHECK-T1-NEXT: @ %bb.1: -; CHECK-T1-NEXT: movs r4, #0 +; CHECK-T1-NEXT: asrs r1, r0, #31 +; CHECK-T1-NEXT: movs r0, #1 +; CHECK-T1-NEXT: lsls r0, r0, #31 +; CHECK-T1-NEXT: eors r0, r1 ; CHECK-T1-NEXT: .LBB0_2: -; CHECK-T1-NEXT: cmp r4, #0 -; CHECK-T1-NEXT: bne .LBB0_4 -; CHECK-T1-NEXT: @ %bb.3: -; CHECK-T1-NEXT: lsls r2, r2, #31 -; CHECK-T1-NEXT: cmp r3, r1 -; CHECK-T1-NEXT: bvs .LBB0_5 -; CHECK-T1-NEXT: b .LBB0_6 -; CHECK-T1-NEXT: .LBB0_4: -; CHECK-T1-NEXT: ldr r2, .LCPI0_0 -; CHECK-T1-NEXT: cmp r3, r1 -; CHECK-T1-NEXT: bvc .LBB0_6 -; CHECK-T1-NEXT: .LBB0_5: -; CHECK-T1-NEXT: mov r0, r2 -; CHECK-T1-NEXT: .LBB0_6: -; CHECK-T1-NEXT: pop {r4, pc} -; CHECK-T1-NEXT: .p2align 2 -; CHECK-T1-NEXT: @ %bb.7: -; CHECK-T1-NEXT: .LCPI0_0: -; CHECK-T1-NEXT: .long 2147483647 @ 0x7fffffff +; CHECK-T1-NEXT: bx lr ; ; CHECK-T2NODSP-LABEL: func32: ; CHECK-T2NODSP: @ %bb.0: -; CHECK-T2NODSP-NEXT: .save {r7, lr} -; CHECK-T2NODSP-NEXT: push {r7, lr} -; CHECK-T2NODSP-NEXT: mls r12, r1, r2, r0 -; CHECK-T2NODSP-NEXT: mov.w lr, #0 -; CHECK-T2NODSP-NEXT: mov.w r3, #-2147483648 +; CHECK-T2NODSP-NEXT: mls r3, r1, r2, r0 +; CHECK-T2NODSP-NEXT: mov.w r12, #-2147483648 ; CHECK-T2NODSP-NEXT: muls r1, r2, r1 -; CHECK-T2NODSP-NEXT: cmp.w r12, #0 -; CHECK-T2NODSP-NEXT: it mi -; CHECK-T2NODSP-NEXT: movmi.w lr, #1 -; 
CHECK-T2NODSP-NEXT: cmp.w lr, #0 -; CHECK-T2NODSP-NEXT: it ne -; CHECK-T2NODSP-NEXT: mvnne r3, #-2147483648 ; CHECK-T2NODSP-NEXT: cmp r0, r1 -; CHECK-T2NODSP-NEXT: it vc -; CHECK-T2NODSP-NEXT: movvc r3, r12 +; CHECK-T2NODSP-NEXT: it vs +; CHECK-T2NODSP-NEXT: eorvs.w r3, r12, r3, asr #31 ; CHECK-T2NODSP-NEXT: mov r0, r3 -; CHECK-T2NODSP-NEXT: pop {r7, pc} +; CHECK-T2NODSP-NEXT: bx lr ; ; CHECK-T2DSP-LABEL: func32: ; CHECK-T2DSP: @ %bb.0: @@ -83,42 +54,35 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind { ; CHECK-T1-LABEL: func64: ; CHECK-T1: @ %bb.0: -; CHECK-T1-NEXT: .save {r4, lr} -; CHECK-T1-NEXT: push {r4, lr} -; CHECK-T1-NEXT: ldr r2, [sp, #12] -; CHECK-T1-NEXT: mov r4, r1 -; CHECK-T1-NEXT: eors r4, r2 -; CHECK-T1-NEXT: ldr r3, [sp, #8] -; CHECK-T1-NEXT: subs r0, r0, r3 +; CHECK-T1-NEXT: .save {r4, r5, r7, lr} +; CHECK-T1-NEXT: push {r4, r5, r7, lr} +; CHECK-T1-NEXT: ldr r2, [sp, #20] +; CHECK-T1-NEXT: mov r5, r1 +; CHECK-T1-NEXT: eors r5, r2 +; CHECK-T1-NEXT: ldr r3, [sp, #16] +; CHECK-T1-NEXT: subs r4, r0, r3 ; CHECK-T1-NEXT: mov r3, r1 ; CHECK-T1-NEXT: sbcs r3, r2 ; CHECK-T1-NEXT: eors r1, r3 -; CHECK-T1-NEXT: ands r1, r4 -; CHECK-T1-NEXT: bpl .LBB1_2 +; CHECK-T1-NEXT: ands r1, r5 +; CHECK-T1-NEXT: asrs r2, r3, #31 +; CHECK-T1-NEXT: cmp r1, #0 +; CHECK-T1-NEXT: mov r0, r2 +; CHECK-T1-NEXT: bmi .LBB1_2 ; CHECK-T1-NEXT: @ %bb.1: -; CHECK-T1-NEXT: asrs r0, r3, #31 +; CHECK-T1-NEXT: mov r0, r4 ; CHECK-T1-NEXT: .LBB1_2: -; CHECK-T1-NEXT: cmp r3, #0 +; CHECK-T1-NEXT: cmp r1, #0 ; CHECK-T1-NEXT: bmi .LBB1_4 ; CHECK-T1-NEXT: @ %bb.3: -; CHECK-T1-NEXT: movs r2, #1 -; CHECK-T1-NEXT: lsls r2, r2, #31 -; CHECK-T1-NEXT: cmp r1, #0 -; CHECK-T1-NEXT: bpl .LBB1_5 -; CHECK-T1-NEXT: b .LBB1_6 +; CHECK-T1-NEXT: mov r1, r3 +; CHECK-T1-NEXT: pop {r4, r5, r7, pc} ; CHECK-T1-NEXT: .LBB1_4: -; CHECK-T1-NEXT: ldr r2, .LCPI1_0 -; CHECK-T1-NEXT: cmp r1, #0 -; CHECK-T1-NEXT: bmi .LBB1_6 -; CHECK-T1-NEXT: .LBB1_5: -; CHECK-T1-NEXT: mov r2, r3 -; CHECK-T1-NEXT: .LBB1_6: +; CHECK-T1-NEXT: movs r1, #1 +; CHECK-T1-NEXT: lsls r1, r1, #31 +; CHECK-T1-NEXT: eors r2, r1 ; CHECK-T1-NEXT: mov r1, r2 -; CHECK-T1-NEXT: pop {r4, pc} -; CHECK-T1-NEXT: .p2align 2 -; CHECK-T1-NEXT: @ %bb.7: -; CHECK-T1-NEXT: .LCPI1_0: -; CHECK-T1-NEXT: .long 2147483647 @ 0x7fffffff +; CHECK-T1-NEXT: pop {r4, r5, r7, pc} ; ; CHECK-T2-LABEL: func64: ; CHECK-T2: @ %bb.0: @@ -128,16 +92,13 @@ ; CHECK-T2-NEXT: sbc.w r2, r1, r12 ; CHECK-T2-NEXT: eor.w r3, r1, r12 ; CHECK-T2-NEXT: eors r1, r2 -; CHECK-T2-NEXT: ands r3, r1 -; CHECK-T2-NEXT: mov.w r1, #-2147483648 +; CHECK-T2-NEXT: ands r1, r3 ; CHECK-T2-NEXT: it mi ; CHECK-T2-NEXT: asrmi r0, r2, #31 -; CHECK-T2-NEXT: cmp r2, #0 +; CHECK-T2-NEXT: mov.w r1, #-2147483648 ; CHECK-T2-NEXT: it mi -; CHECK-T2-NEXT: mvnmi r1, #-2147483648 -; CHECK-T2-NEXT: cmp r3, #0 -; CHECK-T2-NEXT: it pl -; CHECK-T2-NEXT: movpl r1, r2 +; CHECK-T2-NEXT: eormi.w r2, r1, r2, asr #31 +; CHECK-T2-NEXT: mov r1, r2 ; CHECK-T2-NEXT: bx lr ; ; CHECK-ARM-LABEL: func64: @@ -148,13 +109,11 @@ ; CHECK-ARM-NEXT: eor r3, r1, r2 ; CHECK-ARM-NEXT: sbc r2, r1, r2 ; CHECK-ARM-NEXT: eor r1, r1, r2 -; CHECK-ARM-NEXT: ands r3, r3, r1 -; CHECK-ARM-NEXT: mov r1, #-2147483648 +; CHECK-ARM-NEXT: ands r1, r3, r1 ; CHECK-ARM-NEXT: asrmi r0, r2, #31 -; CHECK-ARM-NEXT: cmp r2, #0 -; CHECK-ARM-NEXT: mvnmi r1, #-2147483648 -; CHECK-ARM-NEXT: cmp r3, #0 -; CHECK-ARM-NEXT: movpl r1, r2 +; CHECK-ARM-NEXT: mov r1, #-2147483648 +; CHECK-ARM-NEXT: eormi r2, r1, r2, asr #31 +; CHECK-ARM-NEXT: mov r1, r2 ; CHECK-ARM-NEXT: bx lr %a = mul i64 %y, %z %tmp = 
call i64 @llvm.ssub.sat.i64(i64 %x, i64 %z) Index: llvm/test/CodeGen/PowerPC/sat-add.ll =================================================================== --- llvm/test/CodeGen/PowerPC/sat-add.ll +++ llvm/test/CodeGen/PowerPC/sat-add.ll @@ -747,118 +747,114 @@ ; CHECK-NEXT: vadduqm 0, 2, 6 ; CHECK-NEXT: xxswapd 0, 34 ; CHECK-NEXT: std 30, -16(1) # 8-byte Folded Spill -; CHECK-NEXT: li 3, -1 +; CHECK-NEXT: addis 3, 2, .LCPI48_0@toc@ha ; CHECK-NEXT: vadduqm 1, 3, 7 ; CHECK-NEXT: xxswapd 1, 35 -; CHECK-NEXT: xxswapd 2, 32 +; CHECK-NEXT: addi 3, 3, .LCPI48_0@toc@l +; CHECK-NEXT: xxswapd 3, 32 ; CHECK-NEXT: mfvsrd 4, 34 -; CHECK-NEXT: mfvsrd 9, 32 -; CHECK-NEXT: mffprd 0, 0 +; CHECK-NEXT: mfvsrd 8, 32 +; CHECK-NEXT: xxswapd 2, 36 +; CHECK-NEXT: mffprd 12, 0 ; CHECK-NEXT: xxswapd 0, 33 -; CHECK-NEXT: mfvsrd 5, 38 -; CHECK-NEXT: cmpld 9, 4 -; CHECK-NEXT: cmpd 1, 9, 4 -; CHECK-NEXT: vadduqm 6, 4, 8 -; CHECK-NEXT: mffprd 4, 2 -; CHECK-NEXT: sradi 5, 5, 63 -; CHECK-NEXT: mffprd 30, 1 -; CHECK-NEXT: xxswapd 1, 36 +; CHECK-NEXT: vadduqm 10, 4, 8 +; CHECK-NEXT: cmpld 8, 4 +; CHECK-NEXT: cmpd 1, 8, 4 +; CHECK-NEXT: mffprd 4, 3 +; CHECK-NEXT: lxvd2x 3, 0, 3 +; CHECK-NEXT: sradi 3, 8, 63 +; CHECK-NEXT: mffprd 0, 1 +; CHECK-NEXT: xxswapd 1, 37 +; CHECK-NEXT: mfvsrd 5, 35 +; CHECK-NEXT: vadduqm 11, 5, 9 +; CHECK-NEXT: xxswapd 34, 3 +; CHECK-NEXT: mfvsrd 9, 33 ; CHECK-NEXT: crandc 20, 4, 2 -; CHECK-NEXT: cmpld 1, 4, 0 +; CHECK-NEXT: cmpld 1, 4, 12 ; CHECK-NEXT: mffprd 4, 0 -; CHECK-NEXT: xxswapd 0, 38 -; CHECK-NEXT: mfvsrd 6, 35 -; CHECK-NEXT: vadduqm 10, 5, 9 -; CHECK-NEXT: cmpld 6, 4, 30 -; CHECK-NEXT: ld 30, -16(1) # 8-byte Folded Reload -; CHECK-NEXT: mfvsrd 10, 33 -; CHECK-NEXT: mfvsrd 7, 36 -; CHECK-NEXT: mfvsrd 11, 38 +; CHECK-NEXT: xxswapd 0, 42 +; CHECK-NEXT: mfvsrd 6, 36 +; CHECK-NEXT: mfvsrd 10, 42 +; CHECK-NEXT: cmpld 6, 4, 0 ; CHECK-NEXT: crand 21, 2, 4 -; CHECK-NEXT: cmpld 10, 6 -; CHECK-NEXT: cmpd 1, 10, 6 -; CHECK-NEXT: mffprd 6, 1 -; CHECK-NEXT: xxswapd 1, 37 +; CHECK-NEXT: cmpld 9, 5 +; CHECK-NEXT: cmpd 1, 9, 5 +; CHECK-NEXT: mffprd 5, 1 +; CHECK-NEXT: xxswapd 1, 43 +; CHECK-NEXT: mffprd 30, 2 ; CHECK-NEXT: mffprd 4, 0 -; CHECK-NEXT: xxswapd 0, 42 -; CHECK-NEXT: mfvsrd 8, 37 -; CHECK-NEXT: mfvsrd 12, 42 +; CHECK-NEXT: mfvsrd 7, 37 +; CHECK-NEXT: mfvsrd 11, 43 ; CHECK-NEXT: crandc 22, 4, 2 -; CHECK-NEXT: cmpd 1, 11, 7 +; CHECK-NEXT: cmpd 1, 10, 6 ; CHECK-NEXT: crand 23, 2, 24 -; CHECK-NEXT: cmpld 11, 7 +; CHECK-NEXT: cmpld 10, 6 ; CHECK-NEXT: crandc 24, 4, 2 -; CHECK-NEXT: cmpld 1, 4, 6 +; CHECK-NEXT: cmpld 1, 4, 30 +; CHECK-NEXT: ld 30, -16(1) # 8-byte Folded Reload ; CHECK-NEXT: mffprd 4, 1 -; CHECK-NEXT: mffprd 6, 0 +; CHECK-NEXT: mfvsrd 6, 38 ; CHECK-NEXT: crand 25, 2, 4 -; CHECK-NEXT: cmpld 12, 8 -; CHECK-NEXT: cmpd 1, 12, 8 +; CHECK-NEXT: cmpld 11, 7 +; CHECK-NEXT: cmpd 1, 11, 7 ; CHECK-NEXT: crandc 26, 4, 2 -; CHECK-NEXT: cmpld 1, 6, 4 +; CHECK-NEXT: cmpld 1, 4, 5 +; CHECK-NEXT: sradi 4, 6, 63 +; CHECK-NEXT: mtfprd 0, 4 ; CHECK-NEXT: mfvsrd 4, 39 -; CHECK-NEXT: mtfprd 0, 5 +; CHECK-NEXT: mfvsrd 5, 40 +; CHECK-NEXT: mfvsrd 6, 41 ; CHECK-NEXT: sradi 4, 4, 63 -; CHECK-NEXT: mfvsrd 5, 41 ; CHECK-NEXT: mtfprd 1, 4 -; CHECK-NEXT: xxspltd 34, 0, 0 -; CHECK-NEXT: mfvsrd 4, 40 +; CHECK-NEXT: sradi 4, 5, 63 +; CHECK-NEXT: mtfprd 2, 4 +; CHECK-NEXT: sradi 4, 6, 63 +; CHECK-NEXT: mtfprd 5, 3 +; CHECK-NEXT: sradi 3, 10, 63 +; CHECK-NEXT: mtfprd 4, 4 +; CHECK-NEXT: sradi 4, 9, 63 +; CHECK-NEXT: mtfprd 6, 4 +; CHECK-NEXT: xxspltd 35, 5, 0 +; CHECK-NEXT: sradi 4, 11, 63 ; CHECK-NEXT: crnor 20, 21, 
20 -; CHECK-NEXT: sradi 4, 4, 63 +; CHECK-NEXT: xxspltd 38, 4, 0 +; CHECK-NEXT: mtfprd 3, 3 +; CHECK-NEXT: li 3, -1 +; CHECK-NEXT: xxspltd 36, 6, 0 +; CHECK-NEXT: mtfprd 5, 4 ; CHECK-NEXT: crand 27, 2, 4 -; CHECK-NEXT: mtfprd 2, 4 -; CHECK-NEXT: sradi 4, 5, 63 -; CHECK-NEXT: sradi 5, 10, 63 -; CHECK-NEXT: mtfprd 3, 4 +; CHECK-NEXT: xxspltd 37, 3, 0 +; CHECK-NEXT: xxlxor 3, 35, 34 +; CHECK-NEXT: xxspltd 35, 5, 0 ; CHECK-NEXT: isel 4, 0, 3, 20 -; CHECK-NEXT: xxspltd 36, 2, 0 +; CHECK-NEXT: mtfprd 8, 4 ; CHECK-NEXT: crnor 20, 23, 22 -; CHECK-NEXT: mtfprd 4, 4 -; CHECK-NEXT: sradi 4, 9, 63 -; CHECK-NEXT: mtfprd 0, 4 -; CHECK-NEXT: addis 4, 2, .LCPI48_0@toc@ha -; CHECK-NEXT: mtfprd 5, 5 -; CHECK-NEXT: xxspltd 35, 4, 0 -; CHECK-NEXT: addi 4, 4, .LCPI48_0@toc@l -; CHECK-NEXT: isel 5, 0, 3, 20 -; CHECK-NEXT: lxvd2x 6, 0, 4 -; CHECK-NEXT: mtfprd 4, 5 -; CHECK-NEXT: addis 5, 2, .LCPI48_1@toc@ha -; CHECK-NEXT: xxspltd 37, 5, 0 -; CHECK-NEXT: addi 4, 5, .LCPI48_1@toc@l -; CHECK-NEXT: xxlxor 7, 34, 35 +; CHECK-NEXT: crnor 21, 25, 24 +; CHECK-NEXT: crnor 22, 27, 26 +; CHECK-NEXT: xxlxor 5, 36, 34 +; CHECK-NEXT: xxspltd 36, 2, 0 +; CHECK-NEXT: xxlxor 6, 37, 34 +; CHECK-NEXT: xxlxor 7, 35, 34 +; CHECK-NEXT: xxspltd 34, 0, 0 +; CHECK-NEXT: xxspltd 35, 8, 0 +; CHECK-NEXT: isel 4, 0, 3, 20 +; CHECK-NEXT: isel 5, 0, 3, 21 +; CHECK-NEXT: isel 3, 0, 3, 22 +; CHECK-NEXT: xxlxor 0, 34, 35 ; CHECK-NEXT: xxspltd 34, 1, 0 -; CHECK-NEXT: sradi 5, 11, 63 -; CHECK-NEXT: lxvd2x 8, 0, 4 -; CHECK-NEXT: xxspltd 35, 4, 0 -; CHECK-NEXT: crnor 20, 25, 24 -; CHECK-NEXT: sradi 4, 12, 63 -; CHECK-NEXT: crnor 21, 27, 26 -; CHECK-NEXT: xxswapd 4, 6 +; CHECK-NEXT: mtfprd 8, 4 ; CHECK-NEXT: mtfprd 1, 5 -; CHECK-NEXT: mtfprd 9, 4 -; CHECK-NEXT: xxswapd 6, 8 -; CHECK-NEXT: xxlxor 2, 34, 35 -; CHECK-NEXT: xxspltd 35, 0, 0 -; CHECK-NEXT: isel 4, 0, 3, 20 -; CHECK-NEXT: xxspltd 39, 1, 0 -; CHECK-NEXT: isel 3, 0, 3, 21 -; CHECK-NEXT: xxspltd 40, 9, 0 -; CHECK-NEXT: mtfprd 0, 4 -; CHECK-NEXT: xxspltd 34, 3, 0 -; CHECK-NEXT: mtfprd 1, 3 -; CHECK-NEXT: xxsel 3, 6, 4, 39 -; CHECK-NEXT: xxspltd 41, 0, 0 -; CHECK-NEXT: xxsel 0, 6, 4, 35 -; CHECK-NEXT: xxspltd 35, 1, 0 -; CHECK-NEXT: xxsel 1, 6, 4, 37 -; CHECK-NEXT: xxsel 4, 6, 4, 40 -; CHECK-NEXT: xxlxor 5, 36, 41 -; CHECK-NEXT: xxlxor 6, 34, 35 -; CHECK-NEXT: xxsel 34, 32, 0, 7 -; CHECK-NEXT: xxsel 35, 33, 1, 2 -; CHECK-NEXT: xxsel 36, 38, 3, 5 -; CHECK-NEXT: xxsel 37, 42, 4, 6 +; CHECK-NEXT: mtfprd 9, 3 +; CHECK-NEXT: xxspltd 35, 8, 0 +; CHECK-NEXT: xxspltd 37, 1, 0 +; CHECK-NEXT: xxspltd 39, 9, 0 +; CHECK-NEXT: xxlxor 1, 34, 35 +; CHECK-NEXT: xxsel 34, 32, 3, 0 +; CHECK-NEXT: xxlxor 2, 36, 37 +; CHECK-NEXT: xxlxor 4, 38, 39 +; CHECK-NEXT: xxsel 35, 33, 5, 1 +; CHECK-NEXT: xxsel 36, 42, 6, 2 +; CHECK-NEXT: xxsel 37, 43, 7, 4 ; CHECK-NEXT: blr %c = call <4 x i128> @llvm.sadd.sat.v4i128(<4 x i128> %a, <4 x i128> %b) ret <4 x i128> %c Index: llvm/test/CodeGen/RISCV/sadd_sat.ll =================================================================== --- llvm/test/CodeGen/RISCV/sadd_sat.ll +++ llvm/test/CodeGen/RISCV/sadd_sat.ll @@ -21,9 +21,9 @@ ; RV32I-NEXT: slti a1, a1, 0 ; RV32I-NEXT: beq a1, a2, .LBB0_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slti a0, a0, 0 +; RV32I-NEXT: srai a0, a0, 31 ; RV32I-NEXT: lui a1, 524288 -; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: xor a0, a0, a1 ; RV32I-NEXT: .LBB0_2: ; RV32I-NEXT: ret ; @@ -52,9 +52,9 @@ ; RV32IZbbNOZbt-NEXT: slti a1, a1, 0 ; RV32IZbbNOZbt-NEXT: beq a1, a2, .LBB0_2 ; RV32IZbbNOZbt-NEXT: # %bb.1: -; RV32IZbbNOZbt-NEXT: slti a0, a0, 0 +; 
RV32IZbbNOZbt-NEXT: srai a0, a0, 31 ; RV32IZbbNOZbt-NEXT: lui a1, 524288 -; RV32IZbbNOZbt-NEXT: sub a0, a1, a0 +; RV32IZbbNOZbt-NEXT: xor a0, a0, a1 ; RV32IZbbNOZbt-NEXT: .LBB0_2: ; RV32IZbbNOZbt-NEXT: ret ; @@ -70,14 +70,13 @@ ; RV32IZbbZbt-LABEL: func: ; RV32IZbbZbt: # %bb.0: ; RV32IZbbZbt-NEXT: add a2, a0, a1 -; RV32IZbbZbt-NEXT: slti a3, a2, 0 -; RV32IZbbZbt-NEXT: lui a4, 524288 -; RV32IZbbZbt-NEXT: addi a5, a4, -1 -; RV32IZbbZbt-NEXT: cmov a3, a3, a5, a4 ; RV32IZbbZbt-NEXT: slt a0, a2, a0 ; RV32IZbbZbt-NEXT: slti a1, a1, 0 ; RV32IZbbZbt-NEXT: xor a0, a1, a0 -; RV32IZbbZbt-NEXT: cmov a0, a0, a3, a2 +; RV32IZbbZbt-NEXT: srai a1, a2, 31 +; RV32IZbbZbt-NEXT: lui a3, 524288 +; RV32IZbbZbt-NEXT: xor a1, a1, a3 +; RV32IZbbZbt-NEXT: cmov a0, a0, a1, a2 ; RV32IZbbZbt-NEXT: ret %tmp = call i32 @llvm.sadd.sat.i32(i32 %x, i32 %y); ret i32 %tmp; @@ -98,11 +97,9 @@ ; RV32I-NEXT: and a2, a3, a2 ; RV32I-NEXT: bgez a2, .LBB1_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slti a0, a1, 0 -; RV32I-NEXT: lui a2, 524288 -; RV32I-NEXT: sub a2, a2, a0 ; RV32I-NEXT: srai a0, a1, 31 -; RV32I-NEXT: mv a1, a2 +; RV32I-NEXT: lui a1, 524288 +; RV32I-NEXT: xor a1, a0, a1 ; RV32I-NEXT: .LBB1_2: ; RV32I-NEXT: ret ; @@ -114,10 +111,10 @@ ; RV64I-NEXT: slti a1, a1, 0 ; RV64I-NEXT: beq a1, a2, .LBB1_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: slti a0, a0, 0 +; RV64I-NEXT: srai a0, a0, 63 ; RV64I-NEXT: addi a1, zero, -1 ; RV64I-NEXT: slli a1, a1, 63 -; RV64I-NEXT: sub a0, a1, a0 +; RV64I-NEXT: xor a0, a0, a1 ; RV64I-NEXT: .LBB1_2: ; RV64I-NEXT: ret ; @@ -134,11 +131,9 @@ ; RV32IZbbNOZbt-NEXT: andn a2, a2, a3 ; RV32IZbbNOZbt-NEXT: bgez a2, .LBB1_2 ; RV32IZbbNOZbt-NEXT: # %bb.1: -; RV32IZbbNOZbt-NEXT: slti a0, a1, 0 -; RV32IZbbNOZbt-NEXT: lui a2, 524288 -; RV32IZbbNOZbt-NEXT: sub a2, a2, a0 ; RV32IZbbNOZbt-NEXT: srai a0, a1, 31 -; RV32IZbbNOZbt-NEXT: mv a1, a2 +; RV32IZbbNOZbt-NEXT: lui a1, 524288 +; RV32IZbbNOZbt-NEXT: xor a1, a0, a1 ; RV32IZbbNOZbt-NEXT: .LBB1_2: ; RV32IZbbNOZbt-NEXT: ret ; @@ -150,10 +145,10 @@ ; RV64IZbbNOZbt-NEXT: slti a1, a1, 0 ; RV64IZbbNOZbt-NEXT: beq a1, a2, .LBB1_2 ; RV64IZbbNOZbt-NEXT: # %bb.1: -; RV64IZbbNOZbt-NEXT: slti a0, a0, 0 +; RV64IZbbNOZbt-NEXT: srai a0, a0, 63 ; RV64IZbbNOZbt-NEXT: addi a1, zero, -1 ; RV64IZbbNOZbt-NEXT: slli a1, a1, 63 -; RV64IZbbNOZbt-NEXT: sub a0, a1, a0 +; RV64IZbbNOZbt-NEXT: xor a0, a0, a1 ; RV64IZbbNOZbt-NEXT: .LBB1_2: ; RV64IZbbNOZbt-NEXT: ret ; @@ -163,31 +158,28 @@ ; RV32IZbbZbt-NEXT: add a2, a0, a2 ; RV32IZbbZbt-NEXT: sltu a0, a2, a0 ; RV32IZbbZbt-NEXT: add a0, a4, a0 -; RV32IZbbZbt-NEXT: slti a4, a0, 0 -; RV32IZbbZbt-NEXT: lui a6, 524288 -; RV32IZbbZbt-NEXT: addi a5, a6, -1 -; RV32IZbbZbt-NEXT: cmov a4, a4, a5, a6 +; RV32IZbbZbt-NEXT: srai a4, a0, 31 +; RV32IZbbZbt-NEXT: lui a5, 524288 +; RV32IZbbZbt-NEXT: xor a6, a4, a5 ; RV32IZbbZbt-NEXT: xor a5, a1, a0 ; RV32IZbbZbt-NEXT: xor a1, a1, a3 ; RV32IZbbZbt-NEXT: andn a1, a5, a1 ; RV32IZbbZbt-NEXT: slti a3, a1, 0 -; RV32IZbbZbt-NEXT: cmov a1, a3, a4, a0 -; RV32IZbbZbt-NEXT: srai a0, a0, 31 -; RV32IZbbZbt-NEXT: cmov a0, a3, a0, a2 +; RV32IZbbZbt-NEXT: cmov a1, a3, a6, a0 +; RV32IZbbZbt-NEXT: cmov a0, a3, a4, a2 ; RV32IZbbZbt-NEXT: ret ; ; RV64IZbbZbt-LABEL: func2: ; RV64IZbbZbt: # %bb.0: ; RV64IZbbZbt-NEXT: add a2, a0, a1 -; RV64IZbbZbt-NEXT: slti a3, a2, 0 -; RV64IZbbZbt-NEXT: addi a4, zero, -1 -; RV64IZbbZbt-NEXT: slli a5, a4, 63 -; RV64IZbbZbt-NEXT: srli a4, a4, 1 -; RV64IZbbZbt-NEXT: cmov a3, a3, a4, a5 ; RV64IZbbZbt-NEXT: slt a0, a2, a0 ; RV64IZbbZbt-NEXT: slti a1, a1, 0 ; RV64IZbbZbt-NEXT: xor a0, a1, a0 -; 
RV64IZbbZbt-NEXT: cmov a0, a0, a3, a2 +; RV64IZbbZbt-NEXT: srai a1, a2, 63 +; RV64IZbbZbt-NEXT: addi a3, zero, -1 +; RV64IZbbZbt-NEXT: slli a3, a3, 63 +; RV64IZbbZbt-NEXT: xor a1, a1, a3 +; RV64IZbbZbt-NEXT: cmov a0, a0, a1, a2 ; RV64IZbbZbt-NEXT: ret %tmp = call i64 @llvm.sadd.sat.i64(i64 %x, i64 %y); ret i64 %tmp; Index: llvm/test/CodeGen/RISCV/sadd_sat_plus.ll =================================================================== --- llvm/test/CodeGen/RISCV/sadd_sat_plus.ll +++ llvm/test/CodeGen/RISCV/sadd_sat_plus.ll @@ -22,9 +22,9 @@ ; RV32I-NEXT: slti a1, a1, 0 ; RV32I-NEXT: beq a1, a2, .LBB0_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slti a0, a0, 0 +; RV32I-NEXT: srai a0, a0, 31 ; RV32I-NEXT: lui a1, 524288 -; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: xor a0, a0, a1 ; RV32I-NEXT: .LBB0_2: ; RV32I-NEXT: ret ; @@ -56,9 +56,9 @@ ; RV32IZbbNOZbt-NEXT: slti a1, a1, 0 ; RV32IZbbNOZbt-NEXT: beq a1, a2, .LBB0_2 ; RV32IZbbNOZbt-NEXT: # %bb.1: -; RV32IZbbNOZbt-NEXT: slti a0, a0, 0 +; RV32IZbbNOZbt-NEXT: srai a0, a0, 31 ; RV32IZbbNOZbt-NEXT: lui a1, 524288 -; RV32IZbbNOZbt-NEXT: sub a0, a1, a0 +; RV32IZbbNOZbt-NEXT: xor a0, a0, a1 ; RV32IZbbNOZbt-NEXT: .LBB0_2: ; RV32IZbbNOZbt-NEXT: ret ; @@ -80,10 +80,9 @@ ; RV32IZbbZbt-NEXT: slt a0, a2, a0 ; RV32IZbbZbt-NEXT: slti a1, a1, 0 ; RV32IZbbZbt-NEXT: xor a0, a1, a0 -; RV32IZbbZbt-NEXT: slti a1, a2, 0 +; RV32IZbbZbt-NEXT: srai a1, a2, 31 ; RV32IZbbZbt-NEXT: lui a3, 524288 -; RV32IZbbZbt-NEXT: addi a4, a3, -1 -; RV32IZbbZbt-NEXT: cmov a1, a1, a4, a3 +; RV32IZbbZbt-NEXT: xor a1, a1, a3 ; RV32IZbbZbt-NEXT: cmov a0, a0, a1, a2 ; RV32IZbbZbt-NEXT: ret %a = mul i32 %y, %z @@ -106,11 +105,9 @@ ; RV32I-NEXT: and a2, a2, a3 ; RV32I-NEXT: bgez a2, .LBB1_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slti a0, a1, 0 -; RV32I-NEXT: lui a2, 524288 -; RV32I-NEXT: sub a2, a2, a0 ; RV32I-NEXT: srai a0, a1, 31 -; RV32I-NEXT: mv a1, a2 +; RV32I-NEXT: lui a1, 524288 +; RV32I-NEXT: xor a1, a0, a1 ; RV32I-NEXT: .LBB1_2: ; RV32I-NEXT: ret ; @@ -122,10 +119,10 @@ ; RV64I-NEXT: slti a2, a2, 0 ; RV64I-NEXT: beq a2, a1, .LBB1_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: slti a0, a0, 0 +; RV64I-NEXT: srai a0, a0, 63 ; RV64I-NEXT: addi a1, zero, -1 ; RV64I-NEXT: slli a1, a1, 63 -; RV64I-NEXT: sub a0, a1, a0 +; RV64I-NEXT: xor a0, a0, a1 ; RV64I-NEXT: .LBB1_2: ; RV64I-NEXT: ret ; @@ -142,11 +139,9 @@ ; RV32IZbbNOZbt-NEXT: andn a2, a3, a2 ; RV32IZbbNOZbt-NEXT: bgez a2, .LBB1_2 ; RV32IZbbNOZbt-NEXT: # %bb.1: -; RV32IZbbNOZbt-NEXT: slti a0, a1, 0 -; RV32IZbbNOZbt-NEXT: lui a2, 524288 -; RV32IZbbNOZbt-NEXT: sub a2, a2, a0 ; RV32IZbbNOZbt-NEXT: srai a0, a1, 31 -; RV32IZbbNOZbt-NEXT: mv a1, a2 +; RV32IZbbNOZbt-NEXT: lui a1, 524288 +; RV32IZbbNOZbt-NEXT: xor a1, a0, a1 ; RV32IZbbNOZbt-NEXT: .LBB1_2: ; RV32IZbbNOZbt-NEXT: ret ; @@ -158,10 +153,10 @@ ; RV64IZbbNOZbt-NEXT: slti a2, a2, 0 ; RV64IZbbNOZbt-NEXT: beq a2, a1, .LBB1_2 ; RV64IZbbNOZbt-NEXT: # %bb.1: -; RV64IZbbNOZbt-NEXT: slti a0, a0, 0 +; RV64IZbbNOZbt-NEXT: srai a0, a0, 63 ; RV64IZbbNOZbt-NEXT: addi a1, zero, -1 ; RV64IZbbNOZbt-NEXT: slli a1, a1, 63 -; RV64IZbbNOZbt-NEXT: sub a0, a1, a0 +; RV64IZbbNOZbt-NEXT: xor a0, a0, a1 ; RV64IZbbNOZbt-NEXT: .LBB1_2: ; RV64IZbbNOZbt-NEXT: ret ; @@ -171,31 +166,28 @@ ; RV32IZbbZbt-NEXT: add a3, a0, a4 ; RV32IZbbZbt-NEXT: sltu a0, a3, a0 ; RV32IZbbZbt-NEXT: add a0, a2, a0 -; RV32IZbbZbt-NEXT: slti a2, a0, 0 -; RV32IZbbZbt-NEXT: lui a6, 524288 -; RV32IZbbZbt-NEXT: addi a4, a6, -1 -; RV32IZbbZbt-NEXT: cmov a2, a2, a4, a6 +; RV32IZbbZbt-NEXT: srai a2, a0, 31 +; RV32IZbbZbt-NEXT: lui a4, 524288 +; 
RV32IZbbZbt-NEXT: xor a6, a2, a4 ; RV32IZbbZbt-NEXT: xor a4, a1, a0 ; RV32IZbbZbt-NEXT: xor a1, a1, a5 ; RV32IZbbZbt-NEXT: andn a1, a4, a1 ; RV32IZbbZbt-NEXT: slti a4, a1, 0 -; RV32IZbbZbt-NEXT: cmov a1, a4, a2, a0 -; RV32IZbbZbt-NEXT: srai a0, a0, 31 -; RV32IZbbZbt-NEXT: cmov a0, a4, a0, a3 +; RV32IZbbZbt-NEXT: cmov a1, a4, a6, a0 +; RV32IZbbZbt-NEXT: cmov a0, a4, a2, a3 ; RV32IZbbZbt-NEXT: ret ; ; RV64IZbbZbt-LABEL: func64: ; RV64IZbbZbt: # %bb.0: ; RV64IZbbZbt-NEXT: add a1, a0, a2 -; RV64IZbbZbt-NEXT: slti a3, a1, 0 -; RV64IZbbZbt-NEXT: addi a4, zero, -1 -; RV64IZbbZbt-NEXT: slli a5, a4, 63 -; RV64IZbbZbt-NEXT: srli a4, a4, 1 -; RV64IZbbZbt-NEXT: cmov a3, a3, a4, a5 ; RV64IZbbZbt-NEXT: slt a0, a1, a0 ; RV64IZbbZbt-NEXT: slti a2, a2, 0 ; RV64IZbbZbt-NEXT: xor a0, a2, a0 -; RV64IZbbZbt-NEXT: cmov a0, a0, a3, a1 +; RV64IZbbZbt-NEXT: srai a2, a1, 63 +; RV64IZbbZbt-NEXT: addi a3, zero, -1 +; RV64IZbbZbt-NEXT: slli a3, a3, 63 +; RV64IZbbZbt-NEXT: xor a2, a2, a3 +; RV64IZbbZbt-NEXT: cmov a0, a0, a2, a1 ; RV64IZbbZbt-NEXT: ret %a = mul i64 %y, %z %tmp = call i64 @llvm.sadd.sat.i64(i64 %x, i64 %z) Index: llvm/test/CodeGen/RISCV/ssub_sat.ll =================================================================== --- llvm/test/CodeGen/RISCV/ssub_sat.ll +++ llvm/test/CodeGen/RISCV/ssub_sat.ll @@ -21,9 +21,9 @@ ; RV32I-NEXT: slt a1, a0, a2 ; RV32I-NEXT: beq a3, a1, .LBB0_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slti a0, a0, 0 +; RV32I-NEXT: srai a0, a0, 31 ; RV32I-NEXT: lui a1, 524288 -; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: xor a0, a0, a1 ; RV32I-NEXT: .LBB0_2: ; RV32I-NEXT: ret ; @@ -52,9 +52,9 @@ ; RV32IZbbNOZbt-NEXT: slt a1, a0, a2 ; RV32IZbbNOZbt-NEXT: beq a3, a1, .LBB0_2 ; RV32IZbbNOZbt-NEXT: # %bb.1: -; RV32IZbbNOZbt-NEXT: slti a0, a0, 0 +; RV32IZbbNOZbt-NEXT: srai a0, a0, 31 ; RV32IZbbNOZbt-NEXT: lui a1, 524288 -; RV32IZbbNOZbt-NEXT: sub a0, a1, a0 +; RV32IZbbNOZbt-NEXT: xor a0, a0, a1 ; RV32IZbbNOZbt-NEXT: .LBB0_2: ; RV32IZbbNOZbt-NEXT: ret ; @@ -73,10 +73,9 @@ ; RV32IZbbZbt-NEXT: sub a1, a0, a1 ; RV32IZbbZbt-NEXT: slt a0, a1, a0 ; RV32IZbbZbt-NEXT: xor a0, a2, a0 -; RV32IZbbZbt-NEXT: slti a2, a1, 0 +; RV32IZbbZbt-NEXT: srai a2, a1, 31 ; RV32IZbbZbt-NEXT: lui a3, 524288 -; RV32IZbbZbt-NEXT: addi a4, a3, -1 -; RV32IZbbZbt-NEXT: cmov a2, a2, a4, a3 +; RV32IZbbZbt-NEXT: xor a2, a2, a3 ; RV32IZbbZbt-NEXT: cmov a0, a0, a2, a1 ; RV32IZbbZbt-NEXT: ret %tmp = call i32 @llvm.ssub.sat.i32(i32 %x, i32 %y); @@ -98,11 +97,9 @@ ; RV32I-NEXT: sub a0, a0, a2 ; RV32I-NEXT: ret ; RV32I-NEXT: .LBB1_2: -; RV32I-NEXT: slti a0, a1, 0 -; RV32I-NEXT: lui a2, 524288 -; RV32I-NEXT: sub a2, a2, a0 ; RV32I-NEXT: srai a0, a1, 31 -; RV32I-NEXT: mv a1, a2 +; RV32I-NEXT: lui a1, 524288 +; RV32I-NEXT: xor a1, a0, a1 ; RV32I-NEXT: ret ; ; RV64I-LABEL: func2: @@ -113,10 +110,10 @@ ; RV64I-NEXT: slt a1, a0, a2 ; RV64I-NEXT: beq a3, a1, .LBB1_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: slti a0, a0, 0 +; RV64I-NEXT: srai a0, a0, 63 ; RV64I-NEXT: addi a1, zero, -1 ; RV64I-NEXT: slli a1, a1, 63 -; RV64I-NEXT: sub a0, a1, a0 +; RV64I-NEXT: xor a0, a0, a1 ; RV64I-NEXT: .LBB1_2: ; RV64I-NEXT: ret ; @@ -134,11 +131,9 @@ ; RV32IZbbNOZbt-NEXT: sub a0, a0, a2 ; RV32IZbbNOZbt-NEXT: ret ; RV32IZbbNOZbt-NEXT: .LBB1_2: -; RV32IZbbNOZbt-NEXT: slti a0, a1, 0 -; RV32IZbbNOZbt-NEXT: lui a2, 524288 -; RV32IZbbNOZbt-NEXT: sub a2, a2, a0 ; RV32IZbbNOZbt-NEXT: srai a0, a1, 31 -; RV32IZbbNOZbt-NEXT: mv a1, a2 +; RV32IZbbNOZbt-NEXT: lui a1, 524288 +; RV32IZbbNOZbt-NEXT: xor a1, a0, a1 ; RV32IZbbNOZbt-NEXT: ret ; ; RV64IZbbNOZbt-LABEL: func2: @@ -149,10 
+144,10 @@ ; RV64IZbbNOZbt-NEXT: slt a1, a0, a2 ; RV64IZbbNOZbt-NEXT: beq a3, a1, .LBB1_2 ; RV64IZbbNOZbt-NEXT: # %bb.1: -; RV64IZbbNOZbt-NEXT: slti a0, a0, 0 +; RV64IZbbNOZbt-NEXT: srai a0, a0, 63 ; RV64IZbbNOZbt-NEXT: addi a1, zero, -1 ; RV64IZbbNOZbt-NEXT: slli a1, a1, 63 -; RV64IZbbNOZbt-NEXT: sub a0, a1, a0 +; RV64IZbbNOZbt-NEXT: xor a0, a0, a1 ; RV64IZbbNOZbt-NEXT: .LBB1_2: ; RV64IZbbNOZbt-NEXT: ret ; @@ -161,18 +156,16 @@ ; RV32IZbbZbt-NEXT: sltu a4, a0, a2 ; RV32IZbbZbt-NEXT: sub a5, a1, a3 ; RV32IZbbZbt-NEXT: sub a4, a5, a4 -; RV32IZbbZbt-NEXT: slti a7, a4, 0 -; RV32IZbbZbt-NEXT: lui a6, 524288 -; RV32IZbbZbt-NEXT: addi a5, a6, -1 -; RV32IZbbZbt-NEXT: cmov a6, a7, a5, a6 +; RV32IZbbZbt-NEXT: srai a6, a4, 31 +; RV32IZbbZbt-NEXT: lui a5, 524288 +; RV32IZbbZbt-NEXT: xor a7, a6, a5 ; RV32IZbbZbt-NEXT: xor a5, a1, a4 ; RV32IZbbZbt-NEXT: xor a1, a1, a3 ; RV32IZbbZbt-NEXT: and a1, a1, a5 ; RV32IZbbZbt-NEXT: slti a3, a1, 0 -; RV32IZbbZbt-NEXT: cmov a1, a3, a6, a4 -; RV32IZbbZbt-NEXT: srai a4, a4, 31 +; RV32IZbbZbt-NEXT: cmov a1, a3, a7, a4 ; RV32IZbbZbt-NEXT: sub a0, a0, a2 -; RV32IZbbZbt-NEXT: cmov a0, a3, a4, a0 +; RV32IZbbZbt-NEXT: cmov a0, a3, a6, a0 ; RV32IZbbZbt-NEXT: ret ; ; RV64IZbbZbt-LABEL: func2: @@ -181,11 +174,10 @@ ; RV64IZbbZbt-NEXT: sub a1, a0, a1 ; RV64IZbbZbt-NEXT: slt a0, a1, a0 ; RV64IZbbZbt-NEXT: xor a0, a2, a0 -; RV64IZbbZbt-NEXT: slti a2, a1, 0 +; RV64IZbbZbt-NEXT: srai a2, a1, 63 ; RV64IZbbZbt-NEXT: addi a3, zero, -1 -; RV64IZbbZbt-NEXT: slli a4, a3, 63 -; RV64IZbbZbt-NEXT: srli a3, a3, 1 -; RV64IZbbZbt-NEXT: cmov a2, a2, a3, a4 +; RV64IZbbZbt-NEXT: slli a3, a3, 63 +; RV64IZbbZbt-NEXT: xor a2, a2, a3 ; RV64IZbbZbt-NEXT: cmov a0, a0, a2, a1 ; RV64IZbbZbt-NEXT: ret %tmp = call i64 @llvm.ssub.sat.i64(i64 %x, i64 %y); Index: llvm/test/CodeGen/RISCV/ssub_sat_plus.ll =================================================================== --- llvm/test/CodeGen/RISCV/ssub_sat_plus.ll +++ llvm/test/CodeGen/RISCV/ssub_sat_plus.ll @@ -22,9 +22,9 @@ ; RV32I-NEXT: slt a2, a0, a3 ; RV32I-NEXT: beq a1, a2, .LBB0_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slti a0, a0, 0 +; RV32I-NEXT: srai a0, a0, 31 ; RV32I-NEXT: lui a1, 524288 -; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: xor a0, a0, a1 ; RV32I-NEXT: .LBB0_2: ; RV32I-NEXT: ret ; @@ -56,9 +56,9 @@ ; RV32IZbbNOZbt-NEXT: slt a2, a0, a3 ; RV32IZbbNOZbt-NEXT: beq a1, a2, .LBB0_2 ; RV32IZbbNOZbt-NEXT: # %bb.1: -; RV32IZbbNOZbt-NEXT: slti a0, a0, 0 +; RV32IZbbNOZbt-NEXT: srai a0, a0, 31 ; RV32IZbbNOZbt-NEXT: lui a1, 524288 -; RV32IZbbNOZbt-NEXT: sub a0, a1, a0 +; RV32IZbbNOZbt-NEXT: xor a0, a0, a1 ; RV32IZbbNOZbt-NEXT: .LBB0_2: ; RV32IZbbNOZbt-NEXT: ret ; @@ -80,10 +80,9 @@ ; RV32IZbbZbt-NEXT: sub a1, a0, a1 ; RV32IZbbZbt-NEXT: slt a0, a1, a0 ; RV32IZbbZbt-NEXT: xor a0, a2, a0 -; RV32IZbbZbt-NEXT: slti a2, a1, 0 +; RV32IZbbZbt-NEXT: srai a2, a1, 31 ; RV32IZbbZbt-NEXT: lui a3, 524288 -; RV32IZbbZbt-NEXT: addi a4, a3, -1 -; RV32IZbbZbt-NEXT: cmov a2, a2, a4, a3 +; RV32IZbbZbt-NEXT: xor a2, a2, a3 ; RV32IZbbZbt-NEXT: cmov a0, a0, a2, a1 ; RV32IZbbZbt-NEXT: ret %a = mul i32 %y, %z @@ -106,11 +105,9 @@ ; RV32I-NEXT: sub a0, a0, a4 ; RV32I-NEXT: ret ; RV32I-NEXT: .LBB1_2: -; RV32I-NEXT: slti a0, a1, 0 -; RV32I-NEXT: lui a2, 524288 -; RV32I-NEXT: sub a2, a2, a0 ; RV32I-NEXT: srai a0, a1, 31 -; RV32I-NEXT: mv a1, a2 +; RV32I-NEXT: lui a1, 524288 +; RV32I-NEXT: xor a1, a0, a1 ; RV32I-NEXT: ret ; ; RV64I-LABEL: func64: @@ -121,10 +118,10 @@ ; RV64I-NEXT: slt a1, a0, a1 ; RV64I-NEXT: beq a3, a1, .LBB1_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: slti a0, a0, 
0 +; RV64I-NEXT: srai a0, a0, 63 ; RV64I-NEXT: addi a1, zero, -1 ; RV64I-NEXT: slli a1, a1, 63 -; RV64I-NEXT: sub a0, a1, a0 +; RV64I-NEXT: xor a0, a0, a1 ; RV64I-NEXT: .LBB1_2: ; RV64I-NEXT: ret ; @@ -142,11 +139,9 @@ ; RV32IZbbNOZbt-NEXT: sub a0, a0, a4 ; RV32IZbbNOZbt-NEXT: ret ; RV32IZbbNOZbt-NEXT: .LBB1_2: -; RV32IZbbNOZbt-NEXT: slti a0, a1, 0 -; RV32IZbbNOZbt-NEXT: lui a2, 524288 -; RV32IZbbNOZbt-NEXT: sub a2, a2, a0 ; RV32IZbbNOZbt-NEXT: srai a0, a1, 31 -; RV32IZbbNOZbt-NEXT: mv a1, a2 +; RV32IZbbNOZbt-NEXT: lui a1, 524288 +; RV32IZbbNOZbt-NEXT: xor a1, a0, a1 ; RV32IZbbNOZbt-NEXT: ret ; ; RV64IZbbNOZbt-LABEL: func64: @@ -157,10 +152,10 @@ ; RV64IZbbNOZbt-NEXT: slt a1, a0, a1 ; RV64IZbbNOZbt-NEXT: beq a3, a1, .LBB1_2 ; RV64IZbbNOZbt-NEXT: # %bb.1: -; RV64IZbbNOZbt-NEXT: slti a0, a0, 0 +; RV64IZbbNOZbt-NEXT: srai a0, a0, 63 ; RV64IZbbNOZbt-NEXT: addi a1, zero, -1 ; RV64IZbbNOZbt-NEXT: slli a1, a1, 63 -; RV64IZbbNOZbt-NEXT: sub a0, a1, a0 +; RV64IZbbNOZbt-NEXT: xor a0, a0, a1 ; RV64IZbbNOZbt-NEXT: .LBB1_2: ; RV64IZbbNOZbt-NEXT: ret ; @@ -169,18 +164,16 @@ ; RV32IZbbZbt-NEXT: sltu a2, a0, a4 ; RV32IZbbZbt-NEXT: sub a3, a1, a5 ; RV32IZbbZbt-NEXT: sub a2, a3, a2 -; RV32IZbbZbt-NEXT: slti a7, a2, 0 -; RV32IZbbZbt-NEXT: lui a6, 524288 -; RV32IZbbZbt-NEXT: addi a3, a6, -1 -; RV32IZbbZbt-NEXT: cmov a6, a7, a3, a6 +; RV32IZbbZbt-NEXT: srai a6, a2, 31 +; RV32IZbbZbt-NEXT: lui a3, 524288 +; RV32IZbbZbt-NEXT: xor a7, a6, a3 ; RV32IZbbZbt-NEXT: xor a3, a1, a2 ; RV32IZbbZbt-NEXT: xor a1, a1, a5 ; RV32IZbbZbt-NEXT: and a1, a1, a3 ; RV32IZbbZbt-NEXT: slti a3, a1, 0 -; RV32IZbbZbt-NEXT: cmov a1, a3, a6, a2 -; RV32IZbbZbt-NEXT: srai a2, a2, 31 +; RV32IZbbZbt-NEXT: cmov a1, a3, a7, a2 ; RV32IZbbZbt-NEXT: sub a0, a0, a4 -; RV32IZbbZbt-NEXT: cmov a0, a3, a2, a0 +; RV32IZbbZbt-NEXT: cmov a0, a3, a6, a0 ; RV32IZbbZbt-NEXT: ret ; ; RV64IZbbZbt-LABEL: func64: @@ -189,11 +182,10 @@ ; RV64IZbbZbt-NEXT: sub a2, a0, a2 ; RV64IZbbZbt-NEXT: slt a0, a2, a0 ; RV64IZbbZbt-NEXT: xor a0, a1, a0 -; RV64IZbbZbt-NEXT: slti a1, a2, 0 +; RV64IZbbZbt-NEXT: srai a1, a2, 63 ; RV64IZbbZbt-NEXT: addi a3, zero, -1 -; RV64IZbbZbt-NEXT: slli a4, a3, 63 -; RV64IZbbZbt-NEXT: srli a3, a3, 1 -; RV64IZbbZbt-NEXT: cmov a1, a1, a3, a4 +; RV64IZbbZbt-NEXT: slli a3, a3, 63 +; RV64IZbbZbt-NEXT: xor a1, a1, a3 ; RV64IZbbZbt-NEXT: cmov a0, a0, a1, a2 ; RV64IZbbZbt-NEXT: ret %a = mul i64 %y, %z Index: llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll +++ llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll @@ -60,20 +60,14 @@ ; CHECK-NEXT: cmp r5, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: asrne r3, r1, #31 -; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: cset r2, mi -; CHECK-NEXT: mvn r3, #-2147483648 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: cinv r2, r3, eq ; CHECK-NEXT: cmp.w r12, #0 -; CHECK-NEXT: csel r0, r2, r0, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r2, mi -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: cinv r2, r3, eq +; CHECK-NEXT: mov.w r2, #-2147483648 +; CHECK-NEXT: it ne +; CHECK-NEXT: eorne.w r0, r2, r0, asr #31 ; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csel r1, r2, r1, ne +; CHECK-NEXT: it ne +; CHECK-NEXT: eorne.w r1, r2, r1, asr #31 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: @@ -183,40 +177,34 @@ ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: subs r2, r2, 
r0 ; CHECK-NEXT: eor.w r12, r3, r1 -; CHECK-NEXT: sbc.w r1, r3, r1 -; CHECK-NEXT: eor.w r2, r3, r1 +; CHECK-NEXT: sbc.w r0, r3, r1 +; CHECK-NEXT: eor.w r1, r3, r0 ; CHECK-NEXT: vmov r3, r4, d0 -; CHECK-NEXT: ands.w r2, r2, r12 -; CHECK-NEXT: vmov lr, r2, d2 +; CHECK-NEXT: ands.w r1, r1, r12 +; CHECK-NEXT: vmov lr, r1, d2 ; CHECK-NEXT: cset r12, mi ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: asrne r0, r1, #31 +; CHECK-NEXT: asrne r2, r0, #31 ; CHECK-NEXT: subs.w r3, r3, lr -; CHECK-NEXT: eor.w r5, r4, r2 -; CHECK-NEXT: sbc.w r2, r4, r2 -; CHECK-NEXT: eors r4, r2 +; CHECK-NEXT: eor.w r5, r4, r1 +; CHECK-NEXT: sbc.w r1, r4, r1 +; CHECK-NEXT: eors r4, r1 ; CHECK-NEXT: ands r5, r4 ; CHECK-NEXT: cset r5, mi ; CHECK-NEXT: cmp r5, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: asrne r3, r2, #31 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r0 -; CHECK-NEXT: cset r0, mi -; CHECK-NEXT: mvn r3, #-2147483648 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: cinv r0, r3, eq +; CHECK-NEXT: asrne r3, r1, #31 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: cmp.w r12, #0 -; CHECK-NEXT: csel r0, r0, r1, ne -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: cset r1, mi -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cinv r1, r3, eq +; CHECK-NEXT: mov.w r2, #-2147483648 +; CHECK-NEXT: it ne +; CHECK-NEXT: eorne.w r0, r2, r0, asr #31 ; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csel r1, r1, r2, ne +; CHECK-NEXT: it ne +; CHECK-NEXT: eorne.w r1, r2, r1, asr #31 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: Index: llvm/test/CodeGen/X86/combine-add-ssat.ll =================================================================== --- llvm/test/CodeGen/X86/combine-add-ssat.ll +++ llvm/test/CodeGen/X86/combine-add-ssat.ll @@ -77,11 +77,10 @@ define i32 @combine_constant_i32(i32 %a0) { ; CHECK-LABEL: combine_constant_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: incl %ecx -; CHECK-NEXT: setns %al -; CHECK-NEXT: addl $2147483647, %eax # imm = 0x7FFFFFFF +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NEXT: leal 1(%rdi), %eax +; CHECK-NEXT: sarl $31, %eax +; CHECK-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 ; CHECK-NEXT: incl %edi ; CHECK-NEXT: cmovnol %edi, %eax ; CHECK-NEXT: retq @@ -125,13 +124,13 @@ define i32 @combine_no_overflow_i32(i32 %a0, i32 %a1) { ; CHECK-LABEL: combine_no_overflow_i32: ; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $esi killed $esi def $rsi +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NEXT: sarl $16, %edi ; CHECK-NEXT: shrl $16, %esi -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: addl %esi, %ecx -; CHECK-NEXT: setns %al -; CHECK-NEXT: addl $2147483647, %eax # imm = 0x7FFFFFFF +; CHECK-NEXT: leal (%rdi,%rsi), %eax +; CHECK-NEXT: sarl $31, %eax +; CHECK-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 ; CHECK-NEXT: addl %edi, %esi ; CHECK-NEXT: cmovnol %esi, %eax ; CHECK-NEXT: retq Index: llvm/test/CodeGen/X86/sadd_sat.ll =================================================================== --- llvm/test/CodeGen/X86/sadd_sat.ll +++ llvm/test/CodeGen/X86/sadd_sat.ll @@ -12,26 +12,22 @@ define i32 @func(i32 %x, i32 %y) nounwind { ; X86-LABEL: func: ; X86: # %bb.0: -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edx, %esi -; X86-NEXT: setns %cl -; X86-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; X86-NEXT: addl 
%edx, %eax -; X86-NEXT: cmovol %ecx, %eax -; X86-NEXT: popl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%eax,%ecx), %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: cmovol %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: func: ; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: addl %esi, %ecx -; X64-NEXT: setns %al -; X64-NEXT: addl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: leal (%rdi,%rsi), %eax +; X64-NEXT: sarl $31, %eax +; X64-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 ; X64-NEXT: addl %esi, %edi ; X64-NEXT: cmovnol %edi, %eax ; X64-NEXT: retq @@ -43,34 +39,27 @@ ; X86-LABEL: func2: ; X86: # %bb.0: ; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi +; X86-NEXT: addl {{[0-9]+}}(%esp), %eax +; X86-NEXT: adcl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: seto %bl -; X86-NEXT: movl %esi, %eax -; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: sarl $31, %edx ; X86-NEXT: testb %bl, %bl -; X86-NEXT: cmovel %ecx, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: testl %esi, %esi -; X86-NEXT: setns %dl -; X86-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF +; X86-NEXT: cmovnel %edx, %eax +; X86-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 ; X86-NEXT: testb %bl, %bl -; X86-NEXT: cmovel %esi, %edx -; X86-NEXT: popl %esi +; X86-NEXT: cmovel %ecx, %edx ; X86-NEXT: popl %ebx ; X86-NEXT: retl ; ; X64-LABEL: func2: ; X64: # %bb.0: -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: addq %rsi, %rax -; X64-NEXT: setns %cl -; X64-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF -; X64-NEXT: addq %rcx, %rax +; X64-NEXT: leaq (%rdi,%rsi), %rcx +; X64-NEXT: sarq $63, %rcx +; X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; X64-NEXT: xorq %rcx, %rax ; X64-NEXT: addq %rsi, %rdi ; X64-NEXT: cmovnoq %rdi, %rax ; X64-NEXT: retq @@ -81,27 +70,26 @@ define signext i16 @func16(i16 signext %x, i16 signext %y) nounwind { ; X86-LABEL: func16: ; X86: # %bb.0: -; X86-NEXT: pushl %esi ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addw %dx, %si -; X86-NEXT: setns %cl -; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X86-NEXT: addw %dx, %ax -; X86-NEXT: cmovol %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: addw %cx, %dx +; X86-NEXT: movswl %dx, %edx +; X86-NEXT: sarl $15, %edx +; X86-NEXT: xorl $-32768, %edx # imm = 0x8000 +; X86-NEXT: addw %cx, %ax +; X86-NEXT: cmovol %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: func16: ; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: addw %si, %cx -; X64-NEXT: setns %al -; X64-NEXT: addl $32767, %eax # imm = 0x7FFF +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: leal (%rdi,%rsi), %eax +; X64-NEXT: cwtl +; X64-NEXT: sarl $15, %eax +; X64-NEXT: xorl $-32768, %eax # imm = 0x8000 ; X64-NEXT: addw %si, %di ; X64-NEXT: cmovnol %edi, %eax ; X64-NEXT: # 
kill: def $ax killed $ax killed $eax @@ -114,28 +102,29 @@ ; X86-LABEL: func8: ; X86: # %bb.0: ; X86-NEXT: movb {{[0-9]+}}(%esp), %al -; X86-NEXT: movb {{[0-9]+}}(%esp), %dl -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: movb %al, %ah -; X86-NEXT: addb %dl, %ah -; X86-NEXT: setns %cl -; X86-NEXT: addl $127, %ecx -; X86-NEXT: addb %dl, %al -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: cmovol %ecx, %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movl %eax, %edx +; X86-NEXT: addb %cl, %dl +; X86-NEXT: sarb $7, %dl +; X86-NEXT: xorb $-128, %dl +; X86-NEXT: addb %cl, %al +; X86-NEXT: movzbl %al, %ecx +; X86-NEXT: movzbl %dl, %eax +; X86-NEXT: cmovnol %ecx, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; ; X64-LABEL: func8: ; X64: # %bb.0: -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: movl %edi, %eax -; X64-NEXT: addb %sil, %al -; X64-NEXT: setns %cl -; X64-NEXT: addl $127, %ecx +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: leal (%rdi,%rsi), %eax +; X64-NEXT: sarb $7, %al +; X64-NEXT: xorb $-128, %al ; X64-NEXT: addb %sil, %dil -; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: cmovol %ecx, %eax +; X64-NEXT: movzbl %dil, %ecx +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: cmovnol %ecx, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %tmp = call i8 @llvm.sadd.sat.i8(i8 %x, i8 %y) @@ -176,72 +165,59 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-LABEL: vec: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal (%ecx,%eax), %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 +; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: addl %edx, %esi -; X86-NEXT: setns %al -; X86-NEXT: addl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmovol %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: setns %al -; X86-NEXT: addl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-NEXT: addl %esi, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovol %eax, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: addl %edi, %ebx -; X86-NEXT: setns %al -; X86-NEXT: addl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-NEXT: addl %edi, %esi +; X86-NEXT: cmovol %esi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal (%edx,%eax), %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 +; X86-NEXT: addl %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovol %eax, %esi +; X86-NEXT: cmovol %esi, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: addl %eax, %ebp -; X86-NEXT: setns %bl -; X86-NEXT: addl $2147483647, %ebx # imm = 0x7FFFFFFF +; X86-NEXT: leal (%edi,%eax), %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 ; X86-NEXT: addl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: cmovol %esi, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal (%ebx,%eax), %esi +; X86-NEXT: sarl $31, %esi +; 
X86-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 +; X86-NEXT: addl %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmovol %ebx, %edi +; X86-NEXT: cmovol %esi, %ebx ; X86-NEXT: movl %ecx, 12(%eax) ; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %ebx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: vec: ; X64: # %bb.0: ; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: pxor %xmm3, %xmm3 -; X64-NEXT: pcmpgtd %xmm1, %xmm3 +; X64-NEXT: pcmpgtd %xmm1, %xmm2 ; X64-NEXT: paddd %xmm0, %xmm1 ; X64-NEXT: pcmpgtd %xmm1, %xmm0 -; X64-NEXT: pxor %xmm3, %xmm0 -; X64-NEXT: movdqa %xmm1, %xmm3 -; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; X64-NEXT: pcmpgtd %xmm1, %xmm2 -; X64-NEXT: psrld $1, %xmm2 -; X64-NEXT: por %xmm3, %xmm2 -; X64-NEXT: pand %xmm0, %xmm2 -; X64-NEXT: pandn %xmm1, %xmm0 +; X64-NEXT: pxor %xmm2, %xmm0 +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: pandn %xmm1, %xmm2 +; X64-NEXT: psrad $31, %xmm1 +; X64-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-NEXT: pand %xmm1, %xmm0 ; X64-NEXT: por %xmm2, %xmm0 ; X64-NEXT: retq %tmp = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y); Index: llvm/test/CodeGen/X86/sadd_sat_plus.ll =================================================================== --- llvm/test/CodeGen/X86/sadd_sat_plus.ll +++ llvm/test/CodeGen/X86/sadd_sat_plus.ll @@ -11,28 +11,24 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind { ; X86-LABEL: func32: ; X86: # %bb.0: -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edx, %esi -; X86-NEXT: setns %cl -; X86-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; X86-NEXT: addl %edx, %eax -; X86-NEXT: cmovol %ecx, %eax -; X86-NEXT: popl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%eax,%ecx), %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: cmovol %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: func32: ; X64: # %bb.0: +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: imull %edx, %esi -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: addl %esi, %ecx -; X64-NEXT: setns %al -; X64-NEXT: addl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: leal (%rdi,%rsi), %eax +; X64-NEXT: sarl $31, %eax +; X64-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 ; X64-NEXT: addl %edi, %esi ; X64-NEXT: cmovnol %esi, %eax ; X64-NEXT: retq @@ -45,34 +41,27 @@ ; X86-LABEL: func64: ; X86: # %bb.0: ; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi +; X86-NEXT: addl {{[0-9]+}}(%esp), %eax +; X86-NEXT: adcl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: seto %bl -; X86-NEXT: movl %esi, %eax -; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: sarl $31, %edx ; X86-NEXT: testb %bl, %bl -; X86-NEXT: cmovel %ecx, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: testl %esi, %esi -; X86-NEXT: setns %dl -; X86-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF +; X86-NEXT: cmovnel %edx, %eax +; 
X86-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 ; X86-NEXT: testb %bl, %bl -; X86-NEXT: cmovel %esi, %edx -; X86-NEXT: popl %esi +; X86-NEXT: cmovel %ecx, %edx ; X86-NEXT: popl %ebx ; X86-NEXT: retl ; ; X64-LABEL: func64: ; X64: # %bb.0: -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: addq %rdx, %rax -; X64-NEXT: setns %cl -; X64-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF -; X64-NEXT: addq %rcx, %rax +; X64-NEXT: leaq (%rdi,%rdx), %rcx +; X64-NEXT: sarq $63, %rcx +; X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; X64-NEXT: xorq %rcx, %rax ; X64-NEXT: addq %rdx, %rdi ; X64-NEXT: cmovnoq %rdi, %rax ; X64-NEXT: retq @@ -84,31 +73,30 @@ define signext i16 @func16(i16 signext %x, i16 signext %y, i16 signext %z) nounwind { ; X86-LABEL: func16: ; X86: # %bb.0: -; X86-NEXT: pushl %esi ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imulw {{[0-9]+}}(%esp), %ax -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addw %dx, %si -; X86-NEXT: setns %cl -; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X86-NEXT: addw %dx, %ax -; X86-NEXT: cmovol %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: addw %cx, %dx +; X86-NEXT: movswl %dx, %edx +; X86-NEXT: sarl $15, %edx +; X86-NEXT: xorl $-32768, %edx # imm = 0x8000 +; X86-NEXT: addw %cx, %ax +; X86-NEXT: cmovol %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: func16: ; X64: # %bb.0: +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: imull %edx, %esi -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: addw %si, %cx -; X64-NEXT: setns %al -; X64-NEXT: addl $32767, %eax # imm = 0x7FFF -; X64-NEXT: addw %si, %di -; X64-NEXT: cmovnol %edi, %eax +; X64-NEXT: leal (%rdi,%rsi), %eax +; X64-NEXT: cwtl +; X64-NEXT: sarl $15, %eax +; X64-NEXT: xorl $-32768, %eax # imm = 0x8000 +; X64-NEXT: addw %di, %si +; X64-NEXT: cmovnol %esi, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %a = mul i16 %y, %z @@ -121,31 +109,32 @@ ; X86: # %bb.0: ; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: mulb {{[0-9]+}}(%esp) -; X86-NEXT: movb {{[0-9]+}}(%esp), %dl -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: movb %al, %ah -; X86-NEXT: addb %dl, %ah -; X86-NEXT: setns %cl -; X86-NEXT: addl $127, %ecx -; X86-NEXT: addb %dl, %al -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: cmovol %ecx, %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movl %eax, %edx +; X86-NEXT: addb %cl, %dl +; X86-NEXT: sarb $7, %dl +; X86-NEXT: xorb $-128, %dl +; X86-NEXT: addb %cl, %al +; X86-NEXT: movzbl %al, %ecx +; X86-NEXT: movzbl %dl, %eax +; X86-NEXT: cmovnol %ecx, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; ; X64-LABEL: func8: ; X64: # %bb.0: ; X64-NEXT: movl %esi, %eax +; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: mulb %dl -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: movl %edi, %edx -; X64-NEXT: addb %al, %dl -; X64-NEXT: setns %cl -; X64-NEXT: addl $127, %ecx -; X64-NEXT: addb %al, %dil -; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: cmovol %ecx, %eax +; X64-NEXT: # kill: def $al killed $al def $rax +; X64-NEXT: leal (%rdi,%rax), %ecx +; X64-NEXT: sarb $7, %cl +; X64-NEXT: xorb $-128, %cl +; X64-NEXT: addb %dil, %al +; X64-NEXT: movzbl %al, %edx +; 
X64-NEXT: movzbl %cl, %eax +; X64-NEXT: cmovnol %edx, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %a = mul i8 %y, %z Index: llvm/test/CodeGen/X86/sadd_sat_vec.ll =================================================================== --- llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -429,32 +429,30 @@ define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind { ; SSE-LABEL: v1i8: ; SSE: # %bb.0: -; SSE-NEXT: movb (%rdi), %cl -; SSE-NEXT: movb (%rsi), %dil -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: movl %ecx, %eax -; SSE-NEXT: addb %dil, %al -; SSE-NEXT: setns %sil -; SSE-NEXT: addl $127, %esi -; SSE-NEXT: addb %dil, %cl +; SSE-NEXT: movb (%rdi), %al +; SSE-NEXT: movb (%rsi), %cl +; SSE-NEXT: leal (%rax,%rcx), %esi +; SSE-NEXT: sarb $7, %sil +; SSE-NEXT: xorb $-128, %sil +; SSE-NEXT: addb %al, %cl ; SSE-NEXT: movzbl %cl, %eax -; SSE-NEXT: cmovol %esi, %eax -; SSE-NEXT: movb %al, (%rdx) +; SSE-NEXT: movzbl %sil, %ecx +; SSE-NEXT: cmovnol %eax, %ecx +; SSE-NEXT: movb %cl, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: v1i8: ; AVX: # %bb.0: -; AVX-NEXT: movb (%rdi), %cl -; AVX-NEXT: movb (%rsi), %dil -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: addb %dil, %al -; AVX-NEXT: setns %sil -; AVX-NEXT: addl $127, %esi -; AVX-NEXT: addb %dil, %cl +; AVX-NEXT: movb (%rdi), %al +; AVX-NEXT: movb (%rsi), %cl +; AVX-NEXT: leal (%rax,%rcx), %esi +; AVX-NEXT: sarb $7, %sil +; AVX-NEXT: xorb $-128, %sil +; AVX-NEXT: addb %al, %cl ; AVX-NEXT: movzbl %cl, %eax -; AVX-NEXT: cmovol %esi, %eax -; AVX-NEXT: movb %al, (%rdx) +; AVX-NEXT: movzbl %sil, %ecx +; AVX-NEXT: cmovnol %eax, %ecx +; AVX-NEXT: movb %cl, (%rdx) ; AVX-NEXT: retq %x = load <1 x i8>, <1 x i8>* %px %y = load <1 x i8>, <1 x i8>* %py @@ -468,28 +466,26 @@ ; SSE: # %bb.0: ; SSE-NEXT: movzwl (%rdi), %eax ; SSE-NEXT: movzwl (%rsi), %ecx -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: movl %eax, %edi -; SSE-NEXT: addw %cx, %di -; SSE-NEXT: setns %sil -; SSE-NEXT: addl $32767, %esi # imm = 0x7FFF -; SSE-NEXT: addw %cx, %ax -; SSE-NEXT: cmovol %esi, %eax -; SSE-NEXT: movw %ax, (%rdx) +; SSE-NEXT: leal (%rax,%rcx), %esi +; SSE-NEXT: movswl %si, %esi +; SSE-NEXT: sarl $15, %esi +; SSE-NEXT: xorl $-32768, %esi # imm = 0x8000 +; SSE-NEXT: addw %ax, %cx +; SSE-NEXT: cmovol %esi, %ecx +; SSE-NEXT: movw %cx, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: v1i16: ; AVX: # %bb.0: ; AVX-NEXT: movzwl (%rdi), %eax ; AVX-NEXT: movzwl (%rsi), %ecx -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: movl %eax, %edi -; AVX-NEXT: addw %cx, %di -; AVX-NEXT: setns %sil -; AVX-NEXT: addl $32767, %esi # imm = 0x7FFF -; AVX-NEXT: addw %cx, %ax -; AVX-NEXT: cmovol %esi, %eax -; AVX-NEXT: movw %ax, (%rdx) +; AVX-NEXT: leal (%rax,%rcx), %esi +; AVX-NEXT: movswl %si, %esi +; AVX-NEXT: sarl $15, %esi +; AVX-NEXT: xorl $-32768, %esi # imm = 0x8000 +; AVX-NEXT: addw %ax, %cx +; AVX-NEXT: cmovol %esi, %ecx +; AVX-NEXT: movw %cx, (%rdx) ; AVX-NEXT: retq %x = load <1 x i16>, <1 x i16>* %px %y = load <1 x i16>, <1 x i16>* %py @@ -598,84 +594,76 @@ ; SSE2-LABEL: v2i32: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm0, 
%xmm2 -; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v2i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 ; SSSE3-NEXT: paddd %xmm0, %xmm1 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm3, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: psrld $1, %xmm2 -; SSSE3-NEXT: por %xmm3, %xmm2 -; SSSE3-NEXT: pand %xmm0, %xmm2 -; SSSE3-NEXT: pandn %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pandn %xmm1, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm0 ; SSSE3-NEXT: por %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v2i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: paddd %xmm1, %xmm3 -; SSE41-NEXT: movaps {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE41-NEXT: pxor %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: movaps %xmm3, %xmm0 +; SSE41-NEXT: paddd %xmm1, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v2i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX1-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm1 +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vblendvps %xmm2, %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm2, %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: v2i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm2 -; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] -; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; AVX512F-NEXT: vblendvps %xmm2, %xmm3, %xmm4, %xmm3 ; AVX512F-NEXT: vpcmpgtd %xmm2, %xmm0, 
%xmm0 ; AVX512F-NEXT: vpxor %xmm0, %xmm1, %xmm0 -; AVX512F-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 +; AVX512F-NEXT: vpsrad $31, %xmm2, %xmm1 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX512F-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i32: @@ -685,10 +673,8 @@ ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 ; AVX512BW-NEXT: kxorw %k1, %k0, %k1 -; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm2, %k2 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] -; AVX512BW-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k2} -; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; AVX512BW-NEXT: vpsrad $31, %xmm1, %xmm0 +; AVX512BW-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512BW-NEXT: retq %z = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %x, <2 x i32> %y) @@ -699,84 +685,76 @@ ; SSE2-LABEL: v4i32: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v4i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 ; SSSE3-NEXT: paddd %xmm0, %xmm1 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm3, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: psrld $1, %xmm2 -; SSSE3-NEXT: por %xmm3, %xmm2 -; SSSE3-NEXT: pand %xmm0, %xmm2 -; SSSE3-NEXT: pandn %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pandn %xmm1, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm0 ; SSSE3-NEXT: por %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v4i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: paddd %xmm1, %xmm3 -; SSE41-NEXT: movaps {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE41-NEXT: pxor %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: movaps %xmm3, %xmm0 +; SSE41-NEXT: paddd %xmm1, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v4i32: ; AVX1: # %bb.0: ; 
AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX1-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm1 +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v4i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vblendvps %xmm2, %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm2, %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: v4i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm2 -; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] -; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; AVX512F-NEXT: vblendvps %xmm2, %xmm3, %xmm4, %xmm3 ; AVX512F-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpxor %xmm0, %xmm1, %xmm0 -; AVX512F-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 +; AVX512F-NEXT: vpsrad $31, %xmm2, %xmm1 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX512F-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v4i32: @@ -786,10 +764,8 @@ ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 ; AVX512BW-NEXT: kxorw %k1, %k0, %k1 -; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm2, %k2 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] -; AVX512BW-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k2} -; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; AVX512BW-NEXT: vpsrad $31, %xmm1, %xmm0 +; AVX512BW-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512BW-NEXT: retq %z = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y) @@ -805,29 +781,23 @@ ; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pandn %xmm5, %xmm6 -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm7 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: por %xmm6, %xmm7 -; SSE2-NEXT: pand %xmm0, %xmm7 -; SSE2-NEXT: pandn %xmm2, %xmm0 -; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm5 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: pxor %xmm6, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; SSE2-NEXT: paddd %xmm1, %xmm3 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm2, 
%xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm5, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: pandn %xmm3, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: pxor %xmm3, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v8i32: @@ -838,55 +808,47 @@ ; SSSE3-NEXT: paddd %xmm0, %xmm2 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 ; SSSE3-NEXT: pxor %xmm5, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm2, %xmm6 -; SSSE3-NEXT: pandn %xmm5, %xmm6 -; SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm7 -; SSSE3-NEXT: psrld $1, %xmm7 -; SSSE3-NEXT: por %xmm6, %xmm7 -; SSSE3-NEXT: pand %xmm0, %xmm7 -; SSSE3-NEXT: pandn %xmm2, %xmm0 -; SSSE3-NEXT: por %xmm7, %xmm0 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-NEXT: pandn %xmm2, %xmm5 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: pxor %xmm6, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: por %xmm5, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 ; SSSE3-NEXT: paddd %xmm1, %xmm3 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: pandn %xmm5, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 -; SSSE3-NEXT: psrld $1, %xmm4 -; SSSE3-NEXT: por %xmm2, %xmm4 -; SSSE3-NEXT: pand %xmm1, %xmm4 -; SSSE3-NEXT: pandn %xmm3, %xmm1 -; SSSE3-NEXT: por %xmm4, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pandn %xmm3, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm3 +; SSSE3-NEXT: pxor %xmm3, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm1 +; SSSE3-NEXT: por %xmm2, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v8i32: ; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm1, %xmm5 ; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: paddd %xmm2, %xmm5 -; SSE41-NEXT: movaps {{.*#+}} xmm8 = [2147483647,2147483647,2147483647,2147483647] -; SSE41-NEXT: movaps {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] -; SSE41-NEXT: movaps %xmm6, %xmm7 +; SSE41-NEXT: paddd %xmm2, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE41-NEXT: pxor %xmm2, %xmm1 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm5, %xmm1 +; SSE41-NEXT: paddd %xmm3, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE41-NEXT: pxor %xmm3, %xmm5 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: psrad $31, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm8, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE41-NEXT: pxor %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm7, %xmm5 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: paddd %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm8, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm3, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm6, %xmm2 -; SSE41-NEXT: movaps %xmm5, %xmm0 
-; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm4, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v8i32: @@ -896,35 +858,37 @@ ; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm4 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm5 -; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX1-NEXT: vblendvps %ymm5, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: vblendvps %ymm0, %ymm6, %ymm5, %ymm0 +; AVX1-NEXT: vpsrad $31, %xmm4, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vblendvps %ymm0, %ymm1, %ymm5, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vblendvps %ymm2, %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vblendvps %ymm0, %ymm3, %ymm2, %ymm0 +; AVX2-NEXT: vpsrad $31, %ymm2, %ymm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: v8i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpaddd %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcastss {{.*#+}} ymm3 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] -; AVX512F-NEXT: vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX512F-NEXT: vblendvps %ymm2, %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpxor %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vblendvps %ymm0, %ymm3, %ymm2, %ymm0 +; AVX512F-NEXT: vpsrad $31, %ymm2, %ymm1 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX512F-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v8i32: @@ -934,10 +898,8 @@ ; AVX512BW-NEXT: vpaddd %ymm1, %ymm0, %ymm1 ; AVX512BW-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 ; AVX512BW-NEXT: kxorw %k1, %k0, %k1 -; AVX512BW-NEXT: vpcmpgtd %ymm1, %ymm2, %k2 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX512BW-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 {%k2} -; AVX512BW-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; AVX512BW-NEXT: vpsrad $31, %ymm1, %ymm0 +; AVX512BW-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1 {%k1} ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512BW-NEXT: retq %z = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> %x, <8 x i32> %y) @@ -953,57 +915,45 @@ ; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: 
pcmpgtd %xmm4, %xmm0 ; SSE2-NEXT: pxor %xmm9, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm10 +; SSE2-NEXT: pandn %xmm4, %xmm10 +; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm4, %xmm10 -; SSE2-NEXT: pandn %xmm9, %xmm10 -; SSE2-NEXT: pxor %xmm11, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm11 -; SSE2-NEXT: psrld $1, %xmm11 -; SSE2-NEXT: por %xmm10, %xmm11 -; SSE2-NEXT: pand %xmm0, %xmm11 -; SSE2-NEXT: pandn %xmm4, %xmm0 -; SSE2-NEXT: por %xmm11, %xmm0 +; SSE2-NEXT: pxor %xmm9, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm10, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 ; SSE2-NEXT: paddd %xmm1, %xmm5 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm10 -; SSE2-NEXT: pandn %xmm9, %xmm10 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: por %xmm10, %xmm4 -; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: pandn %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: psrad $31, %xmm5 +; SSE2-NEXT: pxor %xmm9, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 ; SSE2-NEXT: paddd %xmm2, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm4 -; SSE2-NEXT: pandn %xmm9, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm2, %xmm5 -; SSE2-NEXT: pandn %xmm6, %xmm2 -; SSE2-NEXT: por %xmm5, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: psrad $31, %xmm6 +; SSE2-NEXT: pxor %xmm9, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 ; SSE2-NEXT: paddd %xmm3, %xmm7 ; SSE2-NEXT: pcmpgtd %xmm7, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm7, %xmm4 -; SSE2-NEXT: pandn %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 -; SSE2-NEXT: psrld $1, %xmm8 -; SSE2-NEXT: por %xmm4, %xmm8 -; SSE2-NEXT: pand %xmm3, %xmm8 -; SSE2-NEXT: pandn %xmm7, %xmm3 -; SSE2-NEXT: por %xmm8, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm7, %xmm4 +; SSE2-NEXT: psrad $31, %xmm7 +; SSE2-NEXT: pxor %xmm7, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v16i32: @@ -1014,105 +964,89 @@ ; SSSE3-NEXT: paddd %xmm0, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm0 ; SSSE3-NEXT: pxor %xmm9, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm10 +; SSSE3-NEXT: pandn %xmm4, %xmm10 +; SSSE3-NEXT: psrad $31, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm4, %xmm10 -; SSSE3-NEXT: pandn %xmm9, %xmm10 -; SSSE3-NEXT: pxor %xmm11, %xmm11 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm11 -; SSSE3-NEXT: psrld $1, %xmm11 -; SSSE3-NEXT: por %xmm10, %xmm11 -; SSSE3-NEXT: pand %xmm0, %xmm11 -; SSSE3-NEXT: pandn %xmm4, %xmm0 -; SSSE3-NEXT: por %xmm11, %xmm0 +; SSSE3-NEXT: pxor %xmm9, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: por %xmm10, %xmm0 ; SSSE3-NEXT: pxor %xmm4, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 ; SSSE3-NEXT: paddd %xmm1, %xmm5 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1 ; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm5, %xmm10 -; 
SSSE3-NEXT: pandn %xmm9, %xmm10 -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 -; SSSE3-NEXT: psrld $1, %xmm4 -; SSSE3-NEXT: por %xmm10, %xmm4 -; SSSE3-NEXT: pand %xmm1, %xmm4 -; SSSE3-NEXT: pandn %xmm5, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pandn %xmm5, %xmm4 +; SSSE3-NEXT: psrad $31, %xmm5 +; SSSE3-NEXT: pxor %xmm9, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm1 ; SSSE3-NEXT: por %xmm4, %xmm1 ; SSSE3-NEXT: pxor %xmm4, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4 ; SSSE3-NEXT: paddd %xmm2, %xmm6 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm2 ; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: movdqa %xmm6, %xmm4 -; SSSE3-NEXT: pandn %xmm9, %xmm4 -; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 -; SSSE3-NEXT: psrld $1, %xmm5 -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: pand %xmm2, %xmm5 -; SSSE3-NEXT: pandn %xmm6, %xmm2 -; SSSE3-NEXT: por %xmm5, %xmm2 -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pandn %xmm6, %xmm4 +; SSSE3-NEXT: psrad $31, %xmm6 +; SSSE3-NEXT: pxor %xmm9, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm2 +; SSSE3-NEXT: por %xmm4, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 ; SSSE3-NEXT: paddd %xmm3, %xmm7 ; SSSE3-NEXT: pcmpgtd %xmm7, %xmm3 -; SSSE3-NEXT: pxor %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm7, %xmm4 -; SSSE3-NEXT: pandn %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 -; SSSE3-NEXT: psrld $1, %xmm8 -; SSSE3-NEXT: por %xmm4, %xmm8 -; SSSE3-NEXT: pand %xmm3, %xmm8 -; SSSE3-NEXT: pandn %xmm7, %xmm3 -; SSSE3-NEXT: por %xmm8, %xmm3 +; SSSE3-NEXT: pxor %xmm8, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pandn %xmm7, %xmm4 +; SSSE3-NEXT: psrad $31, %xmm7 +; SSSE3-NEXT: pxor %xmm7, %xmm9 +; SSSE3-NEXT: pand %xmm9, %xmm3 +; SSSE3-NEXT: por %xmm4, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v16i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm3, %xmm8 -; SSE41-NEXT: movdqa %xmm2, %xmm12 -; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm10 +; SSE41-NEXT: movdqa %xmm1, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm9 ; SSE41-NEXT: paddd %xmm4, %xmm9 -; SSE41-NEXT: movaps {{.*#+}} xmm11 = [2147483647,2147483647,2147483647,2147483647] -; SSE41-NEXT: movaps {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648] -; SSE41-NEXT: movaps %xmm10, %xmm2 -; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm3 -; SSE41-NEXT: pxor %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm9 -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: paddd %xmm5, %xmm4 -; SSE41-NEXT: movaps %xmm10, %xmm2 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE41-NEXT: pxor %xmm5, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm12, %xmm3 -; SSE41-NEXT: paddd %xmm6, %xmm3 -; SSE41-NEXT: movaps %xmm10, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE41-NEXT: pxor %xmm4, %xmm1 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm9 +; SSE41-NEXT: movdqa %xmm3, %xmm1 +; SSE41-NEXT: paddd %xmm5, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE41-NEXT: pxor %xmm5, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: pxor %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; 
SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm12 -; SSE41-NEXT: pxor %xmm6, %xmm12 -; SSE41-NEXT: movdqa %xmm12, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: movdqa %xmm8, %xmm5 -; SSE41-NEXT: paddd %xmm7, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm10 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm10, %xmm2 +; SSE41-NEXT: paddd %xmm6, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm10 +; SSE41-NEXT: pxor %xmm6, %xmm10 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrad $31, %xmm3 +; SSE41-NEXT: pxor %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm10, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm8, %xmm3 +; SSE41-NEXT: paddd %xmm7, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm8 ; SSE41-NEXT: pxor %xmm7, %xmm8 +; SSE41-NEXT: movdqa %xmm3, %xmm5 +; SSE41-NEXT: psrad $31, %xmm5 +; SSE41-NEXT: pxor %xmm4, %xmm5 ; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm10, %xmm5 +; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm3 ; SSE41-NEXT: movaps %xmm9, %xmm0 -; SSE41-NEXT: movaps %xmm4, %xmm1 -; SSE41-NEXT: movaps %xmm3, %xmm2 -; SSE41-NEXT: movaps %xmm5, %xmm3 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v16i32: @@ -1122,41 +1056,47 @@ ; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm6 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm7 -; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] -; AVX1-NEXT: vmovaps {{.*#+}} ymm9 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX1-NEXT: vblendvps %ymm7, %ymm8, %ymm9, %ymm10 -; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm5 ; AVX1-NEXT: vpcmpgtd %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm0, %ymm2, %ymm0 -; AVX1-NEXT: vblendvps %ymm0, %ymm10, %ymm7, %ymm0 +; AVX1-NEXT: vpsrad $31, %xmm6, %xmm2 +; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm7, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm6 -; AVX1-NEXT: vblendvps %ymm6, %ymm8, %ymm9, %ymm7 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm7 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpgtd %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-NEXT: vxorps %ymm1, %ymm3, %ymm1 -; AVX1-NEXT: vblendvps %ymm1, %ymm7, %ymm6, %ymm1 +; AVX1-NEXT: vpsrad $31, %xmm6, %xmm3 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm7, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v16i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm4 -; 
AVX2-NEXT: vbroadcastss {{.*#+}} ymm5 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vblendvps %ymm4, %ymm5, %ymm6, %ymm7 ; AVX2-NEXT: vpcmpgtd %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vblendvps %ymm0, %ymm7, %ymm4, %ymm0 +; AVX2-NEXT: vpsrad $31, %ymm4, %ymm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vpxor %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm4, %ymm0 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm2 -; AVX2-NEXT: vblendvps %ymm2, %ymm5, %ymm6, %ymm4 ; AVX2-NEXT: vpcmpgtd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpxor %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vblendvps %ymm1, %ymm4, %ymm2, %ymm1 +; AVX2-NEXT: vpsrad $31, %ymm2, %ymm3 +; AVX2-NEXT: vpxor %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vblendvps %ymm1, %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v16i32: @@ -1166,10 +1106,8 @@ ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 ; AVX512-NEXT: kxorw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtd %zmm1, %zmm2, %k2 -; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX512-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0 {%k2} -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vpsrad $31, %zmm1, %zmm0 +; AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-NEXT: retq %z = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %x, <16 x i32> %y) @@ -1194,17 +1132,14 @@ ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -1225,17 +1160,14 @@ ; SSSE3-NEXT: por %xmm2, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pxor %xmm3, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 ; SSSE3-NEXT: pandn %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 -; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSSE3-NEXT: por %xmm2, %xmm0 -; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: 
pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 ; SSSE3-NEXT: por %xmm1, %xmm0 ; SSSE3-NEXT: retq ; @@ -1243,21 +1175,22 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pxor %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm3 ; SSE41-NEXT: paddq %xmm1, %xmm2 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm3 -; SSE41-NEXT: por %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm3 -; SSE41-NEXT: movapd {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: por %xmm3, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -1265,31 +1198,34 @@ ; AVX1-LABEL: v2i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vblendvpd %xmm0, %xmm3, %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; AVX2-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vblendvpd %xmm0, %xmm3, %xmm2, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: v2i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm2 -; AVX512F-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; AVX512F-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; AVX512F-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpxor %xmm0, %xmm1, %xmm0 -; AVX512F-NEXT: vblendvpd %xmm0, %xmm3, %xmm2, %xmm0 +; AVX512F-NEXT: vpsraq $63, %zmm2, %zmm1 +; AVX512F-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i64: @@ -1299,10 +1235,8 @@ ; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 ; AVX512BW-NEXT: kxorw %k1, %k0, %k1 -; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm2, %k2 -; 
AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] -; AVX512BW-NEXT: vmovdqa64 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} +; AVX512BW-NEXT: vpsraq $63, %xmm1, %xmm0 +; AVX512BW-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512BW-NEXT: retq %z = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %x, <2 x i64> %y) @@ -1331,42 +1265,35 @@ ; SSE2-NEXT: pxor %xmm7, %xmm7 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 ; SSE2-NEXT: pxor %xmm6, %xmm7 -; SSE2-NEXT: movdqa %xmm7, %xmm5 -; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm7, %xmm6 +; SSE2-NEXT: pandn %xmm0, %xmm6 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775807,9223372036854775807] -; SSE2-NEXT: pand %xmm9, %xmm6 -; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 ; SSE2-NEXT: paddq %xmm3, %xmm1 ; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm2 -; SSE2-NEXT: pand %xmm9, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: retq ; @@ -1391,82 +1318,77 @@ ; SSSE3-NEXT: pxor %xmm7, %xmm7 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 ; SSSE3-NEXT: pxor %xmm6, %xmm7 -; SSSE3-NEXT: movdqa %xmm7, %xmm5 -; SSSE3-NEXT: pandn %xmm0, %xmm5 +; SSSE3-NEXT: movdqa %xmm7, %xmm6 +; SSSE3-NEXT: pandn %xmm0, %xmm6 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775807,9223372036854775807] -; SSSE3-NEXT: pand %xmm9, %xmm6 -; SSSE3-NEXT: por %xmm6, %xmm0 +; SSSE3-NEXT: pxor %xmm8, %xmm0 ; SSSE3-NEXT: pand %xmm7, %xmm0 -; 
SSSE3-NEXT: por %xmm5, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: por %xmm6, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm6 +; SSSE3-NEXT: pxor %xmm4, %xmm6 ; SSSE3-NEXT: paddq %xmm3, %xmm1 ; SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSSE3-NEXT: movdqa %xmm6, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] ; SSSE3-NEXT: por %xmm4, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 -; SSSE3-NEXT: pxor %xmm5, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm5, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm3 ; SSSE3-NEXT: pandn %xmm1, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2 -; SSSE3-NEXT: pand %xmm9, %xmm2 -; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSSE3-NEXT: pand %xmm2, %xmm1 ; SSSE3-NEXT: por %xmm3, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v4i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: paddq %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pxor %xmm6, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 ; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm5 -; SSE41-NEXT: por %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm2, %xmm5 -; SSE41-NEXT: movapd {{.*#+}} xmm8 = [9223372036854775807,9223372036854775807] -; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movapd %xmm7, %xmm2 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 -; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm6 +; SSE41-NEXT: por %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm2 +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: pxor %xmm6, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: paddq %xmm3, %xmm1 -; SSE41-NEXT: pxor %xmm1, %xmm6 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm2 -; SSE41-NEXT: por %xmm0, %xmm2 -; SSE41-NEXT: pxor %xmm3, %xmm2 -; 
SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: pxor %xmm1, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm5 +; SSE41-NEXT: por %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE41-NEXT: pxor %xmm6, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movapd %xmm4, %xmm0 ; SSE41-NEXT: retq ; @@ -1477,35 +1399,39 @@ ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm4 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm5 -; AVX1-NEXT: vmovapd {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX1-NEXT: vblendvpd %ymm5, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: vxorpd %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: vblendvpd %ymm0, %ymm6, %ymm5, %ymm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vblendvpd %ymm0, %ymm1, %ymm5, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807] -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: v4i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm3 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807] -; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX512F-NEXT: vblendvpd %ymm2, %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpxor %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vblendvpd %ymm0, %ymm3, %ymm2, %ymm0 +; AVX512F-NEXT: vpsraq $63, %zmm2, %zmm1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX512F-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v4i64: @@ -1515,10 +1441,8 @@ ; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm1 ; AVX512BW-NEXT: vpcmpgtq %ymm1, %ymm0, %k1 ; AVX512BW-NEXT: kxorw %k1, 
%k0, %k1 -; AVX512BW-NEXT: vpcmpgtq %ymm1, %ymm2, %k2 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX512BW-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} +; AVX512BW-NEXT: vpsraq $63, %ymm1, %ymm0 +; AVX512BW-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm1 {%k1} ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512BW-NEXT: retq %z = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> %x, <4 x i64> %y) @@ -1544,59 +1468,52 @@ ; SSE2-NEXT: por %xmm9, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] ; SSE2-NEXT: pxor %xmm11, %xmm11 -; SSE2-NEXT: pxor %xmm13, %xmm13 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm13 -; SSE2-NEXT: pxor %xmm10, %xmm13 -; SSE2-NEXT: movdqa %xmm13, %xmm12 -; SSE2-NEXT: pandn %xmm0, %xmm12 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] -; SSE2-NEXT: pandn %xmm9, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775807,9223372036854775807] -; SSE2-NEXT: pand %xmm10, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm13, %xmm0 -; SSE2-NEXT: por %xmm12, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm12 -; SSE2-NEXT: pxor %xmm8, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 +; SSE2-NEXT: pxor %xmm10, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm10 +; SSE2-NEXT: pandn %xmm0, %xmm10 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] +; SSE2-NEXT: pxor %xmm9, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: pxor %xmm8, %xmm10 ; SSE2-NEXT: paddq %xmm5, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm12, %xmm13 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm12, %xmm4 +; SSE2-NEXT: movdqa %xmm10, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm14, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm13[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm12 +; SSE2-NEXT: pand %xmm13, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] ; SSE2-NEXT: pxor %xmm5, %xmm5 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 -; SSE2-NEXT: pxor %xmm12, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm13 -; SSE2-NEXT: pandn %xmm1, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3] -; SSE2-NEXT: pandn %xmm9, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm12, %xmm4 -; SSE2-NEXT: pand %xmm10, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm10, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm9, %xmm1 ; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: por %xmm13, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm12 -; SSE2-NEXT: pxor %xmm8, %xmm12 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm10 +; SSE2-NEXT: pxor %xmm8, %xmm10 ; SSE2-NEXT: paddq %xmm6, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: movdqa %xmm12, %xmm4 
+; SSE2-NEXT: movdqa %xmm10, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm12, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm5 +; SSE2-NEXT: pand %xmm12, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm5, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] @@ -1605,12 +1522,9 @@ ; SSE2-NEXT: pxor %xmm4, %xmm6 ; SSE2-NEXT: movdqa %xmm6, %xmm4 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,3,3] -; SSE2-NEXT: pandn %xmm9, %xmm2 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm12, %xmm5 -; SSE2-NEXT: pand %xmm10, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pxor %xmm9, %xmm2 ; SSE2-NEXT: pand %xmm6, %xmm2 ; SSE2-NEXT: por %xmm4, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm4 @@ -1626,17 +1540,14 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm11 +; SSE2-NEXT: pxor %xmm5, %xmm11 +; SSE2-NEXT: movdqa %xmm11, %xmm4 ; SSE2-NEXT: pandn %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-NEXT: pandn %xmm9, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm11 -; SSE2-NEXT: pand %xmm10, %xmm11 -; SSE2-NEXT: por %xmm11, %xmm3 -; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pxor %xmm9, %xmm3 +; SSE2-NEXT: pand %xmm11, %xmm3 ; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: retq ; @@ -1658,59 +1569,52 @@ ; SSSE3-NEXT: por %xmm9, %xmm10 ; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] ; SSSE3-NEXT: pxor %xmm11, %xmm11 -; SSSE3-NEXT: pxor %xmm13, %xmm13 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm13 -; SSSE3-NEXT: pxor %xmm10, %xmm13 -; SSSE3-NEXT: movdqa %xmm13, %xmm12 -; SSSE3-NEXT: pandn %xmm0, %xmm12 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] -; SSSE3-NEXT: pandn %xmm9, %xmm0 ; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775807,9223372036854775807] -; SSSE3-NEXT: pand %xmm10, %xmm4 -; SSSE3-NEXT: por %xmm4, %xmm0 -; SSSE3-NEXT: pand %xmm13, %xmm0 -; SSSE3-NEXT: por %xmm12, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm12 -; SSSE3-NEXT: pxor %xmm8, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm4 +; SSSE3-NEXT: pxor %xmm10, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm10 +; SSSE3-NEXT: pandn %xmm0, %xmm10 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] +; SSSE3-NEXT: pxor %xmm9, %xmm0 +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: por %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm10 +; SSSE3-NEXT: pxor %xmm8, %xmm10 ; SSSE3-NEXT: paddq %xmm5, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSSE3-NEXT: pxor %xmm8, %xmm4 -; SSSE3-NEXT: movdqa %xmm12, %xmm13 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm13 -; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm12, %xmm4 +; SSSE3-NEXT: movdqa %xmm10, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm12 +; SSSE3-NEXT: pshufd 
{{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm14, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm13[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm12 +; SSSE3-NEXT: pand %xmm13, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm10 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] ; SSSE3-NEXT: pxor %xmm5, %xmm5 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5 -; SSSE3-NEXT: pxor %xmm12, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm13 -; SSSE3-NEXT: pandn %xmm1, %xmm13 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3] -; SSSE3-NEXT: pandn %xmm9, %xmm1 -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm12, %xmm4 -; SSSE3-NEXT: pand %xmm10, %xmm4 -; SSSE3-NEXT: por %xmm4, %xmm1 +; SSSE3-NEXT: pxor %xmm10, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: pandn %xmm1, %xmm4 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pxor %xmm9, %xmm1 ; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: por %xmm13, %xmm1 -; SSSE3-NEXT: movdqa %xmm2, %xmm12 -; SSSE3-NEXT: pxor %xmm8, %xmm12 +; SSSE3-NEXT: por %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm10 +; SSSE3-NEXT: pxor %xmm8, %xmm10 ; SSSE3-NEXT: paddq %xmm6, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, %xmm5 ; SSSE3-NEXT: pxor %xmm8, %xmm5 -; SSSE3-NEXT: movdqa %xmm12, %xmm4 +; SSSE3-NEXT: movdqa %xmm10, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm12, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm13, %xmm5 +; SSSE3-NEXT: pand %xmm12, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm5, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] @@ -1719,12 +1623,9 @@ ; SSSE3-NEXT: pxor %xmm4, %xmm6 ; SSSE3-NEXT: movdqa %xmm6, %xmm4 ; SSSE3-NEXT: pandn %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,3,3] -; SSSE3-NEXT: pandn %xmm9, %xmm2 -; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm12, %xmm5 -; SSSE3-NEXT: pand %xmm10, %xmm5 -; SSSE3-NEXT: por %xmm5, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pxor %xmm9, %xmm2 ; SSSE3-NEXT: pand %xmm6, %xmm2 ; SSSE3-NEXT: por %xmm4, %xmm2 ; SSSE3-NEXT: movdqa %xmm3, %xmm4 @@ -1740,91 +1641,92 @@ ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSSE3-NEXT: por %xmm4, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: pxor %xmm5, %xmm6 -; SSSE3-NEXT: movdqa %xmm6, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm11 +; SSSE3-NEXT: pxor %xmm5, %xmm11 +; SSSE3-NEXT: movdqa %xmm11, %xmm4 ; SSSE3-NEXT: pandn %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSSE3-NEXT: pandn %xmm9, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm11 -; SSSE3-NEXT: pand %xmm10, %xmm11 -; SSSE3-NEXT: por %xmm11, %xmm3 -; SSSE3-NEXT: pand %xmm6, %xmm3 +; SSSE3-NEXT: psrad $31, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: pxor %xmm9, %xmm3 +; SSSE3-NEXT: pand %xmm11, %xmm3 ; SSSE3-NEXT: por %xmm4, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v8i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = 
[2147483648,2147483648] +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: paddq %xmm4, %xmm8 -; SSE41-NEXT: movdqa %xmm8, %xmm9 -; SSE41-NEXT: pxor %xmm10, %xmm9 +; SSE41-NEXT: movdqa %xmm8, %xmm10 +; SSE41-NEXT: pxor %xmm9, %xmm10 ; SSE41-NEXT: movdqa %xmm0, %xmm11 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm11 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm11, %xmm9 -; SSE41-NEXT: por %xmm0, %xmm9 -; SSE41-NEXT: pxor %xmm4, %xmm9 -; SSE41-NEXT: movapd {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807] -; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movapd %xmm11, %xmm4 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm4 -; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm10, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm11, %xmm10 +; SSE41-NEXT: por %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm4 +; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: pxor %xmm10, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: paddq %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm9 +; SSE41-NEXT: pxor %xmm9, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm11 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm11 ; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm4 +; SSE41-NEXT: pand %xmm11, %xmm4 ; SSE41-NEXT: por %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm5, %xmm4 -; SSE41-NEXT: movapd %xmm11, %xmm5 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE41-NEXT: pxor %xmm10, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: paddq %xmm6, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 +; SSE41-NEXT: pxor %xmm9, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm4 ; SSE41-NEXT: por %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm6, %xmm4 -; SSE41-NEXT: movapd %xmm11, %xmm5 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE41-NEXT: pxor %xmm10, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: paddq %xmm7, %xmm3 -; SSE41-NEXT: pxor 
%xmm3, %xmm10 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm4 -; SSE41-NEXT: por %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm7, %xmm4 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm11 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm3 +; SSE41-NEXT: pxor %xmm3, %xmm9 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: por %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE41-NEXT: pxor %xmm10, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 ; SSE41-NEXT: movapd %xmm8, %xmm0 ; SSE41-NEXT: retq ; @@ -1835,41 +1737,49 @@ ; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm6 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm7 -; AVX1-NEXT: vmovapd {{.*#+}} ymm8 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807] -; AVX1-NEXT: vmovapd {{.*#+}} ymm9 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX1-NEXT: vblendvpd %ymm7, %ymm8, %ymm9, %ymm10 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; AVX1-NEXT: vxorpd %ymm0, %ymm2, %ymm0 -; AVX1-NEXT: vblendvpd %ymm0, %ymm10, %ymm7, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm6 -; AVX1-NEXT: vblendvpd %ymm6, %ymm8, %ymm9, %ymm7 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-NEXT: vmovapd {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX1-NEXT: vxorpd %ymm5, %ymm4, %ymm4 +; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm7, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 +; AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm4 +; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm8 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-NEXT: vxorpd %ymm1, %ymm3, %ymm1 -; AVX1-NEXT: vblendvpd %ymm1, %ymm7, %ymm6, %ymm1 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vxorpd %ymm5, %ymm2, %ymm2 +; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm8, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v8i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm4 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm5 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807] -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = 
[9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vblendvpd %ymm4, %ymm5, %ymm6, %ymm7 ; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vblendvpd %ymm0, %ymm7, %ymm4, %ymm0 -; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm2 -; AVX2-NEXT: vblendvpd %ymm2, %ymm5, %ymm6, %ymm4 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm5 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 +; AVX2-NEXT: vblendvpd %ymm0, %ymm5, %ymm4, %ymm0 +; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm4 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpxor %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vblendvpd %ymm1, %ymm4, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2 +; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm4, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v8i64: @@ -1879,10 +1789,8 @@ ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; AVX512-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512-NEXT: kxorw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtq %zmm1, %zmm2, %k2 -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX512-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vpsraq $63, %zmm1, %zmm0 +; AVX512-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-NEXT: retq %z = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> %x, <8 x i64> %y) @@ -1900,31 +1808,25 @@ ; SSE-NEXT: movq %r8, %rbx ; SSE-NEXT: sarq $63, %rbx ; SSE-NEXT: testb %r10b, %r10b -; SSE-NEXT: cmoveq %rcx, %rbx -; SSE-NEXT: xorl %ecx, %ecx -; SSE-NEXT: testq %r8, %r8 -; SSE-NEXT: setns %cl -; SSE-NEXT: movabsq $9223372036854775807, %r11 # imm = 0x7FFFFFFFFFFFFFFF -; SSE-NEXT: addq %r11, %rcx +; SSE-NEXT: cmovneq %rbx, %rcx +; SSE-NEXT: movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000 +; SSE-NEXT: xorq %r11, %rbx ; SSE-NEXT: testb %r10b, %r10b -; SSE-NEXT: cmoveq %r8, %rcx +; SSE-NEXT: cmoveq %r8, %rbx ; SSE-NEXT: addq %r9, %rsi ; SSE-NEXT: adcq {{[0-9]+}}(%rsp), %rdx ; SSE-NEXT: seto %r8b ; SSE-NEXT: movq %rdx, %rdi ; SSE-NEXT: sarq $63, %rdi ; SSE-NEXT: testb %r8b, %r8b -; SSE-NEXT: cmoveq %rsi, %rdi -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: testq %rdx, %rdx -; SSE-NEXT: setns %sil -; SSE-NEXT: addq %r11, %rsi +; SSE-NEXT: cmovneq %rdi, %rsi +; SSE-NEXT: xorq %r11, %rdi ; SSE-NEXT: testb %r8b, %r8b -; SSE-NEXT: cmoveq %rdx, %rsi -; SSE-NEXT: movq %rbx, 16(%rax) -; SSE-NEXT: movq %rdi, (%rax) -; SSE-NEXT: movq %rcx, 24(%rax) -; SSE-NEXT: movq %rsi, 8(%rax) +; SSE-NEXT: cmoveq %rdx, %rdi +; SSE-NEXT: movq %rcx, 16(%rax) +; SSE-NEXT: movq %rsi, (%rax) +; SSE-NEXT: movq %rbx, 24(%rax) +; SSE-NEXT: movq %rdi, 8(%rax) ; SSE-NEXT: popq %rbx ; SSE-NEXT: retq ; @@ -1938,31 +1840,25 @@ ; AVX-NEXT: movq %r8, %rbx ; AVX-NEXT: sarq $63, %rbx ; AVX-NEXT: testb %r10b, %r10b -; AVX-NEXT: cmoveq %rcx, %rbx -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: testq %r8, %r8 -; AVX-NEXT: setns %cl -; AVX-NEXT: movabsq $9223372036854775807, %r11 # imm = 0x7FFFFFFFFFFFFFFF -; AVX-NEXT: addq %r11, %rcx +; AVX-NEXT: cmovneq %rbx, %rcx +; AVX-NEXT: movabsq $-9223372036854775808, %r11 # imm = 
0x8000000000000000 +; AVX-NEXT: xorq %r11, %rbx ; AVX-NEXT: testb %r10b, %r10b -; AVX-NEXT: cmoveq %r8, %rcx +; AVX-NEXT: cmoveq %r8, %rbx ; AVX-NEXT: addq %r9, %rsi ; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %rdx ; AVX-NEXT: seto %r8b ; AVX-NEXT: movq %rdx, %rdi ; AVX-NEXT: sarq $63, %rdi ; AVX-NEXT: testb %r8b, %r8b -; AVX-NEXT: cmoveq %rsi, %rdi -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: testq %rdx, %rdx -; AVX-NEXT: setns %sil -; AVX-NEXT: addq %r11, %rsi +; AVX-NEXT: cmovneq %rdi, %rsi +; AVX-NEXT: xorq %r11, %rdi ; AVX-NEXT: testb %r8b, %r8b -; AVX-NEXT: cmoveq %rdx, %rsi -; AVX-NEXT: movq %rbx, 16(%rax) -; AVX-NEXT: movq %rdi, (%rax) -; AVX-NEXT: movq %rcx, 24(%rax) -; AVX-NEXT: movq %rsi, 8(%rax) +; AVX-NEXT: cmoveq %rdx, %rdi +; AVX-NEXT: movq %rcx, 16(%rax) +; AVX-NEXT: movq %rsi, (%rax) +; AVX-NEXT: movq %rbx, 24(%rax) +; AVX-NEXT: movq %rdi, 8(%rax) ; AVX-NEXT: popq %rbx ; AVX-NEXT: retq %z = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %x, <2 x i128> %y) Index: llvm/test/CodeGen/X86/ssub_sat.ll =================================================================== --- llvm/test/CodeGen/X86/ssub_sat.ll +++ llvm/test/CodeGen/X86/ssub_sat.ll @@ -39,23 +39,18 @@ ; X86-LABEL: func2: ; X86: # %bb.0: ; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: seto %bl -; X86-NEXT: movl %esi, %eax -; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: sarl $31, %edx ; X86-NEXT: testb %bl, %bl -; X86-NEXT: cmovel %ecx, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: testl %esi, %esi -; X86-NEXT: setns %dl -; X86-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF +; X86-NEXT: cmovnel %edx, %eax +; X86-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 ; X86-NEXT: testb %bl, %bl -; X86-NEXT: cmovel %esi, %edx -; X86-NEXT: popl %esi +; X86-NEXT: cmovel %ecx, %edx ; X86-NEXT: popl %ebx ; X86-NEXT: retl ; @@ -212,20 +207,19 @@ ; ; X64-LABEL: vec: ; X64: # %bb.0: -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: movdqa %xmm0, %xmm3 -; X64-NEXT: psubd %xmm1, %xmm3 -; X64-NEXT: pcmpgtd %xmm2, %xmm1 -; X64-NEXT: pcmpgtd %xmm3, %xmm0 +; X64-NEXT: pxor %xmm3, %xmm3 +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: psubd %xmm1, %xmm2 +; X64-NEXT: pcmpgtd %xmm3, %xmm1 +; X64-NEXT: pcmpgtd %xmm2, %xmm0 ; X64-NEXT: pxor %xmm1, %xmm0 -; X64-NEXT: movdqa %xmm3, %xmm1 -; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; X64-NEXT: pcmpgtd %xmm3, %xmm2 -; X64-NEXT: psrld $1, %xmm2 -; X64-NEXT: por %xmm2, %xmm1 -; X64-NEXT: pand %xmm0, %xmm1 -; X64-NEXT: pandn %xmm3, %xmm0 -; X64-NEXT: por %xmm1, %xmm0 +; X64-NEXT: movdqa %xmm0, %xmm1 +; X64-NEXT: pandn %xmm2, %xmm1 +; X64-NEXT: psrad $31, %xmm2 +; X64-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; X64-NEXT: pand %xmm0, %xmm2 +; X64-NEXT: por %xmm1, %xmm2 +; X64-NEXT: movdqa %xmm2, %xmm0 ; X64-NEXT: retq %tmp = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y) ret <4 x i32> %tmp Index: llvm/test/CodeGen/X86/ssub_sat_plus.ll =================================================================== --- llvm/test/CodeGen/X86/ssub_sat_plus.ll +++ llvm/test/CodeGen/X86/ssub_sat_plus.ll @@ -41,23 +41,18 @@ ; X86-LABEL: func64: ; X86: # %bb.0: ; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; 
X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: seto %bl -; X86-NEXT: movl %esi, %eax -; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: sarl $31, %edx ; X86-NEXT: testb %bl, %bl -; X86-NEXT: cmovel %ecx, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: testl %esi, %esi -; X86-NEXT: setns %dl -; X86-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF +; X86-NEXT: cmovnel %edx, %eax +; X86-NEXT: xorl $-2147483648, %edx # imm = 0x80000000 ; X86-NEXT: testb %bl, %bl -; X86-NEXT: cmovel %esi, %edx -; X86-NEXT: popl %esi +; X86-NEXT: cmovel %ecx, %edx ; X86-NEXT: popl %ebx ; X86-NEXT: retl ; Index: llvm/test/CodeGen/X86/ssub_sat_vec.ll =================================================================== --- llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -612,55 +612,51 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; SSE2-LABEL: v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psubd %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psubd %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm2, %xmm1 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v2i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: psubd %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: psubd %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 ; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 -; SSSE3-NEXT: psrld $1, %xmm2 -; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: pand %xmm0, %xmm1 -; SSSE3-NEXT: pandn %xmm3, %xmm0 -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pandn %xmm2, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v2i32: ; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psubd %xmm1, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE41-NEXT: pxor %xmm1, %xmm2 -; SSE41-NEXT: movaps {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; 
SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: movaps %xmm3, %xmm0 +; SSE41-NEXT: psubd %xmm1, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v2i32: @@ -670,8 +666,8 @@ ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; AVX1-NEXT: vblendvps %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -682,9 +678,9 @@ ; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vblendvps %xmm1, %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -695,9 +691,9 @@ ; AVX512F-NEXT: vpsubd %xmm1, %xmm0, %xmm1 ; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] -; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX512F-NEXT: vblendvps %xmm1, %xmm2, %xmm3, %xmm2 +; AVX512F-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX512F-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX512F-NEXT: retq ; @@ -708,10 +704,8 @@ ; AVX512BW-NEXT: vpsubd %xmm1, %xmm0, %xmm1 ; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 ; AVX512BW-NEXT: kxorw %k1, %k0, %k1 -; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm2, %k2 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] -; AVX512BW-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k2} -; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; AVX512BW-NEXT: vpsrad $31, %xmm1, %xmm0 +; AVX512BW-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512BW-NEXT: retq %z = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %x, <2 x i32> %y) @@ -721,55 +715,51 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; SSE2-LABEL: v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psubd %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psubd %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm0, 
%xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm2, %xmm1 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v4i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: psubd %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: psubd %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 ; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 -; SSSE3-NEXT: psrld $1, %xmm2 -; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: pand %xmm0, %xmm1 -; SSSE3-NEXT: pandn %xmm3, %xmm0 -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pandn %xmm2, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v4i32: ; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psubd %xmm1, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE41-NEXT: pxor %xmm1, %xmm2 -; SSE41-NEXT: movaps {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: movaps %xmm3, %xmm0 +; SSE41-NEXT: psubd %xmm1, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v4i32: @@ -779,8 +769,8 @@ ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; AVX1-NEXT: vblendvps %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -791,9 +781,9 @@ ; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vblendvps %xmm1, %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -804,9 +794,9 @@ ; AVX512F-NEXT: vpsubd %xmm1, %xmm0, %xmm1 ; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpxor %xmm0, 
%xmm2, %xmm0 -; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] -; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX512F-NEXT: vblendvps %xmm1, %xmm2, %xmm3, %xmm2 +; AVX512F-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX512F-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX512F-NEXT: retq ; @@ -817,10 +807,8 @@ ; AVX512BW-NEXT: vpsubd %xmm1, %xmm0, %xmm1 ; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 ; AVX512BW-NEXT: kxorw %k1, %k0, %k1 -; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm2, %k2 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] -; AVX512BW-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k2} -; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; AVX512BW-NEXT: vpsrad $31, %xmm1, %xmm0 +; AVX512BW-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512BW-NEXT: retq %z = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y) @@ -830,97 +818,87 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; SSE2-LABEL: v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: psubd %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: por %xmm7, %xmm6 -; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: pandn %xmm5, %xmm0 -; SSE2-NEXT: por %xmm6, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: psubd %xmm3, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: psubd %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: pxor %xmm6, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psubd %xmm3, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pandn %xmm5, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pxor %xmm6, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v8i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: psubd %xmm2, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0 -; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSSE3-NEXT: pandn %xmm2, %xmm6 -; SSSE3-NEXT: pxor %xmm7, %xmm7 -; 
SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 -; SSSE3-NEXT: psrld $1, %xmm7 -; SSSE3-NEXT: por %xmm7, %xmm6 -; SSSE3-NEXT: pand %xmm0, %xmm6 -; SSSE3-NEXT: pandn %xmm5, %xmm0 -; SSSE3-NEXT: por %xmm6, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSSE3-NEXT: psubd %xmm3, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: psubd %xmm2, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pxor %xmm2, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm2 +; SSSE3-NEXT: pandn %xmm0, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: pxor %xmm6, %xmm0 +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: psubd %xmm3, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 ; SSSE3-NEXT: pxor %xmm3, %xmm1 -; SSSE3-NEXT: movdqa %xmm5, %xmm3 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 ; SSSE3-NEXT: pandn %xmm2, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 -; SSSE3-NEXT: psrld $1, %xmm4 -; SSSE3-NEXT: por %xmm4, %xmm3 -; SSSE3-NEXT: pand %xmm1, %xmm3 -; SSSE3-NEXT: pandn %xmm5, %xmm1 -; SSSE3-NEXT: por %xmm3, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pxor %xmm6, %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v8i32: ; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm1, %xmm5 +; SSE41-NEXT: pxor %xmm6, %xmm6 ; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm8, %xmm8 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: psubd %xmm2, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE41-NEXT: pxor %xmm2, %xmm4 -; SSE41-NEXT: movaps {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647] -; SSE41-NEXT: movaps {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] -; SSE41-NEXT: movaps %xmm6, %xmm2 +; SSE41-NEXT: psubd %xmm2, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE41-NEXT: pxor %xmm2, %xmm1 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm5, %xmm1 +; SSE41-NEXT: psubd %xmm3, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE41-NEXT: pxor %xmm3, %xmm5 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: psrad $31, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm7, %xmm2 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm5 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psubd %xmm3, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm3, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm7, %xmm6 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm6, %xmm2 -; SSE41-NEXT: movaps %xmm5, %xmm0 -; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm4, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v8i32: @@ -937,10 +915,12 @@ ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, 
%ymm1, %ymm3 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX1-NEXT: vblendvps %ymm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 +; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vblendvps %ymm0, %ymm1, %ymm3, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v8i32: @@ -950,9 +930,9 @@ ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vblendvps %ymm1, %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -963,9 +943,9 @@ ; AVX512F-NEXT: vpsubd %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpxor %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vbroadcastss {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] -; AVX512F-NEXT: vbroadcastss {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX512F-NEXT: vblendvps %ymm1, %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsrad $31, %ymm1, %ymm2 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX512F-NEXT: vpxor %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; @@ -976,10 +956,8 @@ ; AVX512BW-NEXT: vpsubd %ymm1, %ymm0, %ymm1 ; AVX512BW-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 ; AVX512BW-NEXT: kxorw %k1, %k0, %k1 -; AVX512BW-NEXT: vpcmpgtd %ymm1, %ymm2, %k2 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX512BW-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 {%k2} -; AVX512BW-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; AVX512BW-NEXT: vpsrad $31, %ymm1, %ymm0 +; AVX512BW-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1 {%k1} ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512BW-NEXT: retq %z = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> %x, <8 x i32> %y) @@ -989,176 +967,157 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { ; SSE2-LABEL: v16i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: psubd %xmm4, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm9, %xmm11 -; SSE2-NEXT: pandn %xmm10, %xmm11 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm11 -; SSE2-NEXT: pand %xmm0, %xmm11 -; SSE2-NEXT: pandn %xmm9, %xmm0 -; SSE2-NEXT: por %xmm11, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: psubd %xmm5, %xmm9 -; 
SSE2-NEXT: pcmpgtd %xmm8, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 -; SSE2-NEXT: pxor %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm9, %xmm5 -; SSE2-NEXT: pandn %xmm10, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: psubd %xmm4, %xmm0 ; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: pandn %xmm9, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: psubd %xmm5, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm8 +; SSE2-NEXT: pxor %xmm5, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm4 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pxor %xmm10, %xmm1 +; SSE2-NEXT: pand %xmm8, %xmm1 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: psubd %xmm6, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 ; SSE2-NEXT: pxor %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm10, %xmm5 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: por %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm2, %xmm5 -; SSE2-NEXT: pandn %xmm4, %xmm2 -; SSE2-NEXT: por %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: psubd %xmm7, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: pxor %xmm10, %xmm4 +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: psubd %xmm7, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm3 ; SSE2-NEXT: pxor %xmm7, %xmm3 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm10, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 -; SSE2-NEXT: psrld $1, %xmm8 -; SSE2-NEXT: por %xmm8, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm5, %xmm2 +; SSE2-NEXT: psrad $31, %xmm5 +; SSE2-NEXT: pxor %xmm10, %xmm5 ; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: pandn %xmm4, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm5, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v16i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: movdqa %xmm0, %xmm9 -; SSSE3-NEXT: psubd %xmm4, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0 -; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm9, %xmm11 -; SSSE3-NEXT: pandn %xmm10, %xmm11 -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm4 -; SSSE3-NEXT: psrld $1, %xmm4 -; SSSE3-NEXT: por %xmm4, %xmm11 -; SSSE3-NEXT: pand %xmm0, %xmm11 -; SSSE3-NEXT: pandn %xmm9, %xmm0 -; SSSE3-NEXT: por %xmm11, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm9 -; SSSE3-NEXT: psubd %xmm5, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 -; SSSE3-NEXT: pxor %xmm5, %xmm1 -; 
SSSE3-NEXT: movdqa %xmm9, %xmm5 -; SSSE3-NEXT: pandn %xmm10, %xmm5 -; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: movdqa %xmm1, %xmm8 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: psubd %xmm4, %xmm0 ; SSSE3-NEXT: pcmpgtd %xmm9, %xmm4 -; SSSE3-NEXT: psrld $1, %xmm4 -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: pand %xmm1, %xmm5 -; SSSE3-NEXT: pandn %xmm9, %xmm1 -; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pandn %xmm0, %xmm4 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm8, %xmm1 +; SSSE3-NEXT: psubd %xmm5, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm8 +; SSSE3-NEXT: pxor %xmm5, %xmm8 +; SSSE3-NEXT: movdqa %xmm8, %xmm4 +; SSSE3-NEXT: pandn %xmm1, %xmm4 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pxor %xmm10, %xmm1 +; SSSE3-NEXT: pand %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm4, %xmm1 ; SSSE3-NEXT: movdqa %xmm2, %xmm4 ; SSSE3-NEXT: psubd %xmm6, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2 ; SSSE3-NEXT: pxor %xmm6, %xmm2 -; SSSE3-NEXT: movdqa %xmm4, %xmm5 -; SSSE3-NEXT: pandn %xmm10, %xmm5 -; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: psrld $1, %xmm6 -; SSSE3-NEXT: por %xmm6, %xmm5 -; SSSE3-NEXT: pand %xmm2, %xmm5 -; SSSE3-NEXT: pandn %xmm4, %xmm2 -; SSSE3-NEXT: por %xmm5, %xmm2 -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: psubd %xmm7, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pandn %xmm4, %xmm5 +; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: pxor %xmm10, %xmm4 +; SSSE3-NEXT: pand %xmm2, %xmm4 +; SSSE3-NEXT: por %xmm5, %xmm4 +; SSSE3-NEXT: movdqa %xmm3, %xmm5 +; SSSE3-NEXT: psubd %xmm7, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm3 ; SSSE3-NEXT: pxor %xmm7, %xmm3 -; SSSE3-NEXT: movdqa %xmm4, %xmm5 -; SSSE3-NEXT: pandn %xmm10, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm8 -; SSSE3-NEXT: psrld $1, %xmm8 -; SSSE3-NEXT: por %xmm8, %xmm5 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: pandn %xmm5, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm5 +; SSSE3-NEXT: pxor %xmm10, %xmm5 ; SSSE3-NEXT: pand %xmm3, %xmm5 -; SSSE3-NEXT: pandn %xmm4, %xmm3 -; SSSE3-NEXT: por %xmm5, %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm5 +; SSSE3-NEXT: movdqa %xmm4, %xmm2 +; SSSE3-NEXT: movdqa %xmm5, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v16i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm3, %xmm8 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pxor %xmm10, %xmm10 +; SSE41-NEXT: movdqa %xmm2, %xmm10 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pxor %xmm11, %xmm11 ; SSE41-NEXT: movdqa %xmm0, %xmm9 ; SSE41-NEXT: psubd %xmm4, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm3 -; SSE41-NEXT: pxor %xmm4, %xmm3 -; SSE41-NEXT: movaps {{.*#+}} xmm12 = [2147483647,2147483647,2147483647,2147483647] -; SSE41-NEXT: movaps {{.*#+}} xmm11 = [2147483648,2147483648,2147483648,2147483648] -; SSE41-NEXT: movaps %xmm11, %xmm4 -; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm12, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm11, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm9, 
%xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE41-NEXT: pxor %xmm4, %xmm1 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm9 +; SSE41-NEXT: movdqa %xmm3, %xmm1 +; SSE41-NEXT: psubd %xmm5, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm11, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE41-NEXT: pxor %xmm5, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: pxor %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm9 -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: psubd %xmm5, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE41-NEXT: pxor %xmm5, %xmm1 -; SSE41-NEXT: movaps %xmm11, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm12, %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm4 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm10, %xmm2 +; SSE41-NEXT: psubd %xmm6, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm11, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm10 +; SSE41-NEXT: pxor %xmm6, %xmm10 ; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psubd %xmm6, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE41-NEXT: pxor %xmm6, %xmm2 -; SSE41-NEXT: movaps %xmm11, %xmm1 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm12, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: movdqa %xmm8, %xmm5 -; SSE41-NEXT: psubd %xmm7, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE41-NEXT: psrad $31, %xmm3 +; SSE41-NEXT: pxor %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm10, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm8, %xmm3 +; SSE41-NEXT: psubd %xmm7, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm11, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm8 ; SSE41-NEXT: pxor %xmm7, %xmm8 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm12, %xmm11 +; SSE41-NEXT: movdqa %xmm3, %xmm5 +; SSE41-NEXT: psrad $31, %xmm5 +; SSE41-NEXT: pxor %xmm4, %xmm5 ; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm5 +; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm3 ; SSE41-NEXT: movaps %xmm9, %xmm0 -; SSE41-NEXT: movaps %xmm4, %xmm1 -; SSE41-NEXT: movaps %xmm3, %xmm2 -; SSE41-NEXT: movaps %xmm5, %xmm3 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v16i32: @@ -1175,25 +1134,30 @@ ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm0, %ymm6, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm6 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 +; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] -; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX1-NEXT: vblendvps %ymm2, %ymm4, %ymm6, %ymm7 -; AVX1-NEXT: vblendvps %ymm0, %ymm7, %ymm2, %ymm0 +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm6, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm5, %xmm2, %xmm7 +; AVX1-NEXT: vpcmpgtd %xmm5, %xmm2, %xmm6 ; AVX1-NEXT: vpcmpgtd %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: 
vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpsubd %xmm2, %xmm7, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm7, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 +; AVX1-NEXT: vpsubd %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm6, %xmm6 ; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm3 ; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-NEXT: vxorps %ymm1, %ymm5, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm5 +; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vblendvps %ymm2, %ymm4, %ymm6, %ymm3 -; AVX1-NEXT: vblendvps %ymm1, %ymm3, %ymm2, %ymm1 +; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm5, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v16i32: @@ -1203,15 +1167,16 @@ ; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm5 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vblendvps %ymm2, %ymm5, %ymm6, %ymm7 -; AVX2-NEXT: vblendvps %ymm0, %ymm7, %ymm2, %ymm0 +; AVX2-NEXT: vpsrad $31, %ymm2, %ymm5 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 +; AVX2-NEXT: vblendvps %ymm0, %ymm5, %ymm2, %ymm0 ; AVX2-NEXT: vpcmpgtd %ymm4, %ymm3, %ymm2 ; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm3 ; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vblendvps %ymm3, %ymm5, %ymm6, %ymm2 +; AVX2-NEXT: vpsrad $31, %ymm3, %ymm2 +; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2 ; AVX2-NEXT: vblendvps %ymm1, %ymm2, %ymm3, %ymm1 ; AVX2-NEXT: retq ; @@ -1222,10 +1187,8 @@ ; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm1 ; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 ; AVX512-NEXT: kxorw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtd %zmm1, %zmm2, %k2 -; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX512-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0 {%k2} -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vpsrad $31, %zmm1, %zmm0 +; AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-NEXT: retq %z = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %x, <16 x i32> %y) @@ -1261,12 +1224,9 @@ ; SSE2-NEXT: pxor %xmm4, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -1299,12 +1259,9 @@ ; SSSE3-NEXT: pxor %xmm4, %xmm2 ; 
SSSE3-NEXT: movdqa %xmm2, %xmm1 ; SSSE3-NEXT: pandn %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 -; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSSE3-NEXT: pand %xmm2, %xmm0 ; SSSE3-NEXT: por %xmm1, %xmm0 ; SSSE3-NEXT: retq @@ -1325,17 +1282,18 @@ ; SSE41-NEXT: pand %xmm5, %xmm4 ; SSE41-NEXT: por %xmm3, %xmm4 ; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm3 -; SSE41-NEXT: por %xmm1, %xmm3 -; SSE41-NEXT: pxor %xmm4, %xmm3 -; SSE41-NEXT: movapd {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -1343,24 +1301,24 @@ ; AVX1-LABEL: v2i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX2-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -1371,9 +1329,10 @@ ; AVX512F-NEXT: vpsubq %xmm1, %xmm0, %xmm1 ; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX512F-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpsraq $63, %zmm1, %zmm2 +; AVX512F-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, 
%xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i64: @@ -1383,10 +1342,8 @@ ; AVX512BW-NEXT: vpsubq %xmm1, %xmm0, %xmm1 ; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 ; AVX512BW-NEXT: kxorw %k1, %k0, %k1 -; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm2, %k2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] -; AVX512BW-NEXT: vmovdqa64 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} +; AVX512BW-NEXT: vpsraq $63, %xmm1, %xmm0 +; AVX512BW-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512BW-NEXT: retq %z = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %x, <2 x i64> %y) @@ -1417,34 +1374,29 @@ ; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm7 -; SSE2-NEXT: pxor %xmm4, %xmm7 -; SSE2-NEXT: movdqa %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: pandn %xmm9, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775807,9223372036854775807] -; SSE2-NEXT: pand %xmm10, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: psubq %xmm3, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pxor %xmm8, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm4 @@ -1458,11 +1410,9 @@ ; SSE2-NEXT: pxor %xmm5, %xmm4 ; SSE2-NEXT: movdqa %xmm4, %xmm3 ; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSE2-NEXT: pandn %xmm9, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm2 -; SSE2-NEXT: pand %xmm10, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: retq @@ -1490,34 +1440,29 @@ ; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm7 -; SSSE3-NEXT: pxor %xmm4, %xmm7 -; SSSE3-NEXT: movdqa %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: 
movdqa %xmm5, %xmm4 ; SSSE3-NEXT: pandn %xmm0, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSSE3-NEXT: pandn %xmm9, %xmm0 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775807,9223372036854775807] -; SSSE3-NEXT: pand %xmm10, %xmm5 -; SSSE3-NEXT: por %xmm5, %xmm0 -; SSSE3-NEXT: pand %xmm7, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: pand %xmm5, %xmm0 ; SSSE3-NEXT: por %xmm4, %xmm0 ; SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSSE3-NEXT: pxor %xmm8, %xmm4 ; SSSE3-NEXT: psubq %xmm3, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm5 ; SSSE3-NEXT: pxor %xmm8, %xmm5 -; SSSE3-NEXT: movdqa %xmm4, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm4, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm4, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSSE3-NEXT: por %xmm4, %xmm5 ; SSSE3-NEXT: pxor %xmm8, %xmm3 ; SSSE3-NEXT: movdqa %xmm3, %xmm4 @@ -1531,11 +1476,9 @@ ; SSSE3-NEXT: pxor %xmm5, %xmm4 ; SSSE3-NEXT: movdqa %xmm4, %xmm3 ; SSSE3-NEXT: pandn %xmm1, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSSE3-NEXT: pandn %xmm9, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2 -; SSSE3-NEXT: pand %xmm10, %xmm2 -; SSSE3-NEXT: por %xmm2, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pxor %xmm2, %xmm1 ; SSSE3-NEXT: pand %xmm4, %xmm1 ; SSSE3-NEXT: por %xmm3, %xmm1 ; SSSE3-NEXT: retq @@ -1543,55 +1486,57 @@ ; SSE41-LABEL: v4i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm8, %xmm0 ; SSE41-NEXT: psubq %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pxor %xmm9, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pxor %xmm8, %xmm6 ; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] ; SSE41-NEXT: pand %xmm7, %xmm6 ; SSE41-NEXT: por %xmm0, %xmm6 -; SSE41-NEXT: pxor %xmm9, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm5 -; SSE41-NEXT: por %xmm2, %xmm5 -; SSE41-NEXT: pxor %xmm6, %xmm5 -; SSE41-NEXT: movapd {{.*#+}} xmm8 = [9223372036854775807,9223372036854775807] -; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movapd %xmm7, %xmm2 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: pxor %xmm8, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm7 +; SSE41-NEXT: pshufd 
{{.*#+}} xmm5 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm2 +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: pxor %xmm2, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: pxor %xmm8, %xmm0 ; SSE41-NEXT: psubq %xmm3, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: pxor %xmm9, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm6 -; SSE41-NEXT: por %xmm0, %xmm6 -; SSE41-NEXT: pxor %xmm9, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm2 -; SSE41-NEXT: por %xmm3, %xmm2 -; SSE41-NEXT: pxor %xmm6, %xmm2 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm5 +; SSE41-NEXT: pxor %xmm8, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm5 +; SSE41-NEXT: por %xmm0, %xmm5 +; SSE41-NEXT: pxor %xmm8, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: psrad $31, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE41-NEXT: pxor %xmm2, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 ; SSE41-NEXT: movapd %xmm4, %xmm0 ; SSE41-NEXT: retq ; @@ -1600,31 +1545,33 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm5 ; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vxorpd %ymm0, %ymm3, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-NEXT: vxorpd %ymm0, %ymm4, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm4 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX1-NEXT: vblendvpd %ymm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 +; AVX1-NEXT: 
vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vblendvpd %ymm0, %ymm1, %ymm4, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 ; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807] -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpxor %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1635,9 +1582,9 @@ ; AVX512F-NEXT: vpsubq %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpxor %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm2 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807] -; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX512F-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsraq $63, %zmm1, %zmm2 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX512F-NEXT: vpxor %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; @@ -1648,10 +1595,8 @@ ; AVX512BW-NEXT: vpsubq %ymm1, %ymm0, %ymm1 ; AVX512BW-NEXT: vpcmpgtq %ymm1, %ymm0, %k1 ; AVX512BW-NEXT: kxorw %k1, %k0, %k1 -; AVX512BW-NEXT: vpcmpgtq %ymm1, %ymm2, %k2 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX512BW-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} +; AVX512BW-NEXT: vpsraq $63, %ymm1, %ymm0 +; AVX512BW-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm1 {%k1} ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512BW-NEXT: retq %z = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> %x, <4 x i64> %y) @@ -1682,86 +1627,75 @@ ; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,3,3] ; SSE2-NEXT: pand %xmm11, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm9[1,1,3,3] -; SSE2-NEXT: por %xmm12, %xmm13 -; SSE2-NEXT: pxor %xmm10, %xmm13 -; SSE2-NEXT: movdqa %xmm13, %xmm12 -; SSE2-NEXT: pandn %xmm0, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm12, %xmm4 +; SSE2-NEXT: pxor %xmm10, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm10 +; SSE2-NEXT: pandn %xmm0, %xmm10 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] -; SSE2-NEXT: pandn %xmm9, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm11 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775807,9223372036854775807] -; SSE2-NEXT: pand %xmm10, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm13, %xmm0 -; SSE2-NEXT: por %xmm12, %xmm0 -; 
SSE2-NEXT: movdqa %xmm1, %xmm12 -; SSE2-NEXT: pxor %xmm8, %xmm12 +; SSE2-NEXT: pxor %xmm9, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: pxor %xmm8, %xmm10 ; SSE2-NEXT: psubq %xmm5, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm12, %xmm13 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm12, %xmm4 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm14, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm13[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm12 +; SSE2-NEXT: pand %xmm12, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm10 ; SSE2-NEXT: pxor %xmm8, %xmm5 ; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm8, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm5 +; SSE2-NEXT: pand %xmm11, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm12, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm13 -; SSE2-NEXT: pandn %xmm1, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3] -; SSE2-NEXT: pandn %xmm9, %xmm1 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm12, %xmm5 -; SSE2-NEXT: pand %xmm10, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm10, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm5 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm9, %xmm1 ; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: por %xmm13, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm12 -; SSE2-NEXT: pxor %xmm8, %xmm12 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm10 +; SSE2-NEXT: pxor %xmm8, %xmm10 ; SSE2-NEXT: psubq %xmm6, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: movdqa %xmm12, %xmm4 +; SSE2-NEXT: movdqa %xmm10, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm12, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm5 +; SSE2-NEXT: pand %xmm11, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm5, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm6 ; SSE2-NEXT: movdqa %xmm6, %xmm5 ; SSE2-NEXT: pcmpgtd %xmm8, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm5[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm12, %xmm6 +; SSE2-NEXT: pand %xmm10, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE2-NEXT: por %xmm6, %xmm5 ; SSE2-NEXT: pxor %xmm4, %xmm5 ; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,3,3] -; SSE2-NEXT: pandn %xmm9, %xmm2 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm12, %xmm6 -; SSE2-NEXT: pand %xmm10, %xmm6 -; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pxor %xmm9, 
%xmm2 ; SSE2-NEXT: pand %xmm5, %xmm2 ; SSE2-NEXT: por %xmm4, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm4 @@ -1771,10 +1705,10 @@ ; SSE2-NEXT: pxor %xmm8, %xmm5 ; SSE2-NEXT: movdqa %xmm4, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm12, %xmm4 +; SSE2-NEXT: pand %xmm10, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pxor %xmm8, %xmm7 @@ -1789,11 +1723,9 @@ ; SSE2-NEXT: pxor %xmm5, %xmm4 ; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] -; SSE2-NEXT: pandn %xmm9, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm11 -; SSE2-NEXT: pand %xmm10, %xmm11 -; SSE2-NEXT: por %xmm11, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pxor %xmm9, %xmm3 ; SSE2-NEXT: pand %xmm4, %xmm3 ; SSE2-NEXT: por %xmm5, %xmm3 ; SSE2-NEXT: retq @@ -1821,86 +1753,75 @@ ; SSSE3-NEXT: pcmpeqd %xmm8, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,3,3] ; SSSE3-NEXT: pand %xmm11, %xmm12 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm9[1,1,3,3] -; SSSE3-NEXT: por %xmm12, %xmm13 -; SSSE3-NEXT: pxor %xmm10, %xmm13 -; SSSE3-NEXT: movdqa %xmm13, %xmm12 -; SSSE3-NEXT: pandn %xmm0, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] +; SSSE3-NEXT: por %xmm12, %xmm4 +; SSSE3-NEXT: pxor %xmm10, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm10 +; SSSE3-NEXT: pandn %xmm0, %xmm10 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] -; SSSE3-NEXT: pandn %xmm9, %xmm0 -; SSSE3-NEXT: pxor %xmm11, %xmm11 -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775807,9223372036854775807] -; SSSE3-NEXT: pand %xmm10, %xmm4 -; SSSE3-NEXT: por %xmm4, %xmm0 -; SSSE3-NEXT: pand %xmm13, %xmm0 -; SSSE3-NEXT: por %xmm12, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm12 -; SSSE3-NEXT: pxor %xmm8, %xmm12 +; SSSE3-NEXT: pxor %xmm9, %xmm0 +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: por %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm10 +; SSSE3-NEXT: pxor %xmm8, %xmm10 ; SSSE3-NEXT: psubq %xmm5, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSSE3-NEXT: pxor %xmm8, %xmm4 -; SSSE3-NEXT: movdqa %xmm12, %xmm13 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm13 -; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm12, %xmm4 +; SSSE3-NEXT: movdqa %xmm10, %xmm11 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm14, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm13[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm12 +; SSSE3-NEXT: pand %xmm12, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm10 ; SSSE3-NEXT: pxor %xmm8, %xmm5 ; SSSE3-NEXT: movdqa %xmm5, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm8, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm13, %xmm5 +; SSSE3-NEXT: pand %xmm11, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = 
xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm5, %xmm4 -; SSSE3-NEXT: pxor %xmm12, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm13 -; SSSE3-NEXT: pandn %xmm1, %xmm13 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3] -; SSSE3-NEXT: pandn %xmm9, %xmm1 -; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm12, %xmm5 -; SSSE3-NEXT: pand %xmm10, %xmm5 -; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm10, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pandn %xmm1, %xmm5 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pxor %xmm9, %xmm1 ; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: por %xmm13, %xmm1 -; SSSE3-NEXT: movdqa %xmm2, %xmm12 -; SSSE3-NEXT: pxor %xmm8, %xmm12 +; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm10 +; SSSE3-NEXT: pxor %xmm8, %xmm10 ; SSSE3-NEXT: psubq %xmm6, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, %xmm5 ; SSSE3-NEXT: pxor %xmm8, %xmm5 -; SSSE3-NEXT: movdqa %xmm12, %xmm4 +; SSSE3-NEXT: movdqa %xmm10, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm12, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm13, %xmm5 +; SSSE3-NEXT: pand %xmm11, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm5, %xmm4 ; SSSE3-NEXT: pxor %xmm8, %xmm6 ; SSSE3-NEXT: movdqa %xmm6, %xmm5 ; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm5[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm8, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm12, %xmm6 +; SSSE3-NEXT: pand %xmm10, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSSE3-NEXT: por %xmm6, %xmm5 ; SSSE3-NEXT: pxor %xmm4, %xmm5 ; SSSE3-NEXT: movdqa %xmm5, %xmm4 ; SSSE3-NEXT: pandn %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,3,3] -; SSSE3-NEXT: pandn %xmm9, %xmm2 -; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm12, %xmm6 -; SSSE3-NEXT: pand %xmm10, %xmm6 -; SSSE3-NEXT: por %xmm6, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pxor %xmm9, %xmm2 ; SSSE3-NEXT: pand %xmm5, %xmm2 ; SSSE3-NEXT: por %xmm4, %xmm2 ; SSSE3-NEXT: movdqa %xmm3, %xmm4 @@ -1910,10 +1831,10 @@ ; SSSE3-NEXT: pxor %xmm8, %xmm5 ; SSSE3-NEXT: movdqa %xmm4, %xmm6 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,0,2,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm4, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm12, %xmm4 +; SSSE3-NEXT: pand %xmm10, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSSE3-NEXT: por %xmm4, %xmm5 ; SSSE3-NEXT: pxor %xmm8, %xmm7 @@ -1928,11 +1849,9 @@ ; SSSE3-NEXT: pxor %xmm5, %xmm4 ; SSSE3-NEXT: movdqa %xmm4, %xmm5 ; SSSE3-NEXT: pandn %xmm3, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] -; SSSE3-NEXT: pandn %xmm9, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm11 -; SSSE3-NEXT: pand %xmm10, %xmm11 -; SSSE3-NEXT: por %xmm11, %xmm3 +; SSSE3-NEXT: psrad $31, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: pxor %xmm9, %xmm3 ; SSSE3-NEXT: pand %xmm4, %xmm3 ; SSSE3-NEXT: por %xmm5, %xmm3 ; SSSE3-NEXT: retq @@ -1940,139 +1859,148 @@ ; SSE41-LABEL: v8i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: movdqa {{.*#+}} xmm10 = 
[2147483648,2147483648] -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: psubq %xmm4, %xmm8 -; SSE41-NEXT: movdqa %xmm8, %xmm9 -; SSE41-NEXT: pxor %xmm10, %xmm9 +; SSE41-NEXT: movdqa %xmm8, %xmm10 +; SSE41-NEXT: pxor %xmm9, %xmm10 ; SSE41-NEXT: movdqa %xmm0, %xmm11 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm11 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm11, %xmm12 -; SSE41-NEXT: por %xmm0, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm10, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm11, %xmm10 +; SSE41-NEXT: por %xmm0, %xmm10 +; SSE41-NEXT: pxor %xmm9, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: pand %xmm12, %xmm0 +; SSE41-NEXT: por %xmm11, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm4 +; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] ; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm9 -; SSE41-NEXT: por %xmm4, %xmm9 -; SSE41-NEXT: pxor %xmm12, %xmm9 -; SSE41-NEXT: movapd {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807] -; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movapd %xmm11, %xmm4 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm4 -; SSE41-NEXT: movdqa %xmm9, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: psubq %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm9 +; SSE41-NEXT: pxor %xmm9, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm11 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm11 ; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm13 -; SSE41-NEXT: por %xmm0, %xmm13 -; SSE41-NEXT: pxor %xmm10, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm4 -; SSE41-NEXT: por %xmm5, %xmm4 -; SSE41-NEXT: pxor %xmm13, %xmm4 -; SSE41-NEXT: movapd %xmm11, %xmm5 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm11, %xmm12 +; SSE41-NEXT: por %xmm0, %xmm12 +; SSE41-NEXT: pxor %xmm9, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm11, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm12, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = 
xmm4[1,1,3,3] +; SSE41-NEXT: pxor %xmm10, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: psubq %xmm6, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm9 +; SSE41-NEXT: pxor %xmm9, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm5 -; SSE41-NEXT: por %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm10, %xmm6 -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm4 -; SSE41-NEXT: por %xmm6, %xmm4 -; SSE41-NEXT: pxor %xmm5, %xmm4 -; SSE41-NEXT: movapd %xmm11, %xmm5 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm4 +; SSE41-NEXT: por %xmm0, %xmm4 +; SSE41-NEXT: pxor %xmm9, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm11, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE41-NEXT: pxor %xmm10, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: psubq %xmm7, %xmm3 ; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 +; SSE41-NEXT: pxor %xmm9, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm6 -; SSE41-NEXT: por %xmm0, %xmm6 -; SSE41-NEXT: pxor %xmm10, %xmm7 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm4 -; SSE41-NEXT: por %xmm7, %xmm4 -; SSE41-NEXT: pxor %xmm6, %xmm4 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm11 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm4 +; SSE41-NEXT: por %xmm0, %xmm4 +; SSE41-NEXT: pxor %xmm9, %xmm7 +; SSE41-NEXT: movdqa %xmm7, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE41-NEXT: pxor %xmm10, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 ; SSE41-NEXT: movapd %xmm8, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v8i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm6 
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm7 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpsubq %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm7, %xmm7 +; AVX1-NEXT: vpsubq %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm7 ; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 ; AVX1-NEXT: vxorpd %ymm0, %ymm6, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807] -; AVX1-NEXT: vmovapd {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX1-NEXT: vblendvpd %ymm2, %ymm4, %ymm6, %ymm7 -; AVX1-NEXT: vblendvpd %ymm0, %ymm7, %ymm2, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm6 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-NEXT: vmovapd {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX1-NEXT: vxorpd %ymm5, %ymm2, %ymm2 +; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm6, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 ; AVX1-NEXT: vpsubq %xmm2, %xmm7, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm7 ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 -; AVX1-NEXT: vxorpd %ymm1, %ymm5, %ymm1 +; AVX1-NEXT: vxorpd %ymm1, %ymm6, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm6 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vblendvpd %ymm2, %ymm4, %ymm6, %ymm3 -; AVX1-NEXT: vblendvpd %ymm1, %ymm3, %ymm2, %ymm1 +; AVX1-NEXT: vxorpd %ymm5, %ymm2, %ymm2 +; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm6, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v8i64: @@ -2082,15 +2010,16 @@ ; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm5 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807] -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vblendvpd %ymm2, %ymm5, %ymm6, %ymm7 -; AVX2-NEXT: vblendvpd %ymm0, %ymm7, %ymm2, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm5 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 +; AVX2-NEXT: vblendvpd %ymm0, %ymm5, %ymm2, %ymm0 ; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm2 ; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm3 ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vblendvpd %ymm3, %ymm5, %ymm6, %ymm2 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm2 +; AVX2-NEXT: vpxor %ymm6, %ymm2, 
%ymm2 ; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm1 ; AVX2-NEXT: retq ; @@ -2101,10 +2030,8 @@ ; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm1 ; AVX512-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512-NEXT: kxorw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtq %zmm1, %zmm2, %k2 -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX512-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vpsraq $63, %zmm1, %zmm0 +; AVX512-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-NEXT: retq %z = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> %x, <8 x i64> %y) @@ -2122,31 +2049,25 @@ ; SSE-NEXT: movq %r8, %rbx ; SSE-NEXT: sarq $63, %rbx ; SSE-NEXT: testb %r10b, %r10b -; SSE-NEXT: cmoveq %rcx, %rbx -; SSE-NEXT: xorl %ecx, %ecx -; SSE-NEXT: testq %r8, %r8 -; SSE-NEXT: setns %cl -; SSE-NEXT: movabsq $9223372036854775807, %r11 # imm = 0x7FFFFFFFFFFFFFFF -; SSE-NEXT: addq %r11, %rcx +; SSE-NEXT: cmovneq %rbx, %rcx +; SSE-NEXT: movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000 +; SSE-NEXT: xorq %r11, %rbx ; SSE-NEXT: testb %r10b, %r10b -; SSE-NEXT: cmoveq %r8, %rcx +; SSE-NEXT: cmoveq %r8, %rbx ; SSE-NEXT: subq %r9, %rsi ; SSE-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx ; SSE-NEXT: seto %r8b ; SSE-NEXT: movq %rdx, %rdi ; SSE-NEXT: sarq $63, %rdi ; SSE-NEXT: testb %r8b, %r8b -; SSE-NEXT: cmoveq %rsi, %rdi -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: testq %rdx, %rdx -; SSE-NEXT: setns %sil -; SSE-NEXT: addq %r11, %rsi +; SSE-NEXT: cmovneq %rdi, %rsi +; SSE-NEXT: xorq %r11, %rdi ; SSE-NEXT: testb %r8b, %r8b -; SSE-NEXT: cmoveq %rdx, %rsi -; SSE-NEXT: movq %rbx, 16(%rax) -; SSE-NEXT: movq %rdi, (%rax) -; SSE-NEXT: movq %rcx, 24(%rax) -; SSE-NEXT: movq %rsi, 8(%rax) +; SSE-NEXT: cmoveq %rdx, %rdi +; SSE-NEXT: movq %rcx, 16(%rax) +; SSE-NEXT: movq %rsi, (%rax) +; SSE-NEXT: movq %rbx, 24(%rax) +; SSE-NEXT: movq %rdi, 8(%rax) ; SSE-NEXT: popq %rbx ; SSE-NEXT: retq ; @@ -2160,31 +2081,25 @@ ; AVX-NEXT: movq %r8, %rbx ; AVX-NEXT: sarq $63, %rbx ; AVX-NEXT: testb %r10b, %r10b -; AVX-NEXT: cmoveq %rcx, %rbx -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: testq %r8, %r8 -; AVX-NEXT: setns %cl -; AVX-NEXT: movabsq $9223372036854775807, %r11 # imm = 0x7FFFFFFFFFFFFFFF -; AVX-NEXT: addq %r11, %rcx +; AVX-NEXT: cmovneq %rbx, %rcx +; AVX-NEXT: movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000 +; AVX-NEXT: xorq %r11, %rbx ; AVX-NEXT: testb %r10b, %r10b -; AVX-NEXT: cmoveq %r8, %rcx +; AVX-NEXT: cmoveq %r8, %rbx ; AVX-NEXT: subq %r9, %rsi ; AVX-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx ; AVX-NEXT: seto %r8b ; AVX-NEXT: movq %rdx, %rdi ; AVX-NEXT: sarq $63, %rdi ; AVX-NEXT: testb %r8b, %r8b -; AVX-NEXT: cmoveq %rsi, %rdi -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: testq %rdx, %rdx -; AVX-NEXT: setns %sil -; AVX-NEXT: addq %r11, %rsi +; AVX-NEXT: cmovneq %rdi, %rsi +; AVX-NEXT: xorq %r11, %rdi ; AVX-NEXT: testb %r8b, %r8b -; AVX-NEXT: cmoveq %rdx, %rsi -; AVX-NEXT: movq %rbx, 16(%rax) -; AVX-NEXT: movq %rdi, (%rax) -; AVX-NEXT: movq %rcx, 24(%rax) -; AVX-NEXT: movq %rsi, 8(%rax) +; AVX-NEXT: cmoveq %rdx, %rdi +; AVX-NEXT: movq %rcx, 16(%rax) +; AVX-NEXT: movq %rsi, (%rax) +; AVX-NEXT: movq %rbx, 24(%rax) +; AVX-NEXT: movq %rdi, 8(%rax) ; AVX-NEXT: popq %rbx ; AVX-NEXT: retq %z = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %x, <2 x i128> %y)