diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -549,6 +549,7 @@
                            SDValue N2, SDValue N3, ISD::CondCode CC);
     SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
                               const SDLoc &DL);
+    SDValue foldSubToUSubSat(EVT DstVT, SDNode *N);
     SDValue unfoldMaskedMerge(SDNode *N);
     SDValue unfoldExtremeBitClearingToShifts(SDNode *N);
     SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
@@ -3125,6 +3126,64 @@
   return SDValue();
 }
 
+// Try to find umax(a,b) - b or a - umin(a,b) patterns that may be converted to
+// usubsat(a,b), optionally as a truncated type.
+SDValue DAGCombiner::foldSubToUSubSat(EVT DstVT, SDNode *N) {
+  if (N->getOpcode() != ISD::SUB ||
+      !(!LegalOperations || hasOperation(ISD::USUBSAT, DstVT)))
+    return SDValue();
+
+  EVT SubVT = N->getValueType(0);
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  assert(DstVT.getScalarSizeInBits() <= SubVT.getScalarSizeInBits() &&
+         "Illegal truncation");
+
+  auto TruncatedUSUBSAT = [&](SDValue LHS, SDValue RHS) {
+    SDLoc DL(N);
+    if (DstVT == SubVT)
+      return DAG.getNode(ISD::USUBSAT, DL, DstVT, LHS, RHS);
+
+    // If the LHS is zero-extended then we can perform the USUBSAT as DstVT by
+    // clamping RHS.
+    KnownBits KnownLHS = DAG.computeKnownBits(LHS);
+    unsigned NumZeros = KnownLHS.countMinLeadingZeros();
+    if (NumZeros < (SubVT.getScalarSizeInBits() - DstVT.getScalarSizeInBits()))
+      return SDValue();
+
+    SDValue SatLimit =
+        DAG.getConstant(APInt::getLowBitsSet(SubVT.getScalarSizeInBits(),
+                                             DstVT.getScalarSizeInBits()),
+                        DL, SubVT);
+    RHS = DAG.getNode(ISD::UMIN, DL, SubVT, RHS, SatLimit);
+    RHS = DAG.getZExtOrTrunc(RHS, DL, DstVT);
+    LHS = DAG.getZExtOrTrunc(LHS, DL, DstVT);
+    return DAG.getNode(ISD::USUBSAT, DL, DstVT, LHS, RHS);
+  };
+
+  // Try to find umax(a,b) - b or a - umin(a,b) patterns that may be
+  // converted to usubsat(a,b).
+  if (Op0.getOpcode() == ISD::UMAX) {
+    SDValue MaxLHS = Op0.getOperand(0);
+    SDValue MaxRHS = Op0.getOperand(1);
+    if (MaxLHS == Op1)
+      return TruncatedUSUBSAT(MaxRHS, Op1);
+    if (MaxRHS == Op1)
+      return TruncatedUSUBSAT(MaxLHS, Op1);
+  }
+
+  if (Op1.getOpcode() == ISD::UMIN) {
+    SDValue MinLHS = Op1.getOperand(0);
+    SDValue MinRHS = Op1.getOperand(1);
+    if (MinLHS == Op0)
+      return TruncatedUSUBSAT(Op0, MinRHS);
+    if (MinRHS == Op0)
+      return TruncatedUSUBSAT(Op0, MinLHS);
+  }
+
+  return SDValue();
+}
+
 // Since it may not be valid to emit a fold to zero for vector initializers
 // check if we can before folding.
 static SDValue tryFoldToZero(const SDLoc &DL, const TargetLowering &TLI, EVT VT,
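Note (illustrative, not part of the patch): the clamp in TruncatedUSUBSAT can be checked with SubVT = i32, DstVT = i16 and an LHS known to be zero-extended from i16, say LHS = 40000. For RHS = 70000, sub(umax(LHS, RHS), RHS) = 70000 - 70000 = 0, and usubsat.i16(40000, umin(70000, 65535)) = usubsat.i16(40000, 65535) = 0. For RHS = 30000, both forms give 10000. Clamping RHS to the DstVT saturation limit before truncating is what keeps the narrow usubsat equal to the truncated wide subtraction once RHS exceeds the i16 range.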
@@ -3343,6 +3402,9 @@
   if (SDValue V = foldAddSubMasked1(false, N0, N1, DAG, SDLoc(N)))
     return V;
 
+  if (SDValue V = foldSubToUSubSat(VT, N))
+    return V;
+
   // (x - y) - 1 -> add (xor y, -1), x
   if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB && isOneOrOneSplat(N1)) {
     SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1),
@@ -11816,6 +11878,9 @@
     }
   }
 
+  if (SDValue V = foldSubToUSubSat(VT, N0.getNode()))
+    return V;
+
   // Attempt to pre-truncate BUILD_VECTOR sources.
   if (N0.getOpcode() == ISD::BUILD_VECTOR && !LegalOperations &&
       TLI.isTruncateFree(SrcVT.getScalarType(), VT.getScalarType()) &&
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -49148,32 +49148,9 @@
     return SDValue();
 
   SDValue SubusLHS, SubusRHS;
-  // Try to find umax(a,b) - b or a - umin(a,b) patterns
-  // they may be converted to subus(a,b).
-  // TODO: Need to add IR canonicalization for this code.
-  if (Op0.getOpcode() == ISD::UMAX) {
-    SubusRHS = Op1;
-    SDValue MaxLHS = Op0.getOperand(0);
-    SDValue MaxRHS = Op0.getOperand(1);
-    if (MaxLHS == Op1)
-      SubusLHS = MaxRHS;
-    else if (MaxRHS == Op1)
-      SubusLHS = MaxLHS;
-    else
-      return SDValue();
-  } else if (Op1.getOpcode() == ISD::UMIN) {
-    SubusLHS = Op0;
-    SDValue MinLHS = Op1.getOperand(0);
-    SDValue MinRHS = Op1.getOperand(1);
-    if (MinLHS == Op0)
-      SubusRHS = MinRHS;
-    else if (MinRHS == Op0)
-      SubusRHS = MinLHS;
-    else
-      return SDValue();
-  } else if (Op1.getOpcode() == ISD::TRUNCATE &&
-             Op1.getOperand(0).getOpcode() == ISD::UMIN &&
-             (EltVT == MVT::i8 || EltVT == MVT::i16)) {
+  if (Op1.getOpcode() == ISD::TRUNCATE &&
+      Op1.getOperand(0).getOpcode() == ISD::UMIN &&
+      (EltVT == MVT::i8 || EltVT == MVT::i16)) {
     // Special case where the UMIN has been truncated. Try to push the truncate
     // further up. This is similar to the i32/i64 special processing.
     SubusLHS = Op0;
diff --git a/llvm/test/CodeGen/AArch64/usub_sat.ll b/llvm/test/CodeGen/AArch64/usub_sat.ll
--- a/llvm/test/CodeGen/AArch64/usub_sat.ll
+++ b/llvm/test/CodeGen/AArch64/usub_sat.ll
@@ -30,11 +30,9 @@
 define i16 @func16(i16 %x, i16 %y) nounwind {
 ; CHECK-LABEL: func16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w1, #0xffff
-; CHECK-NEXT: and w9, w0, #0xffff
-; CHECK-NEXT: cmp w9, w8
-; CHECK-NEXT: csel w9, w9, w8, hi
-; CHECK-NEXT: sub w0, w9, w8
+; CHECK-NEXT: and w8, w0, #0xffff
+; CHECK-NEXT: subs w8, w8, w1, uxth
+; CHECK-NEXT: csel w0, wzr, w8, lo
 ; CHECK-NEXT: ret
   %tmp = call i16 @llvm.usub.sat.i16(i16 %x, i16 %y);
   ret i16 %tmp;
@@ -43,11 +41,9 @@
 define i8 @func8(i8 %x, i8 %y) nounwind {
 ; CHECK-LABEL: func8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w1, #0xff
-; CHECK-NEXT: and w9, w0, #0xff
-; CHECK-NEXT: cmp w9, w8
-; CHECK-NEXT: csel w9, w9, w8, hi
-; CHECK-NEXT: sub w0, w9, w8
+; CHECK-NEXT: and w8, w0, #0xff
+; CHECK-NEXT: subs w8, w8, w1, uxtb
+; CHECK-NEXT: csel w0, wzr, w8, lo
 ; CHECK-NEXT: ret
   %tmp = call i8 @llvm.usub.sat.i8(i8 %x, i8 %y);
   ret i8 %tmp;
@@ -58,9 +54,8 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: and w8, w1, #0xf
 ; CHECK-NEXT: and w9, w0, #0xf
-; CHECK-NEXT: cmp w9, w8
-; CHECK-NEXT: csel w9, w9, w8, hi
-; CHECK-NEXT: sub w0, w9, w8
+; CHECK-NEXT: subs w8, w9, w8
+; CHECK-NEXT: csel w0, wzr, w8, lo
 ; CHECK-NEXT: ret
   %tmp = call i4 @llvm.usub.sat.i4(i4 %x, i4 %y);
   ret i4 %tmp;
diff --git a/llvm/test/CodeGen/AArch64/usub_sat_plus.ll b/llvm/test/CodeGen/AArch64/usub_sat_plus.ll
--- a/llvm/test/CodeGen/AArch64/usub_sat_plus.ll
+++ b/llvm/test/CodeGen/AArch64/usub_sat_plus.ll
@@ -33,12 +33,10 @@
 define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
 ; CHECK-LABEL: func16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mul w9, w1, w2
 ; CHECK-NEXT: and w8, w0, #0xffff
-; CHECK-NEXT: and w9, w9, #0xffff
-; CHECK-NEXT: cmp w8, w9
-; CHECK-NEXT: csel w8, w8, w9, hi
-; CHECK-NEXT: sub w0, w8, w9
+; CHECK-NEXT: mul w9, w1, w2
+; CHECK-NEXT: subs w8, w8, w9, uxth
+; CHECK-NEXT: csel w0, wzr, w8, lo
 ; CHECK-NEXT: ret
   %a = mul i16 %y, %z
   %tmp = call i16 @llvm.usub.sat.i16(i16 %x, i16 %a)
@@ -48,12 +46,10 @@
 define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind {
 ; CHECK-LABEL: func8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mul w9, w1, w2
 ; CHECK-NEXT: and w8, w0, #0xff
-; CHECK-NEXT: and w9, w9, #0xff
-; CHECK-NEXT: cmp w8, w9
-; CHECK-NEXT: csel w8, w8, w9, hi
-; CHECK-NEXT: sub w0, w8, w9
+; CHECK-NEXT: mul w9, w1, w2
+; CHECK-NEXT: subs w8, w8, w9, uxtb
+; CHECK-NEXT: csel w0, wzr, w8, lo
 ; CHECK-NEXT: ret
   %a = mul i8 %y, %z
   %tmp = call i8 @llvm.usub.sat.i8(i8 %x, i8 %a)
@@ -66,9 +62,8 @@
 ; CHECK-NEXT: mul w9, w1, w2
 ; CHECK-NEXT: and w8, w0, #0xf
 ; CHECK-NEXT: and w9, w9, #0xf
-; CHECK-NEXT: cmp w8, w9
-; CHECK-NEXT: csel w8, w8, w9, hi
-; CHECK-NEXT: sub w0, w8, w9
+; CHECK-NEXT: subs w8, w8, w9
+; CHECK-NEXT: csel w0, wzr, w8, lo
 ; CHECK-NEXT: ret
   %a = mul i4 %y, %z
   %tmp = call i4 @llvm.usub.sat.i4(i4 %x, i4 %a)
diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll
--- a/llvm/test/CodeGen/AMDGPU/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll
@@ -125,17 +125,17 @@
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT: s_mov_b32 s4, 0xffff
-; GFX6-NEXT: v_and_b32_e32 v7, s4, v3
+; GFX6-NEXT: v_and_b32_e32 v6, s4, v3
 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT: v_and_b32_e32 v8, s4, v4
+; GFX6-NEXT: v_and_b32_e32 v7, s4, v4
 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT: v_max_u32_e32 v1, v1, v8
-; GFX6-NEXT: v_max_u32_e32 v0, v0, v7
-; GFX6-NEXT: v_and_b32_e32 v6, s4, v5
+; GFX6-NEXT: v_max_u32_e32 v1, v1, v7
+; GFX6-NEXT: v_max_u32_e32 v0, v0, v6
+; GFX6-NEXT: v_and_b32_e32 v5, s4, v5
 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
-; GFX6-NEXT: v_max_u32_e32 v2, v2, v6
+; GFX6-NEXT: v_max_u32_e32 v2, v2, v5
 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v2, v5
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
diff --git a/llvm/test/CodeGen/ARM/usub_sat.ll b/llvm/test/CodeGen/ARM/usub_sat.ll
--- a/llvm/test/CodeGen/ARM/usub_sat.ll
+++ b/llvm/test/CodeGen/ARM/usub_sat.ll
@@ -93,29 +93,26 @@
 define zeroext i16 @func16(i16 zeroext %x, i16 zeroext %y) nounwind {
 ; CHECK-T1-LABEL: func16:
 ; CHECK-T1: @ %bb.0:
-; CHECK-T1-NEXT: cmp r0, r1
-; CHECK-T1-NEXT: bhi .LBB2_2
+; CHECK-T1-NEXT: subs r0, r0, r1
+; CHECK-T1-NEXT: bhs .LBB2_2
 ; CHECK-T1-NEXT: @ %bb.1:
-; CHECK-T1-NEXT: mov r0, r1
+; CHECK-T1-NEXT: movs r0, #0
 ; CHECK-T1-NEXT: .LBB2_2:
-; CHECK-T1-NEXT: subs r0, r0, r1
 ; CHECK-T1-NEXT: uxth r0, r0
 ; CHECK-T1-NEXT: bx lr
 ;
 ; CHECK-T2-LABEL: func16:
 ; CHECK-T2: @ %bb.0:
-; CHECK-T2-NEXT: cmp r0, r1
-; CHECK-T2-NEXT: it ls
-; CHECK-T2-NEXT: movls r0, r1
 ; CHECK-T2-NEXT: subs r0, r0, r1
+; CHECK-T2-NEXT: it lo
+; CHECK-T2-NEXT: movlo r0, #0
 ; CHECK-T2-NEXT: uxth r0, r0
 ; CHECK-T2-NEXT: bx lr
 ;
 ; CHECK-ARM-LABEL: func16:
 ; CHECK-ARM: @ %bb.0:
-; CHECK-ARM-NEXT: cmp r0, r1
-; CHECK-ARM-NEXT: movls r0, r1
-; CHECK-ARM-NEXT: sub r0, r0, r1
+; CHECK-ARM-NEXT: subs r0, r0, r1
+; CHECK-ARM-NEXT: movlo r0, #0
 ; CHECK-ARM-NEXT: uxth r0, r0
 ; CHECK-ARM-NEXT: bx lr
   %tmp = call i16 @llvm.usub.sat.i16(i16 %x, i16 %y)
@@ -125,29 +122,26 @@
 define zeroext i8 @func8(i8 zeroext %x, i8 zeroext %y) nounwind {
 ; CHECK-T1-LABEL: func8:
 ; CHECK-T1: @ %bb.0:
-; CHECK-T1-NEXT: cmp r0, r1
-; CHECK-T1-NEXT: bhi .LBB3_2
+; CHECK-T1-NEXT: subs r0, r0, r1
+; CHECK-T1-NEXT: bhs .LBB3_2
 ; CHECK-T1-NEXT: @ %bb.1:
-; CHECK-T1-NEXT: mov r0, r1
+; CHECK-T1-NEXT: movs r0, #0
 ; CHECK-T1-NEXT: .LBB3_2:
-; CHECK-T1-NEXT: subs r0, r0, r1
 ; CHECK-T1-NEXT: uxtb r0, r0
 ; CHECK-T1-NEXT: bx lr
 ;
 ; CHECK-T2-LABEL: func8:
 ; CHECK-T2: @ %bb.0:
-; CHECK-T2-NEXT: cmp r0, r1
-; CHECK-T2-NEXT: it ls
-; CHECK-T2-NEXT: movls r0, r1
 ; CHECK-T2-NEXT: subs r0, r0, r1
+; CHECK-T2-NEXT: it lo
+; CHECK-T2-NEXT: movlo r0, #0
 ; CHECK-T2-NEXT: uxtb r0, r0
 ; CHECK-T2-NEXT: bx lr
 ;
 ; CHECK-ARM-LABEL: func8:
 ; CHECK-ARM: @ %bb.0:
-; CHECK-ARM-NEXT: cmp r0, r1
-; CHECK-ARM-NEXT: movls r0, r1
-; CHECK-ARM-NEXT: sub r0, r0, r1
+; CHECK-ARM-NEXT: subs r0, r0, r1
+; CHECK-ARM-NEXT: movlo r0, #0
 ; CHECK-ARM-NEXT: uxtb r0, r0
 ; CHECK-ARM-NEXT: bx lr
   %tmp = call i8 @llvm.usub.sat.i8(i8 %x, i8 %y)
@@ -157,30 +151,27 @@
 define zeroext i4 @func3(i4 zeroext %x, i4 zeroext %y) nounwind {
 ; CHECK-T1-LABEL: func3:
 ; CHECK-T1: @ %bb.0:
-; CHECK-T1-NEXT: cmp r0, r1
-; CHECK-T1-NEXT: bhi .LBB4_2
+; CHECK-T1-NEXT: subs r1, r0, r1
+; CHECK-T1-NEXT: bhs .LBB4_2
 ; CHECK-T1-NEXT: @ %bb.1:
-; CHECK-T1-NEXT: mov r0, r1
+; CHECK-T1-NEXT: movs r1, #0
 ; CHECK-T1-NEXT: .LBB4_2:
-; CHECK-T1-NEXT: subs r1, r0, r1
 ; CHECK-T1-NEXT: movs r0, #15
 ; CHECK-T1-NEXT: ands r0, r1
 ; CHECK-T1-NEXT: bx lr
 ;
 ; CHECK-T2-LABEL: func3:
 ; CHECK-T2: @ %bb.0:
-; CHECK-T2-NEXT: cmp r0, r1
-; CHECK-T2-NEXT: it ls
-; CHECK-T2-NEXT: movls r0, r1
 ; CHECK-T2-NEXT: subs r0, r0, r1
+; CHECK-T2-NEXT: it lo
+; CHECK-T2-NEXT: movlo r0, #0
 ; CHECK-T2-NEXT: and r0, r0, #15
 ; CHECK-T2-NEXT: bx lr
 ;
 ; CHECK-ARM-LABEL: func3:
 ; CHECK-ARM: @ %bb.0:
-; CHECK-ARM-NEXT: cmp r0, r1
-; CHECK-ARM-NEXT: movls r0, r1
-; CHECK-ARM-NEXT: sub r0, r0, r1
+; CHECK-ARM-NEXT: subs r0, r0, r1
+; CHECK-ARM-NEXT: movlo r0, #0
 ; CHECK-ARM-NEXT: and r0, r0, #15
 ; CHECK-ARM-NEXT: bx lr
   %tmp = call i4 @llvm.usub.sat.i4(i4 %x, i4 %y)
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -1407,26 +1407,26 @@
 ;
 ; SSSE3-LABEL: psubus_8i32_max:
 ; SSSE3: # %bb.0: # %vector.ph
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pxor %xmm4, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
-; SSSE3-NEXT: movdqa %xmm6, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535]
-; SSSE3-NEXT: pand %xmm7, %xmm2
-; SSSE3-NEXT: pandn %xmm5, %xmm7
-; SSSE3-NEXT: por %xmm2, %xmm7
-; SSSE3-NEXT: pshufb %xmm3, %xmm7
-; SSSE3-NEXT: pxor %xmm1, %xmm4
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pxor %xmm3, %xmm4
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: pand %xmm6, %xmm1
-; SSSE3-NEXT: pandn %xmm5, %xmm6
-; SSSE3-NEXT: por %xmm1, %xmm6
-; SSSE3-NEXT: pshufb %xmm3, %xmm6
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; SSSE3-NEXT: psubusw %xmm6, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
+; SSSE3-NEXT: pand %xmm6, %xmm2
+; SSSE3-NEXT: pandn %xmm4, %xmm6
+; SSSE3-NEXT: por %xmm2, %xmm6
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm2, %xmm6
+; SSSE3-NEXT: pxor %xmm1, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pandn %xmm4, %xmm5
+; SSSE3-NEXT: por %xmm1, %xmm5
+; SSSE3-NEXT: pshufb %xmm2, %xmm5
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
+; SSSE3-NEXT: psubusw %xmm5, %xmm0
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: psubus_8i32_max:
@@ -1671,25 +1671,25 @@
 ;
 ; AVX1-LABEL: psubus_8i64_max:
 ; AVX1: # %bb.0: # %vector.ph
-; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [65535,65535]
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm6
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [9223372036854841343,9223372036854841343]
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343]
+; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [65535,65535]
+; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5
+; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm7, %xmm2
+; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
+; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4
+; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm7, %xmm1
+; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vzeroupper
@@ -1733,119 +1733,119 @@
 ; SSE2-LABEL: psubus_16i32_max:
 ; SSE2: # %bb.0: # %vector.ph
 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm5, %xmm8
+; SSE2-NEXT: movdqa %xmm3, %xmm8
 ; SSE2-NEXT: pxor %xmm9, %xmm8
-; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183,2147549183,2147549183]
-; SSE2-NEXT: movdqa %xmm7, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147549183,2147549183,2147549183,2147549183]
+; SSE2-NEXT: movdqa %xmm10, %xmm6
 ; SSE2-NEXT: pcmpgtd %xmm8, %xmm6
 ; SSE2-NEXT: pcmpeqd %xmm8, %xmm8
-; SSE2-NEXT: pand %xmm6, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm6
-; SSE2-NEXT: por %xmm5, %xmm6
-; SSE2-NEXT: pslld $16, %xmm6
-; SSE2-NEXT: psrad $16, %xmm6
-; SSE2-NEXT: movdqa %xmm4, %xmm10
-; SSE2-NEXT: pxor %xmm9, %xmm10
-; SSE2-NEXT: movdqa %xmm7, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: por %xmm4, %xmm5
-; SSE2-NEXT: pslld $16, %xmm5
-; SSE2-NEXT: psrad $16, %xmm5
-; SSE2-NEXT: packssdw %xmm6, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm4
-; SSE2-NEXT: movdqa %xmm7, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
 ; SSE2-NEXT: pand %xmm6, %xmm3
 ; SSE2-NEXT: pxor %xmm8, %xmm6
 ; SSE2-NEXT: por %xmm3, %xmm6
 ; SSE2-NEXT: pslld $16, %xmm6
 ; SSE2-NEXT: psrad $16, %xmm6
-; SSE2-NEXT: pxor %xmm2, %xmm9
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm7
-; SSE2-NEXT: pxor %xmm7, %xmm8
-; SSE2-NEXT: pand %xmm2, %xmm7
-; SSE2-NEXT: por %xmm8, %xmm7
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm9, %xmm3
+; SSE2-NEXT: movdqa %xmm10, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm2
+; SSE2-NEXT: pxor %xmm8, %xmm7
+; SSE2-NEXT: por %xmm2, %xmm7
 ; SSE2-NEXT: pslld $16, %xmm7
 ; SSE2-NEXT: psrad $16, %xmm7
 ; SSE2-NEXT: packssdw %xmm6, %xmm7
 ; SSE2-NEXT: psubusw %xmm7, %xmm0
-; SSE2-NEXT: psubusw %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm5, %xmm2
+; SSE2-NEXT: pxor %xmm9, %xmm2
+; SSE2-NEXT: movdqa %xmm10, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm5
+; SSE2-NEXT: pxor %xmm8, %xmm3
+; SSE2-NEXT: por %xmm5, %xmm3
+; SSE2-NEXT: pslld $16, %xmm3
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: pxor %xmm10, %xmm8
+; SSE2-NEXT: pand %xmm4, %xmm10
+; SSE2-NEXT: por %xmm8, %xmm10
+; SSE2-NEXT: pslld $16, %xmm10
+; SSE2-NEXT: psrad $16, %xmm10
+; SSE2-NEXT: packssdw %xmm3, %xmm10
+; SSE2-NEXT: psubusw %xmm10, %xmm1
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: psubus_16i32_max:
 ; SSSE3: # %bb.0: # %vector.ph
 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm5, %xmm8
+; SSSE3-NEXT: movdqa %xmm3, %xmm8
 ; SSSE3-NEXT: pxor %xmm9, %xmm8
-; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183,2147549183,2147549183]
-; SSSE3-NEXT: movdqa %xmm7, %xmm6
+; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147549183,2147549183,2147549183,2147549183]
+; SSSE3-NEXT: movdqa %xmm10, %xmm6
 ; SSSE3-NEXT: pcmpgtd %xmm8, %xmm6
 ; SSSE3-NEXT: pcmpeqd %xmm8, %xmm8
-; SSSE3-NEXT: pand %xmm6, %xmm5
-; SSSE3-NEXT: pxor %xmm8, %xmm6
-; SSSE3-NEXT: por %xmm5, %xmm6
-; SSSE3-NEXT: pslld $16, %xmm6
-; SSSE3-NEXT: psrad $16, %xmm6
-; SSSE3-NEXT: movdqa %xmm4, %xmm10
-; SSSE3-NEXT: pxor %xmm9, %xmm10
-; SSSE3-NEXT: movdqa %xmm7, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm4
-; SSSE3-NEXT: pxor %xmm8, %xmm5
-; SSSE3-NEXT: por %xmm4, %xmm5
-; SSSE3-NEXT: pslld $16, %xmm5
-; SSSE3-NEXT: psrad $16, %xmm5
-; SSSE3-NEXT: packssdw %xmm6, %xmm5
-; SSSE3-NEXT: movdqa %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm9, %xmm4
-; SSSE3-NEXT: movdqa %xmm7, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
 ; SSSE3-NEXT: pand %xmm6, %xmm3
 ; SSSE3-NEXT: pxor %xmm8, %xmm6
 ; SSSE3-NEXT: por %xmm3, %xmm6
 ; SSSE3-NEXT: pslld $16, %xmm6
 ; SSSE3-NEXT: psrad $16, %xmm6
-; SSSE3-NEXT: pxor %xmm2, %xmm9
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm7
-; SSSE3-NEXT: pxor %xmm7, %xmm8
-; SSSE3-NEXT: pand %xmm2, %xmm7
-; SSSE3-NEXT: por %xmm8, %xmm7
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pxor %xmm9, %xmm3
+; SSSE3-NEXT: movdqa %xmm10, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7
+; SSSE3-NEXT: pand %xmm7, %xmm2
+; SSSE3-NEXT: pxor %xmm8, %xmm7
+; SSSE3-NEXT: por %xmm2, %xmm7
 ; SSSE3-NEXT: pslld $16, %xmm7
 ; SSSE3-NEXT: psrad $16, %xmm7
 ; SSSE3-NEXT: packssdw %xmm6, %xmm7
 ; SSSE3-NEXT: psubusw %xmm7, %xmm0
-; SSSE3-NEXT: psubusw %xmm5, %xmm1
+; SSSE3-NEXT: movdqa %xmm5, %xmm2
+; SSSE3-NEXT: pxor %xmm9, %xmm2
+; SSSE3-NEXT: movdqa %xmm10, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
+; SSSE3-NEXT: pand %xmm3, %xmm5
+; SSSE3-NEXT: pxor %xmm8, %xmm3
+; SSSE3-NEXT: por %xmm5, %xmm3
+; SSSE3-NEXT: pslld $16, %xmm3
+; SSSE3-NEXT: psrad $16, %xmm3
+; SSSE3-NEXT: pxor %xmm4, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm10
+; SSSE3-NEXT: pxor %xmm10, %xmm8
+; SSSE3-NEXT: pand %xmm4, %xmm10
+; SSSE3-NEXT: por %xmm8, %xmm10
+; SSSE3-NEXT: pslld $16, %xmm10
+; SSSE3-NEXT: psrad $16, %xmm10
+; SSSE3-NEXT: packssdw %xmm3, %xmm10
+; SSSE3-NEXT: psubusw %xmm10, %xmm1
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: psubus_16i32_max:
 ; SSE41: # %bb.0: # %vector.ph
 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535]
-; SSE41-NEXT: pminud %xmm6, %xmm5
-; SSE41-NEXT: pminud %xmm6, %xmm4
-; SSE41-NEXT: packusdw %xmm5, %xmm4
 ; SSE41-NEXT: pminud %xmm6, %xmm3
 ; SSE41-NEXT: pminud %xmm6, %xmm2
 ; SSE41-NEXT: packusdw %xmm3, %xmm2
 ; SSE41-NEXT: psubusw %xmm2, %xmm0
+; SSE41-NEXT: pminud %xmm6, %xmm5
+; SSE41-NEXT: pminud %xmm6, %xmm4
+; SSE41-NEXT: packusdw %xmm5, %xmm4
 ; SSE41-NEXT: psubusw %xmm4, %xmm1
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: psubus_16i32_max:
 ; AVX1: # %bb.0: # %vector.ph
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
-; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpminud %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
 ; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT: vpsubusw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpminud %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -1902,26 +1902,26 @@
 ;
 ; SSSE3-LABEL: psubus_i16_i32_max_swapped:
 ; SSSE3: # %bb.0: # %vector.ph
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pxor %xmm4, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
-; SSSE3-NEXT: movdqa %xmm6, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535]
-; SSSE3-NEXT: pand %xmm7, %xmm2
-; SSSE3-NEXT: pandn %xmm5, %xmm7
-; SSSE3-NEXT: por %xmm2, %xmm7
-; SSSE3-NEXT: pshufb %xmm3, %xmm7
-; SSSE3-NEXT: pxor %xmm1, %xmm4
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pxor %xmm3, %xmm4
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: pand %xmm6, %xmm1
-; SSSE3-NEXT: pandn %xmm5, %xmm6
-; SSSE3-NEXT: por %xmm1, %xmm6
-; SSSE3-NEXT: pshufb %xmm3, %xmm6
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; SSSE3-NEXT: psubusw %xmm6, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
+; SSSE3-NEXT: pand %xmm6, %xmm2
+; SSSE3-NEXT: pandn %xmm4, %xmm6
+; SSSE3-NEXT: por %xmm2, %xmm6
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm2, %xmm6
+; SSSE3-NEXT: pxor %xmm1, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pandn %xmm4, %xmm5
+; SSSE3-NEXT: por %xmm1, %xmm5
+; SSSE3-NEXT: pshufb %xmm2, %xmm5
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
+; SSSE3-NEXT: psubusw %xmm5, %xmm0
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: psubus_i16_i32_max_swapped:
diff --git a/llvm/test/CodeGen/X86/usub_sat.ll b/llvm/test/CodeGen/X86/usub_sat.ll
--- a/llvm/test/CodeGen/X86/usub_sat.ll
+++ b/llvm/test/CodeGen/X86/usub_sat.ll
@@ -97,26 +97,20 @@
 define zeroext i4 @func3(i4 zeroext %x, i4 zeroext %y) nounwind {
 ; X86-LABEL: func3:
 ; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
-; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT: movzbl %cl, %edx
-; X86-NEXT: movzbl %al, %ebx
-; X86-NEXT: cmpb %al, %cl
-; X86-NEXT: cmoval %edx, %ebx
-; X86-NEXT: subb %al, %bl
-; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: subb {{[0-9]+}}(%esp), %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: cmovbl %ecx, %eax
 ; X86-NEXT: andl $15, %eax
-; X86-NEXT: popl %ebx
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: func3:
 ; X64: # %bb.0:
-; X64-NEXT: cmpb %sil, %dil
-; X64-NEXT: movl %esi, %eax
-; X64-NEXT: cmoval %edi, %eax
-; X64-NEXT: subb %sil, %al
-; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: subb %sil, %dil
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: cmovbl %ecx, %eax
 ; X64-NEXT: andl $15, %eax
 ; X64-NEXT: retq
   %tmp = call i4 @llvm.usub.sat.i4(i4 %x, i4 %y)
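Note (illustrative, not part of the patch): the two patterns the new generic combine matches correspond to IR along these lines; the function names and the v8i16 type here are assumed for the example. On a target where UMAX/UMIN and USUBSAT are legal for v8i16 (e.g. x86-64 with SSE4.1), both functions should now lower to a single saturating subtract via the generic fold rather than the removed X86-specific matcher.

; sub(umax(a, b), b) -> usubsat(a, b)
define <8 x i16> @sub_of_umax(<8 x i16> %a, <8 x i16> %b) {
  %cmp = icmp ugt <8 x i16> %a, %b
  %max = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b
  %sub = sub <8 x i16> %max, %b
  ret <8 x i16> %sub
}

; a - umin(a, b) -> usubsat(a, b)
define <8 x i16> @sub_of_umin(<8 x i16> %a, <8 x i16> %b) {
  %cmp = icmp ult <8 x i16> %a, %b
  %min = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b
  %sub = sub <8 x i16> %a, %min
  ret <8 x i16> %sub
}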