Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -40495,6 +40495,45 @@
                           PMADDBuilder);
 }
 
+// Try to turn (add (umax X, C), -C) into (psubus X, C); umax(X, C) - C equals
+// the unsigned saturating subtract of C from X. Returns SDValue() on no match.
+static SDValue combineAddToSUBUS(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget &Subtarget) {
+  if (!Subtarget.hasSSE2())
+    return SDValue();
+
+  // psubus is available in SSE2 for i8 and i16 vectors.
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector() || VT.getVectorNumElements() < 2 ||
+      !isPowerOf2_32(VT.getVectorNumElements()) ||
+      !(VT.getVectorElementType() == MVT::i8 ||
+        VT.getVectorElementType() == MVT::i16))
+    return SDValue();
+
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  if (Op0.getOpcode() != ISD::UMAX)
+    return SDValue();
+
+  // The add should have a constant that is the negative of the max.
+  auto MatchSUBUS = [](ConstantSDNode *Max, ConstantSDNode *Op) {
+    return Max->getAPIntValue() == (-Op->getAPIntValue());
+  };
+  if (!ISD::matchBinaryPredicate(Op0.getOperand(1), Op1, MatchSUBUS))
+    return SDValue();
+
+  auto SUBUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+                         ArrayRef<SDValue> Ops) {
+    return DAG.getNode(X86ISD::SUBUS, DL, Ops[0].getValueType(), Ops);
+  };
+
+  // Take both operands from the umax node; the add's -C constant is implied.
+  SDLoc DL(N);
+  return SplitOpsAndApply(DAG, Subtarget, DL, VT,
+                          { Op0.getOperand(0), Op0.getOperand(1) },
+                          SUBUSBuilder);
+}
+
 // Attempt to turn this pattern into PMADDWD.
 // (mul (add (zext (build_vector)), (zext (build_vector))),
 // (add (zext (build_vector)), (zext (build_vector)))
@@ -40650,6 +40689,9 @@
   if (SDValue V = combineIncDecVector(N, DAG))
     return V;
 
+  if (SDValue V = combineAddToSUBUS(N, DAG, Subtarget))
+    return V;
+
   return combineAddOrSubToADCOrSBB(N, DAG);
 }
 
Index: test/CodeGen/X86/psubus.ll
===================================================================
--- test/CodeGen/X86/psubus.ll
+++ test/CodeGen/X86/psubus.ll
@@ -2411,3 +2411,37 @@
   ret void
 }
 
+define <16 x i8> @test19(<16 x i8> %x) {
+; SSE-LABEL: test19:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test19:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+entry:
+  %0 = icmp ugt <16 x i8> %x,
+  %1 = select <16 x i1> %0, <16 x i8> %x, <16 x i8>
+  %2 = add <16 x i8> %1,
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @test20(<16 x i8> %x) {
+; SSE-LABEL: test20:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test20:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+entry:
+  %0 = icmp ugt <16 x i8> %x,
+  %1 = select <16 x i1> %0, <16 x i8> %x, <16 x i8>
+  %2 = add <16 x i8> %1,
+  ret <16 x i8> %2
+}
+