Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -35474,6 +35474,92 @@
   return combineAddOrSubToADCOrSBB(N, DAG);
 }
 
+/// Try to fold a vector subtraction of the form umax(a, b) - b or
+/// a - umin(a, b) into an unsigned saturating subtract, X86ISD::SUBUS(a, b).
+///
+/// For v8i32 the fold is only done when a is a zero extension from v8i16: b is
+/// clamped to the 16-bit range with umin, both operands are narrowed to v8i16,
+/// and the SUBUS result is zero extended back to v8i32.
+///
+/// Returns SDValue() (no replacement) if the type is not handled for the
+/// current subtarget or if neither pattern matches.
+static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget &Subtarget) {
+  // TODO: Need to add IR canonicalization for this code.
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+
+  // PSUBUS is supported, starting from SSE2, but special preprocessing
+  // for v8i32 requires umin, which appears in SSE41.
+  if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) &&
+      !(Subtarget.hasSSE41() && (VT == MVT::v8i32)) &&
+      !(Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))
+    return SDValue();
+
+  SDValue SubusLHS, SubusRHS;
+  // Try to find umax(a,b) - b or a - umin(a,b) patterns;
+  // they may be converted to subus(a,b).
+  if (Op0.getOpcode() == ISD::UMAX) {
+    SubusRHS = Op1;
+    SDValue MaxLHS = Op0.getOperand(0);
+    SDValue MaxRHS = Op0.getOperand(1);
+    if (DAG.isEqualTo(MaxLHS, Op1)) {
+      SubusLHS = MaxRHS;
+    } else if (DAG.isEqualTo(MaxRHS, Op1)) {
+      SubusLHS = MaxLHS;
+    } else {
+      return SDValue();
+    }
+  } else if (Op1.getOpcode() == ISD::UMIN) {
+    SubusLHS = Op0;
+    SDValue MinLHS = Op1.getOperand(0);
+    SDValue MinRHS = Op1.getOperand(1);
+    if (DAG.isEqualTo(MinLHS, Op0)) {
+      SubusRHS = MinRHS;
+    } else if (DAG.isEqualTo(MinRHS, Op0)) {
+      SubusRHS = MinLHS;
+    } else {
+      return SDValue();
+    }
+  } else {
+    return SDValue();
+  }
+
+  // PSUBUS doesn't support v8i32, but it can be enabled with special
+  // preprocessing in some cases.
+  if (VT != MVT::v8i32) {
+    return DAG.getNode(X86ISD::SUBUS, SDLoc(N), VT, SubusLHS, SubusRHS);
+  }
+
+  // The special preprocessing for the v8i32 case can only be applied
+  // if the value was zero extended from 16 bit.
+  if (SubusLHS.getOpcode() != ISD::ZERO_EXTEND) {
+    return SDValue();
+  }
+  EVT ExtType = SubusLHS.getValueType();
+  EVT ShrinkedType = SubusLHS.getOperand(0).getValueType();
+  if (ShrinkedType != MVT::v8i16)
+    return SDValue();
+  // SubusLHS is zero extended from ShrinkedType, so clamp SubusRHS to that
+  // type's range before truncating it: SubusRHS = umin(0xFFF.., SubusRHS).
+  // Clamp SubusRHS rather than Op1: in the a - umin(a,b) form Op1 is the umin
+  // node itself, and clamping it would only stack a second umin on top.
+  SDValue SaturationConst =
+      DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
+                                           ShrinkedType.getScalarSizeInBits()),
+                      SDLoc(SubusLHS), ExtType);
+  SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
+                             SaturationConst);
+  SDValue NewSubusLHS = SubusLHS.getOperand(0);
+  SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
+  SDValue Psubus = DAG.getNode(X86ISD::SUBUS, SDLoc(N), ShrinkedType,
+                               NewSubusLHS, NewSubusRHS);
+  // Zero extend the result; it may be used somewhere as 32 bit. If not, the
+  // zext and the following trunc will shrink.
+  return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
+}
+
 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
   SDValue Op0 = N->getOperand(0);
@@ -35507,6 +35593,10 @@
   if (SDValue V = combineIncDecVector(N, DAG))
     return V;
 
+  // Try to create PSUBUS if SUB's argument is max/min
+  if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
+    return V;
+
   return combineAddOrSubToADCOrSBB(N, DAG);
 }
 
Index: test/CodeGen/X86/psubus.ll =================================================================== --- test/CodeGen/X86/psubus.ll +++ test/CodeGen/X86/psubus.ll @@ -10,11 +10,13 @@ ; SSE: ## BB#0: ## %vector.ph ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq +; SSE-NEXT: ## -- End function ; ; AVX-LABEL: test1: ; AVX: ## BB#0: ## %vector.ph ; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq +; AVX-NEXT: ## -- End function vector.ph: %0 = icmp slt <8 x i16> %x, zeroinitializer %1 = xor <8 x i16> %x, @@ -27,11 +29,13 @@ ; SSE: ## BB#0: ## %vector.ph ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0 ; SSE-NEXT: 
retq +; SSE-NEXT: ## -- End function ; ; AVX-LABEL: test2: ; AVX: ## BB#0: ## %vector.ph ; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq +; AVX-NEXT: ## -- End function vector.ph: %0 = icmp ugt <8 x i16> %x, %1 = add <8 x i16> %x, @@ -47,6 +51,7 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: psubusw %xmm1, %xmm0 ; SSE-NEXT: retq +; SSE-NEXT: ## -- End function ; ; AVX1-LABEL: test3: ; AVX1: ## BB#0: ## %vector.ph @@ -55,6 +60,7 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq +; AVX1-NEXT: ## -- End function ; ; AVX2-LABEL: test3: ; AVX2: ## BB#0: ## %vector.ph @@ -62,6 +68,7 @@ ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq +; AVX2-NEXT: ## -- End function vector.ph: %0 = insertelement <8 x i16> undef, i16 %w, i32 0 %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer @@ -76,11 +83,13 @@ ; SSE: ## BB#0: ## %vector.ph ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq +; SSE-NEXT: ## -- End function ; ; AVX-LABEL: test4: ; AVX: ## BB#0: ## %vector.ph ; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq +; AVX-NEXT: ## -- End function vector.ph: %0 = icmp slt <16 x i8> %x, zeroinitializer %1 = xor <16 x i8> %x, @@ -93,11 +102,13 @@ ; SSE: ## BB#0: ## %vector.ph ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq +; SSE-NEXT: ## -- End function ; ; AVX-LABEL: test5: ; AVX: ## BB#0: ## %vector.ph ; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq +; AVX-NEXT: ## -- End function vector.ph: %0 = icmp ugt <16 x i8> %x, %1 = add <16 x i8> %x, @@ -114,6 +125,7 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE2-NEXT: psubusb %xmm1, %xmm0 ; SSE2-NEXT: retq +; SSE2-NEXT: ## -- End function ; ; SSSE3-LABEL: test6: ; SSSE3: ## BB#0: ## %vector.ph @@ -122,6 +134,7 @@ ; SSSE3-NEXT: pshufb %xmm2, %xmm1 ; SSSE3-NEXT: psubusb %xmm1, %xmm0 ; SSSE3-NEXT: retq 
+; SSSE3-NEXT: ## -- End function ; ; SSE41-LABEL: test6: ; SSE41: ## BB#0: ## %vector.ph @@ -130,6 +143,7 @@ ; SSE41-NEXT: pshufb %xmm2, %xmm1 ; SSE41-NEXT: psubusb %xmm1, %xmm0 ; SSE41-NEXT: retq +; SSE41-NEXT: ## -- End function ; ; AVX1-LABEL: test6: ; AVX1: ## BB#0: ## %vector.ph @@ -138,6 +152,7 @@ ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq +; AVX1-NEXT: ## -- End function ; ; AVX2-LABEL: test6: ; AVX2: ## BB#0: ## %vector.ph @@ -145,6 +160,7 @@ ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq +; AVX2-NEXT: ## -- End function vector.ph: %0 = insertelement <16 x i8> undef, i8 %w, i32 0 %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer @@ -161,6 +177,7 @@ ; SSE-NEXT: psubusw %xmm2, %xmm0 ; SSE-NEXT: psubusw %xmm2, %xmm1 ; SSE-NEXT: retq +; SSE-NEXT: ## -- End function ; ; AVX1-LABEL: test7: ; AVX1: ## BB#0: ## %vector.ph @@ -172,11 +189,13 @@ ; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq +; AVX1-NEXT: ## -- End function ; ; AVX2-LABEL: test7: ; AVX2: ## BB#0: ## %vector.ph ; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq +; AVX2-NEXT: ## -- End function vector.ph: %0 = icmp slt <16 x i16> %x, zeroinitializer %1 = xor <16 x i16> %x, @@ -191,6 +210,7 @@ ; SSE-NEXT: psubusw %xmm2, %xmm0 ; SSE-NEXT: psubusw %xmm2, %xmm1 ; SSE-NEXT: retq +; SSE-NEXT: ## -- End function ; ; AVX1-LABEL: test8: ; AVX1: ## BB#0: ## %vector.ph @@ -208,11 +228,13 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm0, %ymm2, %ymm0 ; AVX1-NEXT: retq +; AVX1-NEXT: ## -- End function ; ; AVX2-LABEL: test8: ; AVX2: ## BB#0: ## %vector.ph ; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq +; AVX2-NEXT: ## -- End function vector.ph: %0 = icmp ugt <16 x i16> %x, %1 = add <16 x i16> %x, @@ -229,6 +251,7 @@ ; SSE-NEXT: psubusw %xmm2, %xmm0 ; 
SSE-NEXT: psubusw %xmm2, %xmm1 ; SSE-NEXT: retq +; SSE-NEXT: ## -- End function ; ; AVX1-LABEL: test9: ; AVX1: ## BB#0: ## %vector.ph @@ -246,6 +269,7 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: retq +; AVX1-NEXT: ## -- End function ; ; AVX2-LABEL: test9: ; AVX2: ## BB#0: ## %vector.ph @@ -253,6 +277,7 @@ ; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1 ; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq +; AVX2-NEXT: ## -- End function vector.ph: %0 = insertelement <16 x i16> undef, i16 %w, i32 0 %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer @@ -269,6 +294,7 @@ ; SSE-NEXT: psubusb %xmm2, %xmm0 ; SSE-NEXT: psubusb %xmm2, %xmm1 ; SSE-NEXT: retq +; SSE-NEXT: ## -- End function ; ; AVX1-LABEL: test10: ; AVX1: ## BB#0: ## %vector.ph @@ -280,11 +306,13 @@ ; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq +; AVX1-NEXT: ## -- End function ; ; AVX2-LABEL: test10: ; AVX2: ## BB#0: ## %vector.ph ; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq +; AVX2-NEXT: ## -- End function vector.ph: %0 = icmp slt <32 x i8> %x, zeroinitializer %1 = xor <32 x i8> %x, @@ -299,6 +327,7 @@ ; SSE-NEXT: psubusb %xmm2, %xmm0 ; SSE-NEXT: psubusb %xmm2, %xmm1 ; SSE-NEXT: retq +; SSE-NEXT: ## -- End function ; ; AVX1-LABEL: test11: ; AVX1: ## BB#0: ## %vector.ph @@ -316,11 +345,13 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm0, %ymm2, %ymm0 ; AVX1-NEXT: retq +; AVX1-NEXT: ## -- End function ; ; AVX2-LABEL: test11: ; AVX2: ## BB#0: ## %vector.ph ; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq +; AVX2-NEXT: ## -- End function vector.ph: %0 = icmp ugt <32 x i8> %x, %1 = add <32 x i8> %x, @@ -338,6 +369,7 @@ ; SSE2-NEXT: psubusb %xmm2, %xmm0 ; SSE2-NEXT: psubusb %xmm2, %xmm1 ; SSE2-NEXT: retq +; SSE2-NEXT: ## -- End function ; ; SSSE3-LABEL: test12: ; SSSE3: ## BB#0: ## %vector.ph 
@@ -347,6 +379,7 @@ ; SSSE3-NEXT: psubusb %xmm2, %xmm0 ; SSSE3-NEXT: psubusb %xmm2, %xmm1 ; SSSE3-NEXT: retq +; SSSE3-NEXT: ## -- End function ; ; SSE41-LABEL: test12: ; SSE41: ## BB#0: ## %vector.ph @@ -356,6 +389,7 @@ ; SSE41-NEXT: psubusb %xmm2, %xmm0 ; SSE41-NEXT: psubusb %xmm2, %xmm1 ; SSE41-NEXT: retq +; SSE41-NEXT: ## -- End function ; ; AVX1-LABEL: test12: ; AVX1: ## BB#0: ## %vector.ph @@ -373,6 +407,7 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: retq +; AVX1-NEXT: ## -- End function ; ; AVX2-LABEL: test12: ; AVX2: ## BB#0: ## %vector.ph @@ -380,6 +415,7 @@ ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq +; AVX2-NEXT: ## -- End function vector.ph: %0 = insertelement <32 x i8> undef, i8 %w, i32 0 %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer @@ -422,6 +458,7 @@ ; SSE2-NEXT: pandn %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq +; SSE2-NEXT: ## -- End function ; ; SSSE3-LABEL: test13: ; SSSE3: ## BB#0: ## %vector.ph @@ -451,6 +488,7 @@ ; SSSE3-NEXT: pandn %xmm3, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, %xmm0 ; SSSE3-NEXT: retq +; SSSE3-NEXT: ## -- End function ; ; SSE41-LABEL: test13: ; SSE41: ## BB#0: ## %vector.ph @@ -478,6 +516,7 @@ ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; SSE41-NEXT: pandn %xmm3, %xmm0 ; SSE41-NEXT: retq +; SSE41-NEXT: ## -- End function ; ; AVX1-LABEL: test13: ; AVX1: ## BB#0: ## %vector.ph @@ -502,11 +541,12 @@ ; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq +; AVX1-NEXT: ## -- End function ; ; AVX2-LABEL: test13: ; AVX2: ## BB#0: ## %vector.ph ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = 
[2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpcmpgtd %ymm2, %ymm3, %ymm2 @@ -518,6 +558,7 @@ ; AVX2-NEXT: vpandn %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq +; AVX2-NEXT: ## -- End function vector.ph: %lhs = zext <8 x i16> %x to <8 x i32> %cond = icmp ult <8 x i32> %lhs, %y @@ -579,6 +620,7 @@ ; SSE2-NEXT: packuswb %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: retq +; SSE2-NEXT: ## -- End function ; ; SSSE3-LABEL: test14: ; SSSE3: ## BB#0: ## %vector.ph @@ -634,6 +676,7 @@ ; SSSE3-NEXT: packuswb %xmm3, %xmm1 ; SSSE3-NEXT: andnpd %xmm1, %xmm0 ; SSSE3-NEXT: retq +; SSSE3-NEXT: ## -- End function ; ; SSE41-LABEL: test14: ; SSE41: ## BB#0: ## %vector.ph @@ -687,6 +730,7 @@ ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq +; SSE41-NEXT: ## -- End function ; ; AVX1-LABEL: test14: ; AVX1: ## BB#0: ## %vector.ph @@ -731,13 +775,14 @@ ; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq +; AVX1-NEXT: ## -- End function ; ; AVX2-LABEL: test14: ; AVX2: ## BB#0: ## %vector.ph ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] ; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm5 ; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm6 ; AVX2-NEXT: 
vpcmpgtd %ymm5, %ymm6, %ymm5 @@ -764,6 +809,7 @@ ; AVX2-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq +; AVX2-NEXT: ## -- End function vector.ph: %rhs = zext <16 x i8> %x to <16 x i32> %cond = icmp ult <16 x i32> %y, %rhs @@ -806,6 +852,7 @@ ; SSE2-NEXT: pand %xmm4, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm0 ; SSE2-NEXT: retq +; SSE2-NEXT: ## -- End function ; ; SSSE3-LABEL: test15: ; SSSE3: ## BB#0: ## %vector.ph @@ -834,6 +881,7 @@ ; SSSE3-NEXT: pand %xmm4, %xmm3 ; SSSE3-NEXT: movdqa %xmm3, %xmm0 ; SSSE3-NEXT: retq +; SSSE3-NEXT: ## -- End function ; ; SSE41-LABEL: test15: ; SSE41: ## BB#0: ## %vector.ph @@ -860,6 +908,7 @@ ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: retq +; SSE41-NEXT: ## -- End function ; ; AVX1-LABEL: test15: ; AVX1: ## BB#0: ## %vector.ph @@ -884,11 +933,12 @@ ; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq +; AVX1-NEXT: ## -- End function ; ; AVX2-LABEL: test15: ; AVX2: ## BB#0: ## %vector.ph ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpcmpgtd %ymm3, %ymm2, %ymm2 @@ -900,6 +950,7 @@ ; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq +; AVX2-NEXT: ## -- End function vector.ph: %lhs = zext <8 x i16> %x to <8 x i32> %cond = icmp ugt <8 x i32> %lhs, %y @@ -942,6 +993,7 @@ ; SSE2-NEXT: pand %xmm4, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm0 ; SSE2-NEXT: retq +; SSE2-NEXT: ## -- End function ; ; SSSE3-LABEL: test16: ; SSSE3: ## BB#0: ## %vector.ph @@ -970,6 +1022,7 @@ ; SSSE3-NEXT: pand %xmm4, %xmm3 ; SSSE3-NEXT: movdqa %xmm3, %xmm0 ; 
SSSE3-NEXT: retq +; SSSE3-NEXT: ## -- End function ; ; SSE41-LABEL: test16: ; SSE41: ## BB#0: ## %vector.ph @@ -996,6 +1049,7 @@ ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: retq +; SSE41-NEXT: ## -- End function ; ; AVX1-LABEL: test16: ; AVX1: ## BB#0: ## %vector.ph @@ -1020,11 +1074,12 @@ ; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq +; AVX1-NEXT: ## -- End function ; ; AVX2-LABEL: test16: ; AVX2: ## BB#0: ## %vector.ph ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpcmpgtd %ymm3, %ymm2, %ymm2 @@ -1036,6 +1091,7 @@ ; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq +; AVX2-NEXT: ## -- End function vector.ph: %lhs = zext <8 x i16> %x to <8 x i32> %cond = icmp ult <8 x i32> %y, %lhs @@ -1044,3 +1100,553 @@ %res = select <8 x i1> %cond, <8 x i16> %truncsub, <8 x i16> zeroinitializer ret <8 x i16> %res } + +; Test converting max(x,y) - y pattern to psubus, where x and y are 8 x i16 +define <8 x i16> @psubus_8i16_max(<8 x i16> %x, <8 x i16> %y) nounwind { +; SSE2-LABEL: psubus_8i16_max: +; SSE2: ## BB#0: ## %vector.ph +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: psubw %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; SSE2-NEXT: ## -- End function +; +; 
SSSE3-LABEL: psubus_8i16_max: +; SSSE3: ## BB#0: ## %vector.ph +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: pxor %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pandn %xmm0, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: psubw %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; SSSE3-NEXT: ## -- End function +; +; SSE41-LABEL: psubus_8i16_max: +; SSE41: ## BB#0: ## %vector.ph +; SSE41-NEXT: psubusw %xmm1, %xmm0 +; SSE41-NEXT: retq +; SSE41-NEXT: ## -- End function +; +; AVX-LABEL: psubus_8i16_max: +; AVX: ## BB#0: ## %vector.ph +; AVX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; AVX-NEXT: ## -- End function +vector.ph: + %cmp = icmp ult <8 x i16> %x, %y + %max = select <8 x i1> %cmp, <8 x i16> %y, <8 x i16> %x + %res = sub <8 x i16> %max, %y + ret <8 x i16> %res +} + +; Test converting max(x,y) - y pattern to psubus, where x and y are 16 x i8 +define <16 x i8> @psubus_16i8_max(<16 x i8> %x, <16 x i8> %y) nounwind { +; SSE-LABEL: psubus_16i8_max: +; SSE: ## BB#0: ## %vector.ph +; SSE-NEXT: psubusb %xmm1, %xmm0 +; SSE-NEXT: retq +; SSE-NEXT: ## -- End function +; +; AVX-LABEL: psubus_16i8_max: +; AVX: ## BB#0: ## %vector.ph +; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; AVX-NEXT: ## -- End function +vector.ph: + %cmp = icmp ult <16 x i8> %x, %y + %max = select <16 x i1> %cmp, <16 x i8> %y, <16 x i8> %x + %res = sub <16 x i8> %max, %y + ret <16 x i8> %res +} + +; Test converting max(x,y) - y pattern to psubus, where x and y are 16 x i16 +; Tests don't convert to psubus on sse2, because ISD umin/umax SDNode is not +; generated there. 
+define <16 x i16> @psubus_16i16_max(<16 x i16> %x, <16 x i16> %y) nounwind { +; SSE2-LABEL: psubus_16i16_max: +; SSE2: ## BB#0: ## %vector.ph +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm6, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pxor %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: psubw %xmm2, %xmm5 +; SSE2-NEXT: psubw %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: retq +; SSE2-NEXT: ## -- End function +; +; SSSE3-LABEL: psubus_16i16_max: +; SSSE3: ## BB#0: ## %vector.ph +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSSE3-NEXT: movdqa %xmm0, %xmm6 +; SSSE3-NEXT: pxor %xmm4, %xmm6 +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: pcmpgtw %xmm6, %xmm5 +; SSSE3-NEXT: movdqa %xmm1, %xmm6 +; SSSE3-NEXT: pxor %xmm4, %xmm6 +; SSSE3-NEXT: pxor %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtw %xmm6, %xmm4 +; SSSE3-NEXT: movdqa %xmm3, %xmm6 +; SSSE3-NEXT: pand %xmm4, %xmm6 +; SSSE3-NEXT: pandn %xmm1, %xmm4 +; SSSE3-NEXT: por %xmm6, %xmm4 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pandn %xmm0, %xmm5 +; SSSE3-NEXT: por %xmm1, %xmm5 +; SSSE3-NEXT: psubw %xmm2, %xmm5 +; SSSE3-NEXT: psubw %xmm3, %xmm4 +; SSSE3-NEXT: movdqa %xmm5, %xmm0 +; SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSSE3-NEXT: retq +; SSSE3-NEXT: ## -- End function +; +; SSE41-LABEL: psubus_16i16_max: +; SSE41: ## BB#0: ## %vector.ph +; SSE41-NEXT: psubusw 
%xmm2, %xmm0 +; SSE41-NEXT: psubusw %xmm3, %xmm1 +; SSE41-NEXT: retq +; SSE41-NEXT: ## -- End function +; +; AVX1-LABEL: psubus_16i16_max: +; AVX1: ## BB#0: ## %vector.ph +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsubusw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; AVX1-NEXT: ## -- End function +; +; AVX2-LABEL: psubus_16i16_max: +; AVX2: ## BB#0: ## %vector.ph +; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; AVX2-NEXT: ## -- End function +vector.ph: + %cmp = icmp ult <16 x i16> %x, %y + %max = select <16 x i1> %cmp, <16 x i16> %y, <16 x i16> %x + %res = sub <16 x i16> %max, %y + ret <16 x i16> %res +} + +; Test converting max(x,y) - y pattern to psubus, where x and y are 32 x i8 +define <32 x i8> @psubus_32i8_max(<32 x i8> %x, <32 x i8> %y) nounwind { +; SSE-LABEL: psubus_32i8_max: +; SSE: ## BB#0: ## %vector.ph +; SSE-NEXT: psubusb %xmm2, %xmm0 +; SSE-NEXT: psubusb %xmm3, %xmm1 +; SSE-NEXT: retq +; SSE-NEXT: ## -- End function +; +; AVX1-LABEL: psubus_32i8_max: +; AVX1: ## BB#0: ## %vector.ph +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsubusb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; AVX1-NEXT: ## -- End function +; +; AVX2-LABEL: psubus_32i8_max: +; AVX2: ## BB#0: ## %vector.ph +; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; AVX2-NEXT: ## -- End function +vector.ph: + %cmp = icmp ult <32 x i8> %x, %y + %max = select <32 x i1> %cmp, <32 x i8> %y, <32 x i8> %x + %res = sub <32 x i8> %max, %y + ret <32 x i8> %res +} + +; Test converting max(lhs,y) - y pattern to psubus(lhs,min(y, 0xFFFF)), where y is i32 and lhs is zero extended i16 +define <8 x i16> @psubus_i16_i32_max(<8 x i16> %x, <8 x i32> %y) nounwind { +; SSE2-LABEL: 
psubus_i16_i32_max: +; SSE2: ## BB#0: ## %vector.ph +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: pxor %xmm5, %xmm6 +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pandn %xmm4, %xmm6 +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm0, %xmm4 +; SSE2-NEXT: pandn %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: psubd %xmm1, %xmm0 +; SSE2-NEXT: psubd %xmm2, %xmm6 +; SSE2-NEXT: pslld $16, %xmm6 +; SSE2-NEXT: psrad $16, %xmm6 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm6, %xmm0 +; SSE2-NEXT: retq +; SSE2-NEXT: ## -- End function +; +; SSSE3-LABEL: psubus_i16_i32_max: +; SSSE3: ## BB#0: ## %vector.ph +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm5, %xmm0 +; SSSE3-NEXT: movdqa %xmm3, %xmm6 +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm6 +; SSSE3-NEXT: pxor %xmm5, %xmm6 +; 
SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pand %xmm6, %xmm5 +; SSSE3-NEXT: pandn %xmm4, %xmm6 +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pand %xmm0, %xmm4 +; SSSE3-NEXT: pandn %xmm3, %xmm0 +; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: psubd %xmm1, %xmm0 +; SSSE3-NEXT: psubd %xmm2, %xmm6 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm1, %xmm6 +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSSE3-NEXT: retq +; SSSE3-NEXT: ## -- End function +; +; SSE41-LABEL: psubus_i16_i32_max: +; SSE41: ## BB#0: ## %vector.ph +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] +; SSE41-NEXT: pminud %xmm4, %xmm2 +; SSE41-NEXT: pshufb %xmm3, %xmm2 +; SSE41-NEXT: pminud %xmm4, %xmm1 +; SSE41-NEXT: pshufb %xmm3, %xmm1 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE41-NEXT: psubusw %xmm1, %xmm0 +; SSE41-NEXT: retq +; SSE41-NEXT: ## -- End function +; +; AVX1-LABEL: psubus_i16_i32_max: +; AVX1: ## BB#0: ## %vector.ph +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] +; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX1-NEXT: vpminud %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: ## -- End function +; +; AVX2-LABEL: psubus_i16_i32_max: +; AVX2: ## BB#0: ## %vector.ph +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpshufb 
{{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: ## -- End function +vector.ph: + %lhs = zext <8 x i16> %x to <8 x i32> + %cond = icmp ult <8 x i32> %lhs, %y + %max = select <8 x i1> %cond, <8 x i32> %y, <8 x i32> %lhs + %sub = sub <8 x i32> %max, %y + %res = trunc <8 x i32> %sub to <8 x i16> + ret <8 x i16> %res +} + +; Test converting trunc(max(y,lhs) - y) pattern to psubus(lhs,min(y, 0xFFFF)), where y is i32 and lhs is zero extended i16 +define <8 x i16> @psubus_i16_i32_max_swapped(<8 x i16> %x, <8 x i32> %y) nounwind { +; SSE2-LABEL: psubus_i16_i32_max_swapped: +; SSE2: ## BB#0: ## %vector.ph +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: psubd %xmm1, %xmm0 +; SSE2-NEXT: psubd %xmm2, %xmm4 +; SSE2-NEXT: pslld $16, %xmm4 +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm4, %xmm0 +; SSE2-NEXT: retq +; SSE2-NEXT: ## -- End function +; +; SSSE3-LABEL: 
psubus_i16_i32_max_swapped: +; SSSE3: ## BB#0: ## %vector.ph +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: movdqa %xmm3, %xmm5 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm1, %xmm6 +; SSSE3-NEXT: pxor %xmm4, %xmm6 +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm6 +; SSSE3-NEXT: pxor %xmm4, %xmm6 +; SSSE3-NEXT: por %xmm5, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm5 +; SSSE3-NEXT: pandn %xmm2, %xmm4 +; SSSE3-NEXT: por %xmm5, %xmm4 +; SSSE3-NEXT: pand %xmm0, %xmm3 +; SSSE3-NEXT: pandn %xmm1, %xmm0 +; SSSE3-NEXT: por %xmm3, %xmm0 +; SSSE3-NEXT: psubd %xmm1, %xmm0 +; SSSE3-NEXT: psubd %xmm2, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm1, %xmm4 +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSSE3-NEXT: retq +; SSSE3-NEXT: ## -- End function +; +; SSE41-LABEL: psubus_i16_i32_max_swapped: +; SSE41: ## BB#0: ## %vector.ph +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] +; SSE41-NEXT: pminud %xmm4, %xmm2 +; SSE41-NEXT: pshufb %xmm3, %xmm2 +; SSE41-NEXT: pminud %xmm4, %xmm1 +; SSE41-NEXT: pshufb %xmm3, %xmm1 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE41-NEXT: psubusw %xmm1, %xmm0 +; SSE41-NEXT: retq +; SSE41-NEXT: ## -- End function +; +; AVX1-LABEL: psubus_i16_i32_max_swapped: +; AVX1: ## BB#0: ## %vector.ph +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vextractf128 $1, 
%ymm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] +; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX1-NEXT: vpminud %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: ## -- End function +; +; AVX2-LABEL: psubus_i16_i32_max_swapped: +; AVX2: ## BB#0: ## %vector.ph +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: ## -- End function +vector.ph: + %lhs = zext <8 x i16> %x to <8 x i32> + %cond = icmp ult <8 x i32> %y, %lhs + %max = select <8 x i1> %cond, <8 x i32> %lhs, <8 x i32> %y + %sub = sub <8 x i32> %max, %y + %res = trunc <8 x i32> %sub to <8 x i16> + ret <8 x i16> %res +} + +; Test converting the trunc(lhs - umin(lhs, y)) pattern to psubusw on the original i16 operand with the subtrahend clamped to 0xFFFF by an unsigned min, where y is <8 x i32> and lhs is %x zero-extended from <8 x i16>; the SSE41/AVX1/AVX2 checks expect psubusw, while the SSE2/SSSE3 checks keep the psubd sequence +define <8 x i16> @psubus_i16_i32_min(<8 x i16> %x, <8 x i32> %y) nounwind { +; SSE2-LABEL: psubus_i16_i32_min: +; SSE2: ## BB#0: ## %vector.ph +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: por %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 +; SSE2-NEXT: 
movdqa %xmm2, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm1, %xmm5 +; SSE2-NEXT: por %xmm2, %xmm5 +; SSE2-NEXT: psubd %xmm5, %xmm0 +; SSE2-NEXT: psubd %xmm6, %xmm3 +; SSE2-NEXT: pslld $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm3, %xmm0 +; SSE2-NEXT: retq +; SSE2-NEXT: ## -- End function +; +; SSSE3-LABEL: psubus_i16_i32_min: +; SSSE3: ## BB#0: ## %vector.ph +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm1, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: movdqa %xmm0, %xmm6 +; SSSE3-NEXT: por %xmm4, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 +; SSSE3-NEXT: movdqa %xmm2, %xmm6 +; SSSE3-NEXT: pxor %xmm4, %xmm6 +; SSSE3-NEXT: por %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pand %xmm6, %xmm4 +; SSSE3-NEXT: pandn %xmm2, %xmm6 +; SSSE3-NEXT: por %xmm4, %xmm6 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pandn %xmm1, %xmm5 +; SSSE3-NEXT: por %xmm2, %xmm5 +; SSSE3-NEXT: psubd %xmm5, %xmm0 +; SSSE3-NEXT: psubd %xmm6, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm1, %xmm3 +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSSE3-NEXT: retq +; SSSE3-NEXT: ## -- End function +; 
+; SSE41-LABEL: psubus_i16_i32_min: +; SSE41: ## BB#0: ## %vector.ph +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535] +; SSE41-NEXT: pminud %xmm1, %xmm4 +; SSE41-NEXT: pminud %xmm5, %xmm4 +; SSE41-NEXT: pshufb %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: pminud %xmm2, %xmm1 +; SSE41-NEXT: pminud %xmm5, %xmm1 +; SSE41-NEXT: pshufb %xmm3, %xmm1 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE41-NEXT: psubusw %xmm4, %xmm0 +; SSE41-NEXT: retq +; SSE41-NEXT: ## -- End function +; +; AVX1-LABEL: psubus_i16_i32_min: +; AVX1: ## BB#0: ## %vector.ph +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] +; AVX1-NEXT: vpminud %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpminud %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpminud %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: ## -- End function +; +; AVX2-LABEL: psubus_i16_i32_min: +; AVX2: ## BB#0: ## %vector.ph +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = 
[65535,65535,65535,65535,65535,65535,65535,65535] +; AVX2-NEXT: vpminud %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: ## -- End function +vector.ph: + %lhs = zext <8 x i16> %x to <8 x i32> + %cond = icmp ult <8 x i32> %lhs, %y + %min = select <8 x i1> %cond, <8 x i32> %lhs, <8 x i32> %y + %sub = sub <8 x i32> %lhs, %min + %res = trunc <8 x i32> %sub to <8 x i16> + ret <8 x i16> %res +}