Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -25756,8 +25756,10 @@ switch (N->getOpcode()) { default: llvm_unreachable("Do not know how to custom type legalize this operation!"); + case X86ISD::ADDUS: + case X86ISD::SUBUS: case X86ISD::AVG: { - // Legalize types for X86ISD::AVG by expanding vectors. + // Legalize types for X86ISD::AVG/ADDUS/SUBUS by widening. assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); auto InVT = N->getValueType(0); @@ -25775,7 +25777,7 @@ Ops[0] = N->getOperand(1); SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); - SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1); + SDValue Res = DAG.getNode(N->getOpcode(), dl, RegVT, InVec0, InVec1); if (getTypeAction(*DAG.getContext(), InVT) != TypeWidenVector) Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res, DAG.getIntPtrConstant(0, dl)); @@ -33147,14 +33149,11 @@ } } - // Early exit check - if (!TLI.isTypeLegal(VT)) - return SDValue(); - // Match VSELECTs into subs with unsigned saturation. if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && // psubus is available in SSE2 for i8 and i16 vectors. - Subtarget.hasSSE2() && + Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 && + isPowerOf2_32(VT.getVectorNumElements()) && (VT.getVectorElementType() == MVT::i8 || VT.getVectorElementType() == MVT::i16)) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); @@ -33227,7 +33226,8 @@ // Match VSELECTs into add with unsigned saturation. if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && // paddus is available in SSE2 for i8 and i16 vectors. - Subtarget.hasSSE2() && + Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 && + isPowerOf2_32(VT.getVectorNumElements()) && (VT.getVectorElementType() == MVT::i8 || VT.getVectorElementType() == MVT::i16)) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); @@ -33283,6 +33283,10 @@ } } + // Early exit check + if (!TLI.isTypeLegal(VT)) + return SDValue(); + if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget)) return V; Index: llvm/trunk/test/CodeGen/X86/paddus.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/paddus.ll +++ llvm/trunk/test/CodeGen/X86/paddus.ll @@ -1533,99 +1533,36 @@ } define void @addus_v8i8(<8 x i8>* %p1, <8 x i8>* %p2) { -; SSE2-LABEL: addus_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: paddw %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtw %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movq %xmm0, (%rdi) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: addus_v8i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: paddw %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pandn %xmm2, %xmm1 -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: movq %xmm0, (%rdi) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: addus_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; SSE41-NEXT: paddw %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pand %xmm1, %xmm3 -; SSE41-NEXT: pcmpgtw %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; SSE41-NEXT: movq %xmm2, (%rdi) -; SSE41-NEXT: retq +; SSE-LABEL: addus_v8i8: +; SSE: # %bb.0: +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: paddusb %xmm0, %xmm1 +; SSE-NEXT: movq %xmm1, (%rdi) +; SSE-NEXT: retq ; ; AVX1-LABEL: addus_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpaddw %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vpaddusb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovq %xmm0, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: addus_v8i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddw %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm3 -; AVX2-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpaddusb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovq %xmm0, (%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: addus_v8i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX512-NEXT: vpaddw %xmm0, %xmm3, %xmm0 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpcmpnleuw %xmm1, %xmm2, %k1 -; AVX512-NEXT: vmovdqu16 {{.*}}(%rip), %xmm0 {%k1} +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vpaddusb %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512-NEXT: vpmovwb %xmm0, (%rdi) ; AVX512-NEXT: retq %ld1 = load <8 x i8>, <8 x i8>* %p1, align 8 @@ -1638,105 +1575,36 @@ } define void @addus_v4i8(<4 x i8>* %p1, <4 x i8>* %p2) { -; SSE2-LABEL: addus_v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movd %xmm0, (%rdi) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: addus_v4i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: paddd %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pandn %xmm2, %xmm1 -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: movd %xmm0, (%rdi) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: addus_v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: paddd %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pand %xmm1, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; SSE41-NEXT: movd %xmm2, (%rdi) -; SSE41-NEXT: retq +; SSE-LABEL: addus_v4i8: +; SSE: # %bb.0: +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: paddusb %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, (%rdi) +; SSE-NEXT: retq ; ; AVX1-LABEL: addus_v4i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vpaddusb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: addus_v4i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX2-NEXT: vpaddd %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255] -; AVX2-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpaddusb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, (%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: addus_v4i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX512-NEXT: vpaddd %xmm0, %xmm3, %xmm0 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpcmpnleud %xmm1, %xmm2, %k1 -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vpaddusb %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512-NEXT: vpmovdb %xmm0, (%rdi) ; AVX512-NEXT: retq %ld1 = load <4 x i8>, <4 x i8>* %p1, align 4 @@ -1753,39 +1621,10 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movzwl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; SSE2-NEXT: movzwl (%rsi), %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE2-NEXT: paddq %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0] -; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: packuswb %xmm3, %xmm3 -; SSE2-NEXT: packuswb %xmm3, %xmm3 -; SSE2-NEXT: packuswb %xmm3, %xmm3 -; SSE2-NEXT: movd %xmm3, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: paddusb %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: movw %ax, (%rdi) ; SSE2-NEXT: retq ; @@ -1793,91 +1632,52 @@ ; SSSE3: # %bb.0: ; SSSE3-NEXT: movzwl (%rdi), %eax ; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,128,128,128,128,128,128,128,1,128,128,128,128,128,128,128] -; SSSE3-NEXT: pshufb %xmm1, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; SSSE3-NEXT: movzwl (%rsi), %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: pshufb %xmm1, %xmm3 -; SSSE3-NEXT: paddq %xmm0, %xmm3 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pand %xmm3, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,0,2147483648,0] -; SSSE3-NEXT: por %xmm1, %xmm2 -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pandn %xmm3, %xmm0 -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: paddusb %xmm0, %xmm1 ; SSSE3-NEXT: movd %xmm1, %eax ; SSSE3-NEXT: movw %ax, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: addus_v2i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: paddq %xmm0, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,0,2147483648,0] -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE41-NEXT: pand %xmm4, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, {{.*}}(%rip), %xmm1 -; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE41-NEXT: movzwl (%rdi), %eax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: movzwl (%rsi), %eax +; SSE41-NEXT: movd %eax, %xmm1 +; SSE41-NEXT: paddusb %xmm0, %xmm1 ; SSE41-NEXT: pextrw $0, %xmm1, (%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: addus_v2i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vblendvpd %xmm0, {{.*}}(%rip), %xmm1, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: movzwl (%rdi), %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: movzwl (%rsi), %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpaddusb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: addus_v2i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpaddq %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm2 -; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vblendvpd %xmm0, {{.*}}(%rip), %xmm1, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: movzwl (%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: movzwl (%rsi), %eax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpaddusb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: addus_v2i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpaddq %xmm0, %xmm1, %xmm1 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm2 -; AVX512-NEXT: vpcmpnleuq %xmm2, %xmm0, %k1 -; AVX512-NEXT: vmovdqa64 {{.*}}(%rip), %xmm1 {%k1} -; AVX512-NEXT: vpmovqb %xmm1, (%rdi) +; AVX512-NEXT: movzwl (%rdi), %eax +; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: movzwl (%rsi), %eax +; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: vpaddusb %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpmovqb %xmm0, (%rdi) ; AVX512-NEXT: retq %ld1 = load <2 x i8>, <2 x i8>* %p1, align 2 %ld2 = load <2 x i8>, <2 x i8>* %p2, align 2 @@ -1889,97 +1689,36 @@ } define void @addus_v4i16(<4 x i16>* %p1, <4 x i16>* %p2) { -; SSE2-LABEL: addus_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] -; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm2, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: movq %xmm0, (%rdi) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: addus_v4i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] -; SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: paddd %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pandn %xmm2, %xmm1 -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: movq %xmm0, (%rdi) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: addus_v4i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE41-NEXT: blendvps %xmm0, {{.*}}(%rip), %xmm1 -; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSE41-NEXT: movq %xmm1, (%rdi) -; SSE41-NEXT: retq +; SSE-LABEL: addus_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: paddusw %xmm0, %xmm1 +; SSE-NEXT: movq %xmm1, (%rdi) +; SSE-NEXT: retq ; ; AVX1-LABEL: addus_v4i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm0, {{.*}}(%rip), %xmm1, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vpaddusw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovq %xmm0, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: addus_v4i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX2-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [65535,65535,65535,65535] -; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpaddusw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovq %xmm0, (%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: addus_v4i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vpaddusw %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX512-NEXT: vpaddd %xmm0, %xmm3, %xmm0 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX512-NEXT: vpcmpnleud %xmm1, %xmm2, %k1 -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} ; AVX512-NEXT: vpmovdw %xmm0, (%rdi) ; AVX512-NEXT: retq %ld1 = load <4 x i16>, <4 x i16>* %p1, align 4 @@ -1992,132 +1731,36 @@ } define void @addus_v2i16(<2 x i16>* %p1, <2 x i16>* %p2) { -; SSE2-LABEL: addus_v2i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0,65535,0,0,0] -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7] -; SSE2-NEXT: paddq %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,0,2147483648,0] -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: movd %xmm0, (%rdi) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: addus_v2i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0,65535,0,0,0] -; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7] -; SSSE3-NEXT: paddq %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,0,2147483648,0] -; SSSE3-NEXT: por %xmm3, %xmm0 -; SSSE3-NEXT: por %xmm3, %xmm1 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pandn %xmm2, %xmm0 -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: movd %xmm0, (%rdi) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: addus_v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; SSE41-NEXT: paddq %xmm0, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,0,2147483648,0] -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE41-NEXT: pand %xmm4, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, {{.*}}(%rip), %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE41-NEXT: movd %xmm0, (%rdi) -; SSE41-NEXT: retq +; SSE-LABEL: addus_v2i16: +; SSE: # %bb.0: +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: paddusw %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, (%rdi) +; SSE-NEXT: retq ; ; AVX1-LABEL: addus_v2i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vblendvpd %xmm0, {{.*}}(%rip), %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vpaddusw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: addus_v2i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX2-NEXT: vpaddq %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vblendvpd %xmm0, {{.*}}(%rip), %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpaddusw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, (%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: addus_v2i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX512-NEXT: vpaddq %xmm0, %xmm3, %xmm0 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX512-NEXT: vpcmpnleuq %xmm1, %xmm2, %k1 -; AVX512-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vpaddusw %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512-NEXT: vpmovqw %xmm0, (%rdi) ; AVX512-NEXT: retq %ld1 = load <2 x i16>, <2 x i16>* %p1, align 2 Index: llvm/trunk/test/CodeGen/X86/psubus.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/psubus.ll +++ llvm/trunk/test/CodeGen/X86/psubus.ll @@ -2213,93 +2213,36 @@ } define void @subus_v8i8(<8 x i8>* %p1, <8 x i8>* %p2) { -; SSE2-LABEL: subus_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pcmpgtw %xmm1, %xmm2 -; SSE2-NEXT: psubw %xmm3, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movq %xmm0, (%rdi) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: subus_v8i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pcmpgtw %xmm1, %xmm2 -; SSSE3-NEXT: psubw %xmm3, %xmm0 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: movq %xmm0, (%rdi) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: subus_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pcmpgtw %xmm1, %xmm2 -; SSE41-NEXT: psubw %xmm3, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; SSE41-NEXT: movq %xmm0, (%rdi) -; SSE41-NEXT: retq +; SSE-LABEL: subus_v8i8: +; SSE: # %bb.0: +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: psubusb %xmm1, %xmm0 +; SSE-NEXT: movq %xmm0, (%rdi) +; SSE-NEXT: retq ; ; AVX1-LABEL: subus_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: subus_v8i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpand %xmm1, %xmm3, %xmm1 -; AVX2-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpsubw %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, (%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: subus_v8i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX512-NEXT: vpand %xmm1, %xmm3, %xmm1 -; AVX512-NEXT: vpcmpnleuw %xmm1, %xmm2, %k1 -; AVX512-NEXT: vpsubw %xmm3, %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512-NEXT: vpmovwb %xmm0, (%rdi) ; AVX512-NEXT: retq %ld1 = load <8 x i8>, <8 x i8>* %p1, align 8 @@ -2312,98 +2255,36 @@ } define void @subus_v4i8(<4 x i8>* %p1, <4 x i8>* %p2) { -; SSE2-LABEL: subus_v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: psubd %xmm3, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movd %xmm0, (%rdi) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: subus_v4i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: psubd %xmm3, %xmm0 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: movd %xmm0, (%rdi) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: subus_v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE41-NEXT: psubd %xmm3, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; SSE41-NEXT: movd %xmm0, (%rdi) -; SSE41-NEXT: retq +; SSE-LABEL: subus_v4i8: +; SSE: # %bb.0: +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: psubusb %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, (%rdi) +; SSE-NEXT: retq ; ; AVX1-LABEL: subus_v4i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: subus_v4i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX2-NEXT: vpand %xmm1, %xmm3, %xmm1 -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpsubd %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, (%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: subus_v4i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX512-NEXT: vpand %xmm1, %xmm3, %xmm1 -; AVX512-NEXT: vpcmpnleud %xmm1, %xmm2, %k1 -; AVX512-NEXT: vpsubd %xmm3, %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512-NEXT: vpmovdb %xmm0, (%rdi) ; AVX512-NEXT: retq %ld1 = load <4 x i8>, <4 x i8>* %p1, align 8 @@ -2419,117 +2300,51 @@ ; SSE2-LABEL: subus_v2i8: ; SSE2: # %bb.0: ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: psubq %xmm3, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,0,2147483648,0] -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: psubusb %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: subus_v2i8: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,128,128,128,128,128,128,128,1,128,128,128,128,128,128,128] -; SSSE3-NEXT: pshufb %xmm1, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero -; SSSE3-NEXT: pshufb %xmm1, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: psubq %xmm4, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,0,2147483648,0] -; SSSE3-NEXT: por %xmm1, %xmm2 -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: pand %xmm0, %xmm1 -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: movd %xmm1, %eax +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: psubusb %xmm1, %xmm0 +; SSSE3-NEXT: movd %xmm0, %eax ; SSSE3-NEXT: movw %ax, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: subus_v2i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psubq %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,0,2147483648,0] -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; SSE41-NEXT: pextrw $0, %xmm1, (%rdi) +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: psubusb %xmm1, %xmm0 +; SSE41-NEXT: pextrw $0, %xmm0, (%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: subus_v2i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: subus_v2i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: subus_v2i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1 -; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpmovqb %xmm0, (%rdi) ; AVX512-NEXT: retq %ld1 = load <2 x i8>, <2 x i8>* %p1, align 8 @@ -2542,87 +2357,36 @@ } define void @subus_v4i16(<4 x i16>* %p1, <4 x i16>* %p2) { -; SSE2-LABEL: subus_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: psubd %xmm3, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: movq %xmm0, (%rdi) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: subus_v4i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: psubd %xmm3, %xmm0 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: movq %xmm0, (%rdi) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: subus_v4i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psubd %xmm1, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSE41-NEXT: movq %xmm0, (%rdi) -; SSE41-NEXT: retq +; SSE-LABEL: subus_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: psubusw %xmm1, %xmm0 +; SSE-NEXT: movq %xmm0, (%rdi) +; SSE-NEXT: retq ; ; AVX1-LABEL: subus_v4i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: subus_v4i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, (%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: subus_v4i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512-NEXT: vpcmpnleud %xmm1, %xmm2, %k1 -; AVX512-NEXT: vpsubd %xmm3, %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: vpmovdw %xmm0, (%rdi) ; AVX512-NEXT: retq %ld1 = load <4 x i16>, <4 x i16>* %p1, align 8 @@ -2635,133 +2399,36 @@ } define void @subus_v2i16(<2 x i16>* %p1, <2 x i16>* %p2) { -; SSE2-LABEL: subus_v2i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0,65535,0,0,0] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: psubq %xmm3, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,0,2147483648,0] -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: movd %xmm0, (%rdi) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: subus_v2i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0,65535,0,0,0] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: psubq %xmm3, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,0,2147483648,0] -; SSSE3-NEXT: por %xmm3, %xmm1 -; SSSE3-NEXT: por %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm2 -; SSSE3-NEXT: pand %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: movd %xmm0, (%rdi) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: subus_v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psubq %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,0,2147483648,0] -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE41-NEXT: movd %xmm0, (%rdi) -; SSE41-NEXT: retq +; SSE-LABEL: subus_v2i16: +; SSE: # %bb.0: +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: psubusw %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, (%rdi) +; SSE-NEXT: retq ; ; AVX1-LABEL: subus_v2i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, (%rdi) ; AVX1-NEXT: retq ; -; AVX2-SLOW-LABEL: subus_v2i16: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpsubq %xmm1, %xmm0, %xmm2 -; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovd %xmm0, (%rdi) -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: subus_v2i16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX2-FAST-NEXT: vpsubq %xmm1, %xmm0, %xmm2 -; AVX2-FAST-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vmovd %xmm0, (%rdi) -; AVX2-FAST-NEXT: retq +; AVX2-LABEL: subus_v2i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512-LABEL: subus_v2i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3],xmm3[4],xmm1[5,6,7] -; AVX512-NEXT: vpcmpnleuq %xmm1, %xmm2, %k1 -; AVX512-NEXT: vpsubq %xmm3, %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512-NEXT: vpmovqw %xmm0, (%rdi) ; AVX512-NEXT: retq %ld1 = load <2 x i16>, <2 x i16>* %p1, align 8 Index: llvm/trunk/test/CodeGen/X86/sse2-intrinsics-canonical.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sse2-intrinsics-canonical.ll +++ llvm/trunk/test/CodeGen/X86/sse2-intrinsics-canonical.ll @@ -93,43 +93,40 @@ define <8 x i8> @test_x86_sse2_paddus_b_64(<8 x i8> %a0, <8 x i8> %a1) { ; SSE-LABEL: test_x86_sse2_paddus_b_64: ; SSE: ## %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A] ; SSE-NEXT: ## fixup A - offset: 4, value: LCPI4_0, kind: FK_Data_4 -; SSE-NEXT: paddw %xmm0, %xmm1 ## encoding: [0x66,0x0f,0xfd,0xc8] +; SSE-NEXT: pand %xmm2, %xmm1 ## encoding: [0x66,0x0f,0xdb,0xca] +; SSE-NEXT: packuswb %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x67,0xc9] ; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2] -; SSE-NEXT: pand %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xdb,0xd1] -; SSE-NEXT: pcmpgtw %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x65,0xc2] -; SSE-NEXT: movdqa %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x6f,0xd0] -; SSE-NEXT: pandn %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xdf,0xd1] -; SSE-NEXT: pand LCPI4_0, %xmm0 ## encoding: [0x66,0x0f,0xdb,0x05,A,A,A,A] -; SSE-NEXT: ## fixup A - offset: 4, value: LCPI4_0, kind: FK_Data_4 -; SSE-NEXT: por %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xeb,0xc2] +; SSE-NEXT: packuswb %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x67,0xc0] +; SSE-NEXT: paddusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdc,0xc1] +; SSE-NEXT: punpcklbw %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x60,0xc0] +; SSE-NEXT: ## xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: retl ## encoding: [0xc3] ; ; AVX2-LABEL: test_x86_sse2_paddus_b_64: ; AVX2: ## %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX2-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A] ; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI4_0, kind: FK_Data_4 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm3 ## encoding: [0xc5,0xf9,0xdb,0xda] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1] -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm1 ## encoding: [0xc5,0xf9,0xdb,0xca] -; AVX2-NEXT: vpcmpgtw %xmm1, %xmm3, %xmm1 ## encoding: [0xc5,0xe1,0x65,0xc9] -; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x4c,0xc2,0x10] +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ## encoding: [0xc4,0xe2,0x71,0x00,0xca] +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x00,0xc2] +; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdc,0xc1] +; AVX2-NEXT: vpmovzxbw %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x30,0xc0] +; AVX2-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: retl ## encoding: [0xc3] ; ; SKX-LABEL: test_x86_sse2_paddus_b_64: ; SKX: ## %bb.0: -; SKX-NEXT: vmovdqa LCPI4_0, %xmm2 ## EVEX TO VEX Compression xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SKX-NEXT: vmovdqa LCPI4_0, %xmm2 ## EVEX TO VEX Compression xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; SKX-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A] ; SKX-NEXT: ## fixup A - offset: 4, value: LCPI4_0, kind: FK_Data_4 -; SKX-NEXT: vpand %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xda] -; SKX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1] -; SKX-NEXT: vpand %xmm2, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xca] -; SKX-NEXT: vpcmpnleuw %xmm1, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x3e,0xc9,0x06] -; SKX-NEXT: vmovdqu16 LCPI4_0, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x6f,0x05,A,A,A,A] -; SKX-NEXT: ## fixup A - offset: 6, value: LCPI4_0, kind: FK_Data_4 +; SKX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x00,0xca] +; SKX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0xc2] +; SKX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1] +; SKX-NEXT: vpmovzxbw %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x30,0xc0] +; SKX-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SKX-NEXT: retl ## encoding: [0xc3] %1 = add <8 x i8> %a0, %a1 %2 = icmp ugt <8 x i8> %a0, %1 @@ -140,46 +137,45 @@ define <4 x i16> @test_x86_sse2_paddus_w_64(<4 x i16> %a0, <4 x i16> %a1) { ; SSE-LABEL: test_x86_sse2_paddus_w_64: ; SSE: ## %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] -; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A] -; SSE-NEXT: ## fixup A - offset: 4, value: LCPI5_0, kind: FK_Data_4 -; SSE-NEXT: paddd %xmm0, %xmm1 ## encoding: [0x66,0x0f,0xfe,0xc8] -; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2] -; SSE-NEXT: pand %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xdb,0xd1] -; SSE-NEXT: pcmpgtd %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x66,0xc2] -; SSE-NEXT: movdqa %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x6f,0xd0] -; SSE-NEXT: pandn %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xdf,0xd1] -; SSE-NEXT: pand LCPI5_0, %xmm0 ## encoding: [0x66,0x0f,0xdb,0x05,A,A,A,A] -; SSE-NEXT: ## fixup A - offset: 4, value: LCPI5_0, kind: FK_Data_4 -; SSE-NEXT: por %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xeb,0xc2] +; SSE-NEXT: pshuflw $232, %xmm1, %xmm1 ## encoding: [0xf2,0x0f,0x70,0xc9,0xe8] +; SSE-NEXT: ## xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw $232, %xmm1, %xmm1 ## encoding: [0xf3,0x0f,0x70,0xc9,0xe8] +; SSE-NEXT: ## xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd $232, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xe8] +; SSE-NEXT: ## xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw $232, %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0x70,0xc0,0xe8] +; SSE-NEXT: ## xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw $232, %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x70,0xc0,0xe8] +; SSE-NEXT: ## xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd $232, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc0,0xe8] +; SSE-NEXT: ## xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: paddusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdd,0xc1] +; SSE-NEXT: punpcklwd %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x61,0xc0] +; SSE-NEXT: ## xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE-NEXT: retl ## encoding: [0xc3] ; ; AVX2-LABEL: test_x86_sse2_paddus_w_64: ; AVX2: ## %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2] -; AVX2-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm3 ## encoding: [0xc4,0xe3,0x79,0x0e,0xda,0xaa] -; AVX2-NEXT: ## xmm3 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfe,0xc1] -; AVX2-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x0e,0xca,0xaa] -; AVX2-NEXT: ## xmm1 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm1 ## encoding: [0xc5,0xe1,0x66,0xc9] -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [65535,65535,65535,65535] -; AVX2-NEXT: ## encoding: [0xc4,0xe2,0x79,0x18,0x15,A,A,A,A] -; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI5_0, kind: FK_Data_4 -; AVX2-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x4a,0xc2,0x10] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A] +; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI5_0, kind: FK_Data_4 +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ## encoding: [0xc4,0xe2,0x71,0x00,0xca] +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x00,0xc2] +; AVX2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdd,0xc1] +; AVX2-NEXT: vpmovzxwd %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x33,0xc0] +; AVX2-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: retl ## encoding: [0xc3] ; ; SKX-LABEL: test_x86_sse2_paddus_w_64: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; SKX-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm3 ## encoding: [0xc4,0xe3,0x79,0x0e,0xda,0xaa] -; SKX-NEXT: ## xmm3 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] -; SKX-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x0e,0xca,0xaa] -; SKX-NEXT: ## xmm1 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; SKX-NEXT: vpcmpnleud %xmm1, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1e,0xc9,0x06] -; SKX-NEXT: vpbroadcastd LCPI5_0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x58,0x05,A,A,A,A] -; SKX-NEXT: ## fixup A - offset: 6, value: LCPI5_0, kind: FK_Data_4 +; SKX-NEXT: vmovdqa LCPI5_0, %xmm2 ## EVEX TO VEX Compression xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SKX-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 4, value: LCPI5_0, kind: FK_Data_4 +; SKX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x00,0xca] +; SKX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0xc2] +; SKX-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1] +; SKX-NEXT: vpmovzxwd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xc0] +; SKX-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SKX-NEXT: retl ## encoding: [0xc3] %1 = add <4 x i16> %a0, %a1 %2 = icmp ugt <4 x i16> %a0, %1