Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -28620,6 +28620,72 @@
   return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
 }
 
+// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
+static SDValue combineHorizontalPredicateResult(SDNode *Extract,
+                                                SelectionDAG &DAG,
+                                                const X86Subtarget &Subtarget) {
+  if (!Subtarget.hasSSE2())
+    return SDValue();
+
+  EVT ExtractVT = Extract->getValueType(0);
+  unsigned BitWidth = ExtractVT.getSizeInBits();
+  if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
+      ExtractVT != MVT::i8)
+    return SDValue();
+
+  // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
+  for (ISD::NodeType Op : {ISD::OR, ISD::AND}) {
+    SDValue Match = matchBinOpReduction(Extract, Op);
+    if (!Match || (Match.getScalarValueSizeInBits() != BitWidth))
+      continue;
+
+    // We require AVX2 for PMOVMSKB for v16i16/v32i8.
+    unsigned MatchSizeInBits = Match.getValueSizeInBits();
+    if (!(MatchSizeInBits == 128 ||
+          (MatchSizeInBits == 256 &&
+           ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
+      continue;
+
+    // Check that we are extracting a reduction of all sign bits.
+    if (DAG.ComputeNumSignBits(Match) != BitWidth)
+      continue;
+
+    // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
+    MVT MaskVT;
+    if (64 == BitWidth || 32 == BitWidth) {
+      MaskVT = MVT::getFloatingPointVT(BitWidth);
+      MaskVT = MVT::getVectorVT(MaskVT, MatchSizeInBits / BitWidth);
+    } else {
+      MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
+    }
+
+    APInt CompareBits;
+    ISD::CondCode CondCode;
+    if (Op == ISD::OR) {
+      // any_of -> MOVMSK != 0
+      CompareBits = APInt::getNullValue(32);
+      CondCode = ISD::CondCode::SETNE;
+    } else {
+      // all_of -> MOVMSK == ((1 << NumElts) - 1)
+      CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
+      CondCode = ISD::CondCode::SETEQ;
+    }
+
+    SDLoc DL(Extract);
+    APInt ResOnes = APInt::getAllOnesValue(BitWidth);
+    APInt ResZero = APInt::getNullValue(BitWidth);
+
+    SDValue Res = DAG.getBitcast(MaskVT, Match);
+    Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
+    Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
+                          DAG.getConstant(ResOnes, DL, ExtractVT),
+                          DAG.getConstant(ResZero, DL, ExtractVT), CondCode);
+    return Res;
+  }
+
+  return SDValue();
+}
+
 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
   // PSADBW is only supported on SSE2 and up.
@@ -28731,6 +28797,9 @@
   if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
     return SAD;
 
+  if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
+    return Cmp;
+
   // Only operate on vectors of 4 elements, where the alternative shuffling
   // gets to be more expensive.
if (InputVector.getValueType() != MVT::v4i32) Index: test/CodeGen/X86/vector-compare-all_of.ll =================================================================== --- test/CodeGen/X86/vector-compare-all_of.ll +++ test/CodeGen/X86/vector-compare-all_of.ll @@ -7,17 +7,21 @@ ; SSE-LABEL: test_v2f64: ; SSE: # BB#0: ; SSE-NEXT: cmpltpd %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movmskpd %xmm1, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $3, %eax +; SSE-NEXT: movq $-1, %rax +; SSE-NEXT: cmovneq %rcx, %rax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64: ; AVX: # BB#0: ; AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vandpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vmovmskpd %xmm0, %eax +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: cmpl $3, %eax +; AVX-NEXT: movq $-1, %rax +; AVX-NEXT: cmovneq %rcx, %rax ; AVX-NEXT: retq %c = fcmp ogt <2 x double> %a0, %a1 %s = sext <2 x i1> %c to <2 x i64> @@ -33,32 +37,23 @@ ; SSE-NEXT: cmpltpd %xmm1, %xmm3 ; SSE-NEXT: cmpltpd %xmm0, %xmm2 ; SSE-NEXT: andpd %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movmskpd %xmm2, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $3, %eax +; SSE-NEXT: movq $-1, %rax +; SSE-NEXT: cmovneq %rcx, %rax ; SSE-NEXT: retq ; -; AVX1-LABEL: test_v4f64: -; AVX1: # BB#0: -; AVX1-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v4f64: -; AVX2: # BB#0: -; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vandpd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: test_v4f64: +; AVX: # BB#0: +; AVX-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vmovmskpd %ymm0, %eax +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: cmpl $15, %eax +; AVX-NEXT: movq $-1, %rax +; AVX-NEXT: cmovneq %rcx, %rax +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %c = fcmp ogt <4 x double> %a0, %a1 %s = sext <4 x i1> %c to <4 x i64> %1 = shufflevector <4 x i64> %s, <4 x i64> undef, <4 x i32> @@ -75,11 +70,11 @@ ; SSE-NEXT: cmpltpd %xmm1, %xmm3 ; SSE-NEXT: cmpltpd %xmm0, %xmm2 ; SSE-NEXT: packsswb %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: movmskps %xmm2, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $15, %eax +; SSE-NEXT: movl $-1, %eax +; SSE-NEXT: cmovnel %ecx, %eax ; SSE-NEXT: cltq ; SSE-NEXT: retq ; @@ -88,11 +83,11 @@ ; AVX-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vmovmskps %xmm0, %eax +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: cmpl $15, %eax +; AVX-NEXT: movl $-1, %eax +; AVX-NEXT: cmovnel 
%ecx, %eax ; AVX-NEXT: cltq ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -111,21 +106,21 @@ ; SSE-LABEL: test_v4f32: ; SSE: # BB#0: ; SSE-NEXT: cmpltps %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: movmskps %xmm1, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $15, %eax +; SSE-NEXT: movl $-1, %eax +; SSE-NEXT: cmovnel %ecx, %eax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v4f32: ; AVX: # BB#0: ; AVX-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vandpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vmovmskps %xmm0, %eax +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: cmpl $15, %eax +; AVX-NEXT: movl $-1, %eax +; AVX-NEXT: cmovnel %ecx, %eax ; AVX-NEXT: retq %c = fcmp ogt <4 x float> %a0, %a1 %s = sext <4 x i1> %c to <4 x i32> @@ -143,38 +138,23 @@ ; SSE-NEXT: cmpltps %xmm1, %xmm3 ; SSE-NEXT: cmpltps %xmm0, %xmm2 ; SSE-NEXT: andps %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: movmskps %xmm2, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $15, %eax +; SSE-NEXT: movl $-1, %eax +; SSE-NEXT: cmovnel %ecx, %eax ; SSE-NEXT: retq ; -; AVX1-LABEL: test_v8f32: -; AVX1: # BB#0: -; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8f32: -; AVX2: # BB#0: -; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: test_v8f32: +; AVX: # BB#0: +; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vmovmskps %ymm0, %eax +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: cmpl $255, %eax +; AVX-NEXT: movl $-1, %eax +; AVX-NEXT: cmovnel %ecx, %eax +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %c = fcmp ogt <8 x float> %a0, %a1 %s = sext <8 x i1> %c to <8 x i32> %1 = shufflevector <8 x i32> %s, <8 x i32> undef, <8 x i32> @@ -193,14 +173,11 @@ ; SSE-NEXT: cmpltps %xmm1, %xmm3 ; SSE-NEXT: cmpltps %xmm0, %xmm2 ; SSE-NEXT: packsswb %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: pmovmskb %xmm2, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: movw $-1, %ax +; SSE-NEXT: cmovnew %cx, %ax ; SSE-NEXT: cwtl ; SSE-NEXT: retq ; @@ -209,13 +186,11 @@ ; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vpacksswb 
%xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vpmovmskb %xmm0, %eax +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX-NEXT: movw $-1, %ax +; AVX-NEXT: cmovnew %cx, %ax ; AVX-NEXT: cwtl ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -236,17 +211,21 @@ ; SSE-LABEL: test_v2i64: ; SSE: # BB#0: ; SSE-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movmskpd %xmm0, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $3, %eax +; SSE-NEXT: movq $-1, %rax +; SSE-NEXT: cmovneq %rcx, %rax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2i64: ; AVX: # BB#0: ; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vmovmskpd %xmm0, %eax +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: cmpl $3, %eax +; AVX-NEXT: movq $-1, %rax +; AVX-NEXT: cmovneq %rcx, %rax ; AVX-NEXT: retq %c = icmp sgt <2 x i64> %a0, %a1 %s = sext <2 x i1> %c to <2 x i64> @@ -262,9 +241,11 @@ ; SSE-NEXT: pcmpgtq %xmm3, %xmm1 ; SSE-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movmskpd %xmm0, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $3, %eax +; SSE-NEXT: movq $-1, %rax +; SSE-NEXT: cmovneq %rcx, %rax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v4i64: @@ -274,21 +255,22 @@ ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vmovmskpd %ymm0, %eax +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: cmpl $15, %eax +; AVX1-NEXT: movq $-1, %rax +; AVX1-NEXT: cmovneq %rcx, %rax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v4i64: ; AVX2: # BB#0: ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vmovmskpd %ymm0, %eax +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: cmpl $15, %eax +; AVX2-NEXT: movq $-1, %rax +; AVX2-NEXT: cmovneq %rcx, %rax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %c = icmp sgt <4 x i64> %a0, %a1 @@ -307,11 +289,11 @@ ; SSE-NEXT: pcmpgtq %xmm3, %xmm1 ; SSE-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $15, %eax +; SSE-NEXT: movl $-1, %eax +; SSE-NEXT: cmovnel %ecx, %eax ; SSE-NEXT: cltq ; SSE-NEXT: retq ; @@ -322,11 +304,11 @@ ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpand %xmm1, %xmm0, 
%xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vmovmskps %xmm0, %eax +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: cmpl $15, %eax +; AVX1-NEXT: movl $-1, %eax +; AVX1-NEXT: cmovnel %ecx, %eax ; AVX1-NEXT: cltq ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -336,11 +318,11 @@ ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vmovmskps %xmm0, %eax +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: cmpl $15, %eax +; AVX2-NEXT: movl $-1, %eax +; AVX2-NEXT: cmovnel %ecx, %eax ; AVX2-NEXT: cltq ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -359,21 +341,21 @@ ; SSE-LABEL: test_v4i32: ; SSE: # BB#0: ; SSE-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $15, %eax +; SSE-NEXT: movl $-1, %eax +; SSE-NEXT: cmovnel %ecx, %eax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v4i32: ; AVX: # BB#0: ; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vmovmskps %xmm0, %eax +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: cmpl $15, %eax +; AVX-NEXT: movl $-1, %eax +; AVX-NEXT: cmovnel %ecx, %eax ; AVX-NEXT: retq %c = icmp sgt <4 x i32> %a0, %a1 %s = sext <4 x i1> %c to <4 x i32> @@ -391,11 +373,11 @@ ; SSE-NEXT: pcmpgtd %xmm3, %xmm1 ; SSE-NEXT: pcmpgtd %xmm2, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $15, %eax +; SSE-NEXT: movl $-1, %eax +; SSE-NEXT: cmovnel %ecx, %eax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v8i32: @@ -405,25 +387,22 @@ ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vmovmskps %ymm0, %eax +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: cmpl $255, %eax +; AVX1-NEXT: movl $-1, %eax +; AVX1-NEXT: cmovnel %ecx, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v8i32: ; AVX2: # BB#0: ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vmovmskps %ymm0, %eax +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: cmpl $255, %eax +; AVX2-NEXT: movl $-1, %eax +; 
AVX2-NEXT: cmovnel %ecx, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %c = icmp sgt <8 x i32> %a0, %a1 @@ -444,14 +423,11 @@ ; SSE-NEXT: pcmpgtd %xmm3, %xmm1 ; SSE-NEXT: pcmpgtd %xmm2, %xmm0 ; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: movw $-1, %ax +; SSE-NEXT: cmovnew %cx, %ax ; SSE-NEXT: cwtl ; SSE-NEXT: retq ; @@ -462,13 +438,11 @@ ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: movw $-1, %ax +; AVX1-NEXT: cmovnew %cx, %ax ; AVX1-NEXT: cwtl ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -478,13 +452,11 @@ ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX2-NEXT: movw $-1, %ax +; AVX2-NEXT: cmovnew %cx, %ax ; AVX2-NEXT: cwtl ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -505,28 +477,21 @@ ; SSE-LABEL: test_v8i16: ; SSE: # BB#0: ; SSE-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: %AX %AX %EAX +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: movw $-1, %ax +; SSE-NEXT: cmovnew %cx, %ax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v8i16: ; AVX: # BB#0: ; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: %AX %AX %EAX +; AVX-NEXT: vpmovmskb %xmm0, %eax +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX-NEXT: movw $-1, %ax +; AVX-NEXT: cmovnew %cx, %ax ; AVX-NEXT: retq %c = icmp sgt <8 x i16> %a0, %a1 %s = sext <8 x i1> %c to <8 x i16> @@ -546,15 +511,11 @@ ; SSE-NEXT: pcmpgtw %xmm3, %xmm1 ; SSE-NEXT: pcmpgtw %xmm2, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} 
xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: %AX %AX %EAX +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: movw $-1, %ax +; SSE-NEXT: cmovnew %cx, %ax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v16i16: @@ -579,16 +540,11 @@ ; AVX2-LABEL: test_v16i16: ; AVX2: # BB#0: ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: %AX %AX %EAX +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: cmpl $-1, %eax +; AVX2-NEXT: movw $-1, %ax +; AVX2-NEXT: cmovnew %cx, %ax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %c = icmp sgt <16 x i16> %a0, %a1 @@ -611,17 +567,13 @@ ; SSE-NEXT: pcmpgtw %xmm3, %xmm1 ; SSE-NEXT: pcmpgtw %xmm2, %xmm0 ; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pextrb $0, %xmm0, %eax +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: movb $-1, %al +; SSE-NEXT: je .LBB14_2 +; SSE-NEXT: # BB#1: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: .LBB14_2: ; SSE-NEXT: movsbl %al, %eax ; SSE-NEXT: # kill: %AX %AX %EAX ; SSE-NEXT: retq @@ -633,15 +585,13 @@ ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: vpmovmskb %xmm0, %ecx +; AVX1-NEXT: movb $-1, %al +; AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; AVX1-NEXT: je .LBB14_2 +; AVX1-NEXT: # BB#1: +; AVX1-NEXT: xorl %eax, %eax +; AVX1-NEXT: .LBB14_2: ; AVX1-NEXT: movsbl %al, %eax ; AVX1-NEXT: # kill: %AX %AX %EAX ; AVX1-NEXT: vzeroupper @@ -652,15 +602,13 @@ ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: vpmovmskb %xmm0, %ecx +; AVX2-NEXT: movb $-1, %al +; AVX2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; AVX2-NEXT: je .LBB14_2 +; AVX2-NEXT: # BB#1: +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: .LBB14_2: ; AVX2-NEXT: movsbl 
%al, %eax ; AVX2-NEXT: # kill: %AX %AX %EAX ; AVX2-NEXT: vzeroupper @@ -684,32 +632,26 @@ ; SSE-LABEL: test_v16i8: ; SSE: # BB#0: ; SSE-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pextrb $0, %xmm0, %eax +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: movb $-1, %al +; SSE-NEXT: je .LBB15_2 +; SSE-NEXT: # BB#1: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: .LBB15_2: ; SSE-NEXT: # kill: %AL %AL %EAX ; SSE-NEXT: retq ; ; AVX-LABEL: test_v16i8: ; AVX: # BB#0: ; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: vpmovmskb %xmm0, %ecx +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; AVX-NEXT: je .LBB15_2 +; AVX-NEXT: # BB#1: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: .LBB15_2: ; AVX-NEXT: # kill: %AL %AL %EAX ; AVX-NEXT: retq %c = icmp sgt <16 x i8> %a0, %a1 @@ -732,17 +674,13 @@ ; SSE-NEXT: pcmpgtb %xmm3, %xmm1 ; SSE-NEXT: pcmpgtb %xmm2, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pextrb $0, %xmm0, %eax +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: movb $-1, %al +; SSE-NEXT: je .LBB16_2 +; SSE-NEXT: # BB#1: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: .LBB16_2: ; SSE-NEXT: # kill: %AL %AL %EAX ; SSE-NEXT: retq ; @@ -770,17 +708,13 @@ ; AVX2-LABEL: test_v32i8: ; AVX2: # BB#0: ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: vpmovmskb %ymm0, %ecx +; AVX2-NEXT: movb $-1, %al +; AVX2-NEXT: cmpl $-1, %ecx +; AVX2-NEXT: je .LBB16_2 +; AVX2-NEXT: # BB#1: +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: .LBB16_2: ; AVX2-NEXT: # kill: %AL %AL %EAX ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq Index: test/CodeGen/X86/vector-compare-any_of.ll =================================================================== --- test/CodeGen/X86/vector-compare-any_of.ll +++ test/CodeGen/X86/vector-compare-any_of.ll @@ -7,17 +7,17 @@ ; SSE-LABEL: test_v2f64: ; SSE: # BB#0: ; SSE-NEXT: cmpltpd %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %rax 
+; SSE-NEXT: movmskpd %xmm1, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbq %rax, %rax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64: ; AVX: # BB#0: ; AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vmovmskpd %xmm0, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: sbbq %rax, %rax ; AVX-NEXT: retq %c = fcmp ogt <2 x double> %a0, %a1 %s = sext <2 x i1> %c to <2 x i64> @@ -33,32 +33,19 @@ ; SSE-NEXT: cmpltpd %xmm1, %xmm3 ; SSE-NEXT: cmpltpd %xmm0, %xmm2 ; SSE-NEXT: orpd %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movmskpd %xmm2, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbq %rax, %rax ; SSE-NEXT: retq ; -; AVX1-LABEL: test_v4f64: -; AVX1: # BB#0: -; AVX1-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vorpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v4f64: -; AVX2: # BB#0: -; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vorpd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: test_v4f64: +; AVX: # BB#0: +; AVX-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vmovmskpd %ymm0, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: sbbq %rax, %rax +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %c = fcmp ogt <4 x double> %a0, %a1 %s = sext <4 x i1> %c to <4 x i64> %1 = shufflevector <4 x i64> %s, <4 x i64> undef, <4 x i32> @@ -75,11 +62,9 @@ ; SSE-NEXT: cmpltpd %xmm1, %xmm3 ; SSE-NEXT: cmpltpd %xmm0, %xmm2 ; SSE-NEXT: packsswb %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: movmskps %xmm2, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbl %eax, %eax ; SSE-NEXT: cltq ; SSE-NEXT: retq ; @@ -88,11 +73,9 @@ ; AVX-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vmovmskps %xmm0, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: sbbl %eax, %eax ; AVX-NEXT: cltq ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -111,21 +94,17 @@ ; SSE-LABEL: test_v4f32: ; SSE: # BB#0: ; SSE-NEXT: cmpltps %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: movmskps %xmm1, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbl %eax, %eax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v4f32: ; AVX: # BB#0: ; AVX-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vmovmskps %xmm0, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: sbbl %eax, %eax ; AVX-NEXT: retq %c = fcmp 
ogt <4 x float> %a0, %a1 %s = sext <4 x i1> %c to <4 x i32> @@ -143,38 +122,19 @@ ; SSE-NEXT: cmpltps %xmm1, %xmm3 ; SSE-NEXT: cmpltps %xmm0, %xmm2 ; SSE-NEXT: orps %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: movmskps %xmm2, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbl %eax, %eax ; SSE-NEXT: retq ; -; AVX1-LABEL: test_v8f32: -; AVX1: # BB#0: -; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8f32: -; AVX2: # BB#0: -; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: test_v8f32: +; AVX: # BB#0: +; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vmovmskps %ymm0, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: sbbl %eax, %eax +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %c = fcmp ogt <8 x float> %a0, %a1 %s = sext <8 x i1> %c to <8 x i32> %1 = shufflevector <8 x i32> %s, <8 x i32> undef, <8 x i32> @@ -193,14 +153,9 @@ ; SSE-NEXT: cmpltps %xmm1, %xmm3 ; SSE-NEXT: cmpltps %xmm0, %xmm2 ; SSE-NEXT: packsswb %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: pmovmskb %xmm2, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbw %ax, %ax ; SSE-NEXT: cwtl ; SSE-NEXT: retq ; @@ -209,13 +164,9 @@ ; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vpmovmskb %xmm0, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: sbbw %ax, %ax ; AVX-NEXT: cwtl ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -236,17 +187,17 @@ ; SSE-LABEL: test_v2i64: ; SSE: # BB#0: ; SSE-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movmskpd %xmm0, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbq %rax, %rax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2i64: ; AVX: # BB#0: ; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vmovmskpd %xmm0, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: sbbq %rax, %rax ; AVX-NEXT: retq %c = icmp sgt <2 x i64> %a0, %a1 %s = sext <2 x i1> %c to <2 x i64> @@ -262,9 +213,9 @@ ; SSE-NEXT: pcmpgtq %xmm3, %xmm1 ; SSE-NEXT: pcmpgtq %xmm2, %xmm0 ; 
SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movmskpd %xmm0, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbq %rax, %rax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v4i64: @@ -274,21 +225,18 @@ ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vmovmskpd %ymm0, %eax +; AVX1-NEXT: negl %eax +; AVX1-NEXT: sbbq %rax, %rax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v4i64: ; AVX2: # BB#0: ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vmovmskpd %ymm0, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: sbbq %rax, %rax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %c = icmp sgt <4 x i64> %a0, %a1 @@ -307,11 +255,9 @@ ; SSE-NEXT: pcmpgtq %xmm3, %xmm1 ; SSE-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbl %eax, %eax ; SSE-NEXT: cltq ; SSE-NEXT: retq ; @@ -322,11 +268,9 @@ ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vmovmskps %xmm0, %eax +; AVX1-NEXT: negl %eax +; AVX1-NEXT: sbbl %eax, %eax ; AVX1-NEXT: cltq ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -336,11 +280,9 @@ ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vmovmskps %xmm0, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: sbbl %eax, %eax ; AVX2-NEXT: cltq ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -359,21 +301,17 @@ ; SSE-LABEL: test_v4i32: ; SSE: # BB#0: ; SSE-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbl %eax, %eax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v4i32: ; AVX: # BB#0: ; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vmovmskps %xmm0, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: sbbl %eax, %eax ; AVX-NEXT: retq %c = icmp sgt <4 x i32> %a0, %a1 %s = sext <4 x i1> %c to <4 x i32> @@ -391,11 +329,9 @@ ; SSE-NEXT: pcmpgtd %xmm3, %xmm1 ; SSE-NEXT: 
pcmpgtd %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbl %eax, %eax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v8i32: @@ -405,25 +341,18 @@ ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vmovmskps %ymm0, %eax +; AVX1-NEXT: negl %eax +; AVX1-NEXT: sbbl %eax, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v8i32: ; AVX2: # BB#0: ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vmovmskps %ymm0, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: sbbl %eax, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %c = icmp sgt <8 x i32> %a0, %a1 @@ -444,14 +373,9 @@ ; SSE-NEXT: pcmpgtd %xmm3, %xmm1 ; SSE-NEXT: pcmpgtd %xmm2, %xmm0 ; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbw %ax, %ax ; SSE-NEXT: cwtl ; SSE-NEXT: retq ; @@ -462,13 +386,9 @@ ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: negl %eax +; AVX1-NEXT: sbbw %ax, %ax ; AVX1-NEXT: cwtl ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -478,13 +398,9 @@ ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: sbbw %ax, %ax ; AVX2-NEXT: cwtl ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -505,28 +421,17 @@ ; SSE-LABEL: test_v8i16: ; SSE: # BB#0: ; SSE-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: 
# kill: %AX %AX %EAX +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbw %ax, %ax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v8i16: ; AVX: # BB#0: ; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: %AX %AX %EAX +; AVX-NEXT: vpmovmskb %xmm0, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: sbbw %ax, %ax ; AVX-NEXT: retq %c = icmp sgt <8 x i16> %a0, %a1 %s = sext <8 x i1> %c to <8 x i16> @@ -546,15 +451,9 @@ ; SSE-NEXT: pcmpgtw %xmm3, %xmm1 ; SSE-NEXT: pcmpgtw %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: %AX %AX %EAX +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbw %ax, %ax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v16i16: @@ -579,16 +478,9 @@ ; AVX2-LABEL: test_v16i16: ; AVX2: # BB#0: ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: %AX %AX %EAX +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: sbbw %ax, %ax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %c = icmp sgt <16 x i16> %a0, %a1 @@ -611,19 +503,9 @@ ; SSE-NEXT: pcmpgtw %xmm3, %xmm1 ; SSE-NEXT: pcmpgtw %xmm2, %xmm0 ; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pextrb $0, %xmm0, %eax -; SSE-NEXT: movsbl %al, %eax -; SSE-NEXT: # kill: %AX %AX %EAX +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbw %ax, %ax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v16i16_legal: @@ -633,17 +515,9 @@ ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: movsbl %al, %eax -; AVX1-NEXT: # kill: %AX %AX %EAX +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: negl %eax +; AVX1-NEXT: sbbw %ax, %ax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -652,17 +526,9 @@ ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[2,3,0,1] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: movsbl %al, %eax -; AVX2-NEXT: # kill: %AX %AX %EAX +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: sbbw %ax, %ax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %c = icmp sgt <16 x i16> %a0, %a1 @@ -684,33 +550,17 @@ ; SSE-LABEL: test_v16i8: ; SSE: # BB#0: ; SSE-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pextrb $0, %xmm0, %eax -; SSE-NEXT: # kill: %AL %AL %EAX +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbb %al, %al ; SSE-NEXT: retq ; ; AVX-LABEL: test_v16i8: ; AVX: # BB#0: ; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: %AL %AL %EAX +; AVX-NEXT: vpmovmskb %xmm0, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: sbbb %al, %al ; AVX-NEXT: retq %c = icmp sgt <16 x i8> %a0, %a1 %s = sext <16 x i1> %c to <16 x i8> @@ -732,18 +582,9 @@ ; SSE-NEXT: pcmpgtb %xmm3, %xmm1 ; SSE-NEXT: pcmpgtb %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pextrb $0, %xmm0, %eax -; SSE-NEXT: # kill: %AL %AL %EAX +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbb %al, %al ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v32i8: @@ -770,18 +611,9 @@ ; AVX2-LABEL: test_v32i8: ; AVX2: # BB#0: ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: %AL %AL %EAX +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: sbbb %al, %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %c = icmp sgt <32 x i8> %a0, %a1
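
Note: a minimal IR sketch of the any_of reduction shape that the new combineHorizontalPredicateResult() targets, written in the same style as the tests above (the function name @any_of_v4i32 is hypothetical and not part of this patch). The shape is: vector compare, sign-extend, OR-reduce the lanes via shufflevector, then extract lane 0. With this patch an SSE2 target should lower the reduction to a single MOVMSKPS plus NEG/SBB instead of the previous PSHUFD/POR chain; for the all_of form the ORs become ANDs and the MOVMSK result is compared against the full-mask value ((1 << NumElts) - 1) instead of 0.

define i32 @any_of_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
  %c = icmp sgt <4 x i32> %a0, %a1
  %s = sext <4 x i1> %c to <4 x i32>
  ; OR the upper half onto the lower half, then the remaining pair into lane 0.
  %m1 = shufflevector <4 x i32> %s, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %o1 = or <4 x i32> %s, %m1
  %m2 = shufflevector <4 x i32> %o1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %o2 = or <4 x i32> %o1, %m2
  ; The extract of the fully reduced lane is the node the combine fires on.
  %r = extractelement <4 x i32> %o2, i32 0
  ret i32 %r
}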