Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -28620,6 +28620,72 @@
   return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
 }
 
+// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
+static SDValue combineHorizontalPredicateResult(SDNode *Extract,
+                                                SelectionDAG &DAG,
+                                                const X86Subtarget &Subtarget) {
+  if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
+    return SDValue();
+
+  EVT ExtractVT = Extract->getValueType(0);
+  unsigned BitWidth = ExtractVT.getSizeInBits();
+  if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
+      ExtractVT != MVT::i8)
+    return SDValue();
+
+  // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
+  for (ISD::NodeType Op : {ISD::OR, ISD::AND}) {
+    SDValue Match = matchBinOpReduction(Extract, Op);
+    if (!Match || (Match.getScalarValueSizeInBits() != BitWidth))
+      continue;
+
+    // 256-bit: PMOVMSKB (v16i16/v32i8) needs AVX2; MOVMSKPS/PD only needs AVX1.
+    unsigned MatchSizeInBits = Match.getValueSizeInBits();
+    if (!(MatchSizeInBits == 128 ||
+          (MatchSizeInBits == 256 &&
+           ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
+      continue;
+
+    // Check that we are extracting a reduction of all sign bits.
+    if (DAG.ComputeNumSignBits(Match) != BitWidth)
+      continue;
+
+    // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
+    MVT MaskVT;
+    if (64 == BitWidth || 32 == BitWidth) {
+      MaskVT = MVT::getFloatingPointVT(BitWidth);
+      MaskVT = MVT::getVectorVT(MaskVT, MatchSizeInBits / BitWidth);
+    } else {
+      MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
+    }
+
+    APInt CompareBits;
+    ISD::CondCode CondCode;
+    if (Op == ISD::OR) {
+      // any_of -> MOVMSK != 0
+      CompareBits = APInt::getNullValue(32);
+      CondCode = ISD::CondCode::SETNE;
+    } else {
+      // all_of -> MOVMSK == ((1 << NumElts) - 1)
+      CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
+      CondCode = ISD::CondCode::SETEQ;
+    }
+
+    SDLoc DL(Extract);
+    APInt ResOnes = APInt::getAllOnesValue(BitWidth);
+    APInt ResZero = APInt::getNullValue(BitWidth);
+
+    SDValue Res = DAG.getBitcast(MaskVT, Match);
+    Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
+    Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
+                          DAG.getConstant(ResOnes, DL, ExtractVT),
+                          DAG.getConstant(ResZero, DL, ExtractVT), CondCode);
+    return Res;
+  }
+
+  return SDValue();
+}
+
 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
   // PSADBW is only supported on SSE2 and up.
@@ -28731,6 +28797,9 @@
   if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
     return SAD;
 
+  if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
+    return Cmp;
+
   // Only operate on vectors of 4 elements, where the alternative shuffling
   // gets to be more expensive.
if (InputVector.getValueType() != MVT::v4i32) Index: test/CodeGen/X86/vector-compare-all_of.ll =================================================================== --- test/CodeGen/X86/vector-compare-all_of.ll +++ test/CodeGen/X86/vector-compare-all_of.ll @@ -8,17 +8,21 @@ ; SSE-LABEL: test_v2f64: ; SSE: # BB#0: ; SSE-NEXT: cmpltpd %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movmskpd %xmm1, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $3, %eax +; SSE-NEXT: movq $-1, %rax +; SSE-NEXT: cmovneq %rcx, %rax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64: ; AVX: # BB#0: ; AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vandpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vmovmskpd %xmm0, %eax +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: cmpl $3, %eax +; AVX-NEXT: movq $-1, %rax +; AVX-NEXT: cmovneq %rcx, %rax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64: @@ -44,32 +48,23 @@ ; SSE-NEXT: cmpltpd %xmm1, %xmm3 ; SSE-NEXT: cmpltpd %xmm0, %xmm2 ; SSE-NEXT: andpd %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movmskpd %xmm2, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $3, %eax +; SSE-NEXT: movq $-1, %rax +; SSE-NEXT: cmovneq %rcx, %rax ; SSE-NEXT: retq ; -; AVX1-LABEL: test_v4f64: -; AVX1: # BB#0: -; AVX1-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v4f64: -; AVX2: # BB#0: -; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vandpd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: test_v4f64: +; AVX: # BB#0: +; AVX-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vmovmskpd %ymm0, %eax +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: cmpl $15, %eax +; AVX-NEXT: movq $-1, %rax +; AVX-NEXT: cmovneq %rcx, %rax +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f64: ; AVX512: # BB#0: @@ -98,11 +93,11 @@ ; SSE-NEXT: cmpltpd %xmm1, %xmm3 ; SSE-NEXT: cmpltpd %xmm0, %xmm2 ; SSE-NEXT: packsswb %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: movmskps %xmm2, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $15, %eax +; SSE-NEXT: movl $-1, %eax +; SSE-NEXT: cmovnel %ecx, %eax ; SSE-NEXT: cltq ; SSE-NEXT: retq ; @@ -111,11 +106,11 @@ ; AVX-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vmovmskps %xmm0, %eax +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: cmpl $15, %eax +; AVX-NEXT: movl $-1, %eax +; AVX-NEXT: cmovnel %ecx, %eax ; AVX-NEXT: cltq ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -147,21 +142,21 @@ ; SSE-LABEL: test_v4f32: ; SSE: # 
BB#0: ; SSE-NEXT: cmpltps %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: movmskps %xmm1, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $15, %eax +; SSE-NEXT: movl $-1, %eax +; SSE-NEXT: cmovnel %ecx, %eax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v4f32: ; AVX: # BB#0: ; AVX-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vandpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vmovmskps %xmm0, %eax +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: cmpl $15, %eax +; AVX-NEXT: movl $-1, %eax +; AVX-NEXT: cmovnel %ecx, %eax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f32: @@ -191,38 +186,23 @@ ; SSE-NEXT: cmpltps %xmm1, %xmm3 ; SSE-NEXT: cmpltps %xmm0, %xmm2 ; SSE-NEXT: andps %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: movmskps %xmm2, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $15, %eax +; SSE-NEXT: movl $-1, %eax +; SSE-NEXT: cmovnel %ecx, %eax ; SSE-NEXT: retq ; -; AVX1-LABEL: test_v8f32: -; AVX1: # BB#0: -; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8f32: -; AVX2: # BB#0: -; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: test_v8f32: +; AVX: # BB#0: +; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vmovmskps %ymm0, %eax +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: cmpl $255, %eax +; AVX-NEXT: movl $-1, %eax +; AVX-NEXT: cmovnel %ecx, %eax +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8f32: ; AVX512: # BB#0: @@ -255,14 +235,11 @@ ; SSE-NEXT: cmpltps %xmm1, %xmm3 ; SSE-NEXT: cmpltps %xmm0, %xmm2 ; SSE-NEXT: packsswb %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: pmovmskb %xmm2, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: movw $-1, %ax +; SSE-NEXT: cmovnew %cx, %ax ; SSE-NEXT: cwtl ; SSE-NEXT: retq ; @@ -271,13 +248,11 @@ ; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; 
AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vpmovmskb %xmm0, %eax +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX-NEXT: movw $-1, %ax +; AVX-NEXT: cmovnew %cx, %ax ; AVX-NEXT: cwtl ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -312,17 +287,21 @@ ; SSE-LABEL: test_v2i64: ; SSE: # BB#0: ; SSE-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movmskpd %xmm0, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $3, %eax +; SSE-NEXT: movq $-1, %rax +; SSE-NEXT: cmovneq %rcx, %rax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2i64: ; AVX: # BB#0: ; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vmovmskpd %xmm0, %eax +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: cmpl $3, %eax +; AVX-NEXT: movq $-1, %rax +; AVX-NEXT: cmovneq %rcx, %rax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2i64: @@ -348,9 +327,11 @@ ; SSE-NEXT: pcmpgtq %xmm3, %xmm1 ; SSE-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movmskpd %xmm0, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $3, %eax +; SSE-NEXT: movq $-1, %rax +; SSE-NEXT: cmovneq %rcx, %rax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v4i64: @@ -360,21 +341,22 @@ ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vmovmskpd %ymm0, %eax +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: cmpl $15, %eax +; AVX1-NEXT: movq $-1, %rax +; AVX1-NEXT: cmovneq %rcx, %rax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v4i64: ; AVX2: # BB#0: ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vmovmskpd %ymm0, %eax +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: cmpl $15, %eax +; AVX2-NEXT: movq $-1, %rax +; AVX2-NEXT: cmovneq %rcx, %rax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -405,11 +387,11 @@ ; SSE-NEXT: pcmpgtq %xmm3, %xmm1 ; SSE-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $15, %eax +; SSE-NEXT: movl $-1, %eax +; SSE-NEXT: cmovnel %ecx, %eax ; SSE-NEXT: cltq ; SSE-NEXT: retq ; @@ -420,11 +402,11 @@ ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vmovmskps %xmm0, %eax +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: cmpl $15, %eax +; AVX1-NEXT: movl $-1, %eax +; AVX1-NEXT: cmovnel %ecx, %eax ; AVX1-NEXT: cltq ; 
AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -434,11 +416,11 @@ ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vmovmskps %xmm0, %eax +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: cmpl $15, %eax +; AVX2-NEXT: movl $-1, %eax +; AVX2-NEXT: cmovnel %ecx, %eax ; AVX2-NEXT: cltq ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -470,21 +452,21 @@ ; SSE-LABEL: test_v4i32: ; SSE: # BB#0: ; SSE-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $15, %eax +; SSE-NEXT: movl $-1, %eax +; SSE-NEXT: cmovnel %ecx, %eax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v4i32: ; AVX: # BB#0: ; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vmovmskps %xmm0, %eax +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: cmpl $15, %eax +; AVX-NEXT: movl $-1, %eax +; AVX-NEXT: cmovnel %ecx, %eax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4i32: @@ -514,11 +496,11 @@ ; SSE-NEXT: pcmpgtd %xmm3, %xmm1 ; SSE-NEXT: pcmpgtd %xmm2, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $15, %eax +; SSE-NEXT: movl $-1, %eax +; SSE-NEXT: cmovnel %ecx, %eax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v8i32: @@ -528,25 +510,22 @@ ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vmovmskps %ymm0, %eax +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: cmpl $255, %eax +; AVX1-NEXT: movl $-1, %eax +; AVX1-NEXT: cmovnel %ecx, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v8i32: ; AVX2: # BB#0: ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vmovmskps %ymm0, %eax +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: cmpl $255, %eax +; AVX2-NEXT: movl $-1, %eax +; AVX2-NEXT: cmovnel %ecx, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -581,14 +560,11 @@ ; SSE-NEXT: pcmpgtd %xmm3, %xmm1 ; SSE-NEXT: pcmpgtd %xmm2, %xmm0 ; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pand 
%xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: movw $-1, %ax +; SSE-NEXT: cmovnew %cx, %ax ; SSE-NEXT: cwtl ; SSE-NEXT: retq ; @@ -599,13 +575,11 @@ ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: movw $-1, %ax +; AVX1-NEXT: cmovnew %cx, %ax ; AVX1-NEXT: cwtl ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -615,13 +589,11 @@ ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX2-NEXT: movw $-1, %ax +; AVX2-NEXT: cmovnew %cx, %ax ; AVX2-NEXT: cwtl ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -656,28 +628,21 @@ ; SSE-LABEL: test_v8i16: ; SSE: # BB#0: ; SSE-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: %AX %AX %EAX +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: movw $-1, %ax +; SSE-NEXT: cmovnew %cx, %ax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v8i16: ; AVX: # BB#0: ; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: %AX %AX %EAX +; AVX-NEXT: vpmovmskb %xmm0, %eax +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX-NEXT: movw $-1, %ax +; AVX-NEXT: cmovnew %cx, %ax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8i16: @@ -711,15 +676,11 @@ ; SSE-NEXT: pcmpgtw %xmm3, %xmm1 ; SSE-NEXT: pcmpgtw %xmm2, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: %AX %AX %EAX +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: movw $-1, %ax +; SSE-NEXT: cmovnew %cx, %ax ; SSE-NEXT: retq ; ; AVX1-LABEL: 
test_v16i16: @@ -744,16 +705,11 @@ ; AVX2-LABEL: test_v16i16: ; AVX2: # BB#0: ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: %AX %AX %EAX +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: cmpl $-1, %eax +; AVX2-NEXT: movw $-1, %ax +; AVX2-NEXT: cmovnew %cx, %ax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -792,17 +748,13 @@ ; SSE-NEXT: pcmpgtw %xmm3, %xmm1 ; SSE-NEXT: pcmpgtw %xmm2, %xmm0 ; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pextrb $0, %xmm0, %eax +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: movb $-1, %al +; SSE-NEXT: je .LBB14_2 +; SSE-NEXT: # BB#1: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: .LBB14_2: ; SSE-NEXT: movsbl %al, %eax ; SSE-NEXT: # kill: %AX %AX %EAX ; SSE-NEXT: retq @@ -814,15 +766,13 @@ ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: vpmovmskb %xmm0, %ecx +; AVX1-NEXT: movb $-1, %al +; AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; AVX1-NEXT: je .LBB14_2 +; AVX1-NEXT: # BB#1: +; AVX1-NEXT: xorl %eax, %eax +; AVX1-NEXT: .LBB14_2: ; AVX1-NEXT: movsbl %al, %eax ; AVX1-NEXT: # kill: %AX %AX %EAX ; AVX1-NEXT: vzeroupper @@ -833,15 +783,13 @@ ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: vpmovmskb %xmm0, %ecx +; AVX2-NEXT: movb $-1, %al +; AVX2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; AVX2-NEXT: je .LBB14_2 +; AVX2-NEXT: # BB#1: +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: .LBB14_2: ; AVX2-NEXT: movsbl %al, %eax ; AVX2-NEXT: # kill: %AX %AX %EAX ; AVX2-NEXT: vzeroupper @@ -882,32 +830,26 @@ ; SSE-LABEL: test_v16i8: ; SSE: # BB#0: ; SSE-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa 
%xmm1, %xmm0 -; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pextrb $0, %xmm0, %eax +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: movb $-1, %al +; SSE-NEXT: je .LBB15_2 +; SSE-NEXT: # BB#1: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: .LBB15_2: ; SSE-NEXT: # kill: %AL %AL %EAX ; SSE-NEXT: retq ; ; AVX-LABEL: test_v16i8: ; AVX: # BB#0: ; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: vpmovmskb %xmm0, %ecx +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; AVX-NEXT: je .LBB15_2 +; AVX-NEXT: # BB#1: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: .LBB15_2: ; AVX-NEXT: # kill: %AL %AL %EAX ; AVX-NEXT: retq ; @@ -946,17 +888,13 @@ ; SSE-NEXT: pcmpgtb %xmm3, %xmm1 ; SSE-NEXT: pcmpgtb %xmm2, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pextrb $0, %xmm0, %eax +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: movb $-1, %al +; SSE-NEXT: je .LBB16_2 +; SSE-NEXT: # BB#1: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: .LBB16_2: ; SSE-NEXT: # kill: %AL %AL %EAX ; SSE-NEXT: retq ; @@ -984,17 +922,13 @@ ; AVX2-LABEL: test_v32i8: ; AVX2: # BB#0: ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: vpmovmskb %ymm0, %ecx +; AVX2-NEXT: movb $-1, %al +; AVX2-NEXT: cmpl $-1, %ecx +; AVX2-NEXT: je .LBB16_2 +; AVX2-NEXT: # BB#1: +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: .LBB16_2: ; AVX2-NEXT: # kill: %AL %AL %EAX ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq Index: test/CodeGen/X86/vector-compare-any_of.ll =================================================================== --- test/CodeGen/X86/vector-compare-any_of.ll +++ test/CodeGen/X86/vector-compare-any_of.ll @@ -8,17 +8,17 @@ ; SSE-LABEL: test_v2f64: ; SSE: # BB#0: ; SSE-NEXT: cmpltpd %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movmskpd %xmm1, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbq %rax, %rax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64: ; AVX: # BB#0: ; AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vmovmskpd %xmm0, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: sbbq %rax, %rax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64: @@ -44,32 +44,19 @@ ; 
SSE-NEXT: cmpltpd %xmm1, %xmm3 ; SSE-NEXT: cmpltpd %xmm0, %xmm2 ; SSE-NEXT: orpd %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movmskpd %xmm2, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbq %rax, %rax ; SSE-NEXT: retq ; -; AVX1-LABEL: test_v4f64: -; AVX1: # BB#0: -; AVX1-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vorpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v4f64: -; AVX2: # BB#0: -; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vorpd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: test_v4f64: +; AVX: # BB#0: +; AVX-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vmovmskpd %ymm0, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: sbbq %rax, %rax +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f64: ; AVX512: # BB#0: @@ -98,11 +85,9 @@ ; SSE-NEXT: cmpltpd %xmm1, %xmm3 ; SSE-NEXT: cmpltpd %xmm0, %xmm2 ; SSE-NEXT: packsswb %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: movmskps %xmm2, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbl %eax, %eax ; SSE-NEXT: cltq ; SSE-NEXT: retq ; @@ -111,11 +96,9 @@ ; AVX-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vmovmskps %xmm0, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: sbbl %eax, %eax ; AVX-NEXT: cltq ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -147,21 +130,17 @@ ; SSE-LABEL: test_v4f32: ; SSE: # BB#0: ; SSE-NEXT: cmpltps %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: movmskps %xmm1, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbl %eax, %eax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v4f32: ; AVX: # BB#0: ; AVX-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vmovmskps %xmm0, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: sbbl %eax, %eax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f32: @@ -191,38 +170,19 @@ ; SSE-NEXT: cmpltps %xmm1, %xmm3 ; SSE-NEXT: cmpltps %xmm0, %xmm2 ; SSE-NEXT: orps %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: movmskps %xmm2, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbl %eax, %eax ; SSE-NEXT: retq ; -; AVX1-LABEL: test_v8f32: -; AVX1: # BB#0: -; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: 
vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8f32: -; AVX2: # BB#0: -; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: test_v8f32: +; AVX: # BB#0: +; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vmovmskps %ymm0, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: sbbl %eax, %eax +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8f32: ; AVX512: # BB#0: @@ -255,14 +215,9 @@ ; SSE-NEXT: cmpltps %xmm1, %xmm3 ; SSE-NEXT: cmpltps %xmm0, %xmm2 ; SSE-NEXT: packsswb %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: pmovmskb %xmm2, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbw %ax, %ax ; SSE-NEXT: cwtl ; SSE-NEXT: retq ; @@ -271,13 +226,9 @@ ; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vpmovmskb %xmm0, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: sbbw %ax, %ax ; AVX-NEXT: cwtl ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -312,17 +263,17 @@ ; SSE-LABEL: test_v2i64: ; SSE: # BB#0: ; SSE-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movmskpd %xmm0, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbq %rax, %rax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2i64: ; AVX: # BB#0: ; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vmovmskpd %xmm0, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: sbbq %rax, %rax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2i64: @@ -348,9 +299,9 @@ ; SSE-NEXT: pcmpgtq %xmm3, %xmm1 ; SSE-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movmskpd %xmm0, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbq %rax, %rax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v4i64: @@ -360,21 +311,18 @@ ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vmovmskpd %ymm0, %eax +; AVX1-NEXT: negl %eax +; AVX1-NEXT: sbbq %rax, %rax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: 
test_v4i64: ; AVX2: # BB#0: ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vmovmskpd %ymm0, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: sbbq %rax, %rax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -405,11 +353,9 @@ ; SSE-NEXT: pcmpgtq %xmm3, %xmm1 ; SSE-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbl %eax, %eax ; SSE-NEXT: cltq ; SSE-NEXT: retq ; @@ -420,11 +366,9 @@ ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vmovmskps %xmm0, %eax +; AVX1-NEXT: negl %eax +; AVX1-NEXT: sbbl %eax, %eax ; AVX1-NEXT: cltq ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -434,11 +378,9 @@ ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vmovmskps %xmm0, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: sbbl %eax, %eax ; AVX2-NEXT: cltq ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -470,21 +412,17 @@ ; SSE-LABEL: test_v4i32: ; SSE: # BB#0: ; SSE-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbl %eax, %eax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v4i32: ; AVX: # BB#0: ; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vmovmskps %xmm0, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: sbbl %eax, %eax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4i32: @@ -514,11 +452,9 @@ ; SSE-NEXT: pcmpgtd %xmm3, %xmm1 ; SSE-NEXT: pcmpgtd %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbl %eax, %eax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v8i32: @@ -528,25 +464,18 @@ ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vmovmskps 
%ymm0, %eax +; AVX1-NEXT: negl %eax +; AVX1-NEXT: sbbl %eax, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v8i32: ; AVX2: # BB#0: ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vmovmskps %ymm0, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: sbbl %eax, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -581,14 +510,9 @@ ; SSE-NEXT: pcmpgtd %xmm3, %xmm1 ; SSE-NEXT: pcmpgtd %xmm2, %xmm0 ; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbw %ax, %ax ; SSE-NEXT: cwtl ; SSE-NEXT: retq ; @@ -599,13 +523,9 @@ ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: negl %eax +; AVX1-NEXT: sbbw %ax, %ax ; AVX1-NEXT: cwtl ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -615,13 +535,9 @@ ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: sbbw %ax, %ax ; AVX2-NEXT: cwtl ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -656,28 +572,17 @@ ; SSE-LABEL: test_v8i16: ; SSE: # BB#0: ; SSE-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: %AX %AX %EAX +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbw %ax, %ax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v8i16: ; AVX: # BB#0: ; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: %AX %AX %EAX +; AVX-NEXT: vpmovmskb %xmm0, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: sbbw %ax, %ax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8i16: @@ -711,15 +616,9 @@ ; SSE-NEXT: pcmpgtw %xmm3, %xmm1 ; SSE-NEXT: pcmpgtw %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: 
por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: %AX %AX %EAX +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbw %ax, %ax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v16i16: @@ -744,16 +643,9 @@ ; AVX2-LABEL: test_v16i16: ; AVX2: # BB#0: ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: %AX %AX %EAX +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: sbbw %ax, %ax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -792,19 +684,9 @@ ; SSE-NEXT: pcmpgtw %xmm3, %xmm1 ; SSE-NEXT: pcmpgtw %xmm2, %xmm0 ; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pextrb $0, %xmm0, %eax -; SSE-NEXT: movsbl %al, %eax -; SSE-NEXT: # kill: %AX %AX %EAX +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbw %ax, %ax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v16i16_legal: @@ -814,17 +696,9 @@ ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: movsbl %al, %eax -; AVX1-NEXT: # kill: %AX %AX %EAX +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: negl %eax +; AVX1-NEXT: sbbw %ax, %ax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -833,17 +707,9 @@ ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: movsbl %al, %eax -; AVX2-NEXT: # kill: %AX %AX %EAX +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: sbbw %ax, %ax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -882,33 +748,17 @@ ; SSE-LABEL: test_v16i8: ; SSE: # BB#0: ; SSE-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, 
%xmm0 -; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pextrb $0, %xmm0, %eax -; SSE-NEXT: # kill: %AL %AL %EAX +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbb %al, %al ; SSE-NEXT: retq ; ; AVX-LABEL: test_v16i8: ; AVX: # BB#0: ; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: %AL %AL %EAX +; AVX-NEXT: vpmovmskb %xmm0, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: sbbb %al, %al ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v16i8: @@ -946,18 +796,9 @@ ; SSE-NEXT: pcmpgtb %xmm3, %xmm1 ; SSE-NEXT: pcmpgtb %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pextrb $0, %xmm0, %eax -; SSE-NEXT: # kill: %AL %AL %EAX +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: sbbb %al, %al ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v32i8: @@ -984,18 +825,9 @@ ; AVX2-LABEL: test_v32i8: ; AVX2: # BB#0: ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: %AL %AL %EAX +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: sbbb %al, %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ;
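
Note (illustrative only, not part of the patch): a minimal IR sketch of the all_of
reduction shape that combineHorizontalPredicateResult targets, mirroring test_v4i32
above; the function and value names here are hypothetical. The vector compare is
sign-extended to a full-width mask, AND-reduced by halving shuffles, and lane 0 is
extracted. Per the checks above, with this patch the extract lowers to
MOVMSKPS + CMP $15 + CMOV instead of the old shuffle/PAND chain; the any_of form is
identical with 'or' in place of 'and' and compares the mask against zero.

; Hypothetical input IR (all_of of a v4i32 signed-greater-than compare).
define i32 @all_of_v4i32(<4 x i32> %a, <4 x i32> %b) {
  %c  = icmp sgt <4 x i32> %a, %b
  %x  = sext <4 x i1> %c to <4 x i32>
  ; Halving AND reduction: fold lanes {2,3} into {0,1}, then lane 1 into lane 0.
  %s1 = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %r1 = and <4 x i32> %x, %s1
  %s2 = shufflevector <4 x i32> %r1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r2 = and <4 x i32> %r1, %s2
  %e  = extractelement <4 x i32> %r2, i32 0
  ret i32 %e
}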