Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -28627,6 +28627,85 @@
   return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
 }
 
+// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
+static SDValue combineHorizontalPredicateResult(SDNode *Extract,
+                                                SelectionDAG &DAG,
+                                                const X86Subtarget &Subtarget) {
+  // Bail without SSE2 or with AVX512VL (which uses predicate registers).
+  if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
+    return SDValue();
+
+  EVT ExtractVT = Extract->getValueType(0);
+  unsigned BitWidth = ExtractVT.getSizeInBits();
+  if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
+      ExtractVT != MVT::i8)
+    return SDValue();
+
+  // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
+  for (ISD::NodeType Op : {ISD::OR, ISD::AND}) {
+    SDValue Match = matchBinOpReduction(Extract, Op);
+    if (!Match)
+      continue;
+
+    // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
+    // which we can't support here for now.
+    if (Match.getScalarValueSizeInBits() != BitWidth)
+      continue;
+
+    // We require AVX2 for PMOVMSKB for v16i16/v32i8.
+    unsigned MatchSizeInBits = Match.getValueSizeInBits();
+    if (!(MatchSizeInBits == 128 ||
+          (MatchSizeInBits == 256 &&
+           ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
+      return SDValue();
+
+    // Don't bother performing this for 2-element vectors.
+    if (Match.getValueType().getVectorNumElements() <= 2)
+      return SDValue();
+
+    // Check that we are extracting a reduction of all sign bits.
+    if (DAG.ComputeNumSignBits(Match) != BitWidth)
+      return SDValue();
+
+    // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
+    MVT MaskVT;
+    if (64 == BitWidth || 32 == BitWidth)
+      MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
+                                MatchSizeInBits / BitWidth);
+    else
+      MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
+
+    APInt CompareBits;
+    ISD::CondCode CondCode;
+    if (Op == ISD::OR) {
+      // any_of -> MOVMSK != 0
+      CompareBits = APInt::getNullValue(32);
+      CondCode = ISD::CondCode::SETNE;
+    } else {
+      // all_of -> MOVMSK == ((1 << NumElts) - 1)
+      CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
+      CondCode = ISD::CondCode::SETEQ;
+    }
+
+    // Perform the select as i32/i64 and then truncate to avoid partial register
+    // stalls.
+    unsigned ResWidth = std::max(BitWidth, 32u);
+    APInt ResOnes = APInt::getAllOnesValue(ResWidth);
+    APInt ResZero = APInt::getNullValue(ResWidth);
+    EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
+
+    SDLoc DL(Extract);
+    SDValue Res = DAG.getBitcast(MaskVT, Match);
+    Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
+    Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
+                          DAG.getConstant(ResOnes, DL, ResVT),
+                          DAG.getConstant(ResZero, DL, ResVT), CondCode);
+    return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
+  }
+
+  return SDValue();
+}
+
 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
   // PSADBW is only supported on SSE2 and up.
@@ -28738,6 +28817,10 @@
   if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
     return SAD;
 
+  // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
+  if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
+    return Cmp;
+
   // Only operate on vectors of 4 elements, where the alternative shuffling
   // gets to be more expensive.
   if (InputVector.getValueType() != MVT::v4i32)
Index: llvm/trunk/test/CodeGen/X86/vector-compare-all_of.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-compare-all_of.ll
+++ llvm/trunk/test/CodeGen/X86/vector-compare-all_of.ll
@@ -49,27 +49,16 @@
 ; SSE-NEXT: movd %xmm0, %rax
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: test_v4f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v4f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vandpd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX-LABEL: test_v4f64:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vmovmskpd %ymm0, %eax
+; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: cmpl $15, %eax
+; AVX-NEXT: movq $-1, %rax
+; AVX-NEXT: cmovneq %rcx, %rax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
 ;
 ; AVX512-LABEL: test_v4f64:
 ; AVX512: # BB#0:
@@ -98,11 +87,11 @@
 ; SSE-NEXT: cmpltpd %xmm1, %xmm3
 ; SSE-NEXT: cmpltpd %xmm0, %xmm2
 ; SSE-NEXT: packsswb %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: movmskps %xmm2, %eax
+; SSE-NEXT: xorl %ecx, %ecx
+; SSE-NEXT: cmpl $15, %eax
+; SSE-NEXT: movl $-1, %eax
+; SSE-NEXT: cmovnel %ecx, %eax
 ; SSE-NEXT: cltq
 ; SSE-NEXT: retq
 ;
@@ -111,11 +100,11 @@
 ; AVX-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vmovmskps %xmm0, %eax
+; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: cmpl $15, %eax
+; AVX-NEXT: movl $-1, %eax
+; AVX-NEXT: cmovnel %ecx, %eax
 ; AVX-NEXT: cltq
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: retq
@@ -147,21 +136,21 @@
 ; SSE-LABEL: test_v4f32:
 ; SSE: # BB#0:
 ; SSE-NEXT: cmpltps %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: movmskps %xmm1, %eax
+; SSE-NEXT: xorl %ecx, %ecx
+; SSE-NEXT: cmpl $15, %eax
+; SSE-NEXT: movl $-1, %eax
+; SSE-NEXT: cmovnel %ecx, %eax
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_v4f32:
 ; AVX: # BB#0:
 ; AVX-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vandpd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vmovmskps %xmm0, %eax
+; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: cmpl $15, %eax
+; AVX-NEXT: movl $-1, %eax
+; AVX-NEXT: cmovnel %ecx, %eax
 ; AVX-NEXT: retq
 ;
 ; AVX512-LABEL: test_v4f32:
@@ -191,38 +180,23 @@
 ; SSE-NEXT: cmpltps %xmm1, %xmm3
 ; SSE-NEXT: cmpltps %xmm0, %xmm2
 ; SSE-NEXT: andps %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: movmskps %xmm2, %eax
+; SSE-NEXT: xorl %ecx, %ecx
+; SSE-NEXT: cmpl $15, %eax
+; SSE-NEXT: movl $-1, %eax
+; SSE-NEXT: cmovnel %ecx, %eax
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: test_v8f32:
-; AVX1: # BB#0:
-; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v8f32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX-LABEL: test_v8f32:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vmovmskps %ymm0, %eax
+; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: cmpl $255, %eax
+; AVX-NEXT: movl $-1, %eax
+; AVX-NEXT: cmovnel %ecx, %eax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
 ;
 ; AVX512-LABEL: test_v8f32:
 ; AVX512: # BB#0:
@@ -255,15 +229,11 @@
 ; SSE-NEXT: cmpltps %xmm1, %xmm3
 ; SSE-NEXT: cmpltps %xmm0, %xmm2
 ; SSE-NEXT: packsswb %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: cwtl
+; SSE-NEXT: pmovmskb %xmm2, %eax
+; SSE-NEXT: xorl %ecx, %ecx
+; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; SSE-NEXT: movl $-1, %eax
+; SSE-NEXT: cmovnel %ecx, %eax
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_v8f32_legal:
@@ -271,14 +241,11 @@
 ; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: cwtl
+; AVX-NEXT: vpmovmskb %xmm0, %eax
+; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; AVX-NEXT: movl $-1, %eax
+; AVX-NEXT: cmovnel %ecx, %eax
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: retq
 ;
@@ -360,21 +327,22 @@
 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vmovmskpd %ymm0, %eax
+; AVX1-NEXT: xorl %ecx, %ecx
+; AVX1-NEXT: cmpl $15, %eax
+; AVX1-NEXT: movq $-1, %rax
+; AVX1-NEXT: cmovneq %rcx, %rax
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: test_v4i64:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vmovmskpd %ymm0, %eax
+; AVX2-NEXT: xorl %ecx, %ecx
+; AVX2-NEXT: cmpl $15, %eax
+; AVX2-NEXT: movq $-1, %rax
+; AVX2-NEXT: cmovneq %rcx, %rax
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -405,11 +373,11 @@
 ; SSE-NEXT: pcmpgtq %xmm3, %xmm1
 ; SSE-NEXT: pcmpgtq %xmm2, %xmm0
 ; SSE-NEXT: packsswb %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: movmskps %xmm0, %eax
+; SSE-NEXT: xorl %ecx, %ecx
+; SSE-NEXT: cmpl $15, %eax
+; SSE-NEXT: movl $-1, %eax
+; SSE-NEXT: cmovnel %ecx, %eax
 ; SSE-NEXT: cltq
 ; SSE-NEXT: retq
 ;
@@ -420,11 +388,11 @@
 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: vmovmskps %xmm0, %eax
+; AVX1-NEXT: xorl %ecx, %ecx
+; AVX1-NEXT: cmpl $15, %eax
+; AVX1-NEXT: movl $-1, %eax
+; AVX1-NEXT: cmovnel %ecx, %eax
 ; AVX1-NEXT: cltq
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -434,11 +402,11 @@
 ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: vmovmskps %xmm0, %eax
+; AVX2-NEXT: xorl %ecx, %ecx
+; AVX2-NEXT: cmpl $15, %eax
+; AVX2-NEXT: movl $-1, %eax
+; AVX2-NEXT: cmovnel %ecx, %eax
 ; AVX2-NEXT: cltq
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -470,21 +438,21 @@
 ; SSE-LABEL: test_v4i32:
 ; SSE: # BB#0:
 ; SSE-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: movmskps %xmm0, %eax
+; SSE-NEXT: xorl %ecx, %ecx
+; SSE-NEXT: cmpl $15, %eax
+; SSE-NEXT: movl $-1, %eax
+; SSE-NEXT: cmovnel %ecx, %eax
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_v4i32:
 ; AVX: # BB#0:
 ; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vmovmskps %xmm0, %eax
+; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: cmpl $15, %eax
+; AVX-NEXT: movl $-1, %eax
+; AVX-NEXT: cmovnel %ecx, %eax
 ; AVX-NEXT: retq
 ;
 ; AVX512-LABEL: test_v4i32:
@@ -514,11 +482,11 @@
 ; SSE-NEXT: pcmpgtd %xmm3, %xmm1
 ; SSE-NEXT: pcmpgtd %xmm2, %xmm0
 ; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: movmskps %xmm0, %eax
+; SSE-NEXT: xorl %ecx, %ecx
+; SSE-NEXT: cmpl $15, %eax
+; SSE-NEXT: movl $-1, %eax
+; SSE-NEXT: cmovnel %ecx, %eax
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: test_v8i32:
@@ -528,25 +496,22 @@
 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: vmovmskps %ymm0, %eax
+; AVX1-NEXT: xorl %ecx, %ecx
+; AVX1-NEXT: cmpl $255, %eax
+; AVX1-NEXT: movl $-1, %eax
+; AVX1-NEXT: cmovnel %ecx, %eax
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: test_v8i32:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: vmovmskps %ymm0, %eax
+; AVX2-NEXT: xorl %ecx, %ecx
+; AVX2-NEXT: cmpl $255, %eax
+; AVX2-NEXT: movl $-1, %eax
+; AVX2-NEXT: cmovnel %ecx, %eax
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -581,15 +546,11 @@
 ; SSE-NEXT: pcmpgtd %xmm3, %xmm1
 ; SSE-NEXT: pcmpgtd %xmm2, %xmm0
 ; SSE-NEXT: packsswb %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: cwtl
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: xorl %ecx, %ecx
+; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; SSE-NEXT: movl $-1, %eax
+; SSE-NEXT: cmovnel %ecx, %eax
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: test_v8i32_legal:
@@ -599,14 +560,11 @@
 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: cwtl
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: xorl %ecx, %ecx
+; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; AVX1-NEXT: movl $-1, %eax
+; AVX1-NEXT: cmovnel %ecx, %eax
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
@@ -615,14 +573,11 @@
 ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: cwtl
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: xorl %ecx, %ecx
+; AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; AVX2-NEXT: movl $-1, %eax
+; AVX2-NEXT: cmovnel %ecx, %eax
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -656,27 +611,22 @@
 ; SSE-LABEL: test_v8i16:
 ; SSE: # BB#0:
 ; SSE-NEXT: pcmpgtw %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: xorl %ecx, %ecx
+; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; SSE-NEXT: movl $-1, %eax
+; SSE-NEXT: cmovnel %ecx, %eax
 ; SSE-NEXT: # kill: %AX %AX %EAX
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_v8i16:
 ; AVX: # BB#0:
 ; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vpmovmskb %xmm0, %eax
+; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; AVX-NEXT: movl $-1, %eax
+; AVX-NEXT: cmovnel %ecx, %eax
 ; AVX-NEXT: # kill: %AX %AX %EAX
 ; AVX-NEXT: retq
 ;
@@ -711,14 +661,11 @@
 ; SSE-NEXT: pcmpgtw %xmm3, %xmm1
 ; SSE-NEXT: pcmpgtw %xmm2, %xmm0
 ; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: xorl %ecx, %ecx
+; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; SSE-NEXT: movl $-1, %eax
+; SSE-NEXT: cmovnel %ecx, %eax
 ; SSE-NEXT: # kill: %AX %AX %EAX
 ; SSE-NEXT: retq
 ;
@@ -744,15 +691,10 @@
 ; AVX2-LABEL: test_v16i16:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: vpmovmskb %ymm0, %ecx
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: cmpl $-1, %ecx
+; AVX2-NEXT: cmovel %ecx, %eax
 ; AVX2-NEXT: # kill: %AX %AX %EAX
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -792,18 +734,11 @@
 ; SSE-NEXT: pcmpgtw %xmm3, %xmm1
 ; SSE-NEXT: pcmpgtw %xmm2, %xmm0
 ; SSE-NEXT: packsswb %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pextrb $0, %xmm0, %eax
-; SSE-NEXT: movsbl %al, %eax
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: xorl %ecx, %ecx
+; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; SSE-NEXT: movl $-1, %eax
+; SSE-NEXT: cmovnel %ecx, %eax
 ; SSE-NEXT: # kill: %AX %AX %EAX
 ; SSE-NEXT: retq
 ;
@@ -814,16 +749,11 @@
 ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: movsbl %al, %eax
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: xorl %ecx, %ecx
+; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; AVX1-NEXT: movl $-1, %eax
+; AVX1-NEXT: cmovnel %ecx, %eax
 ; AVX1-NEXT: # kill: %AX %AX %EAX
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -833,16 +763,11 @@
 ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: movsbl %al, %eax
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: xorl %ecx, %ecx
+; AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; AVX2-NEXT: movl $-1, %eax
+; AVX2-NEXT: cmovnel %ecx, %eax
 ; AVX2-NEXT: # kill: %AX %AX %EAX
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -882,32 +807,22 @@
 ; SSE-LABEL: test_v16i8:
 ; SSE: # BB#0:
 ; SSE-NEXT: pcmpgtb %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pextrb $0, %xmm0, %eax
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: xorl %ecx, %ecx
+; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; SSE-NEXT: movl $-1, %eax
+; SSE-NEXT: cmovnel %ecx, %eax
 ; SSE-NEXT: # kill: %AL %AL %EAX
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_v16i8:
 ; AVX: # BB#0:
 ; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: vpmovmskb %xmm0, %eax
+; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; AVX-NEXT: movl $-1, %eax
+; AVX-NEXT: cmovnel %ecx, %eax
 ; AVX-NEXT: # kill: %AL %AL %EAX
 ; AVX-NEXT: retq
 ;
@@ -946,17 +861,11 @@
 ; SSE-NEXT: pcmpgtb %xmm3, %xmm1
 ; SSE-NEXT: pcmpgtb %xmm2, %xmm0
 ; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pextrb $0, %xmm0, %eax
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: xorl %ecx, %ecx
+; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; SSE-NEXT: movl $-1, %eax
+; SSE-NEXT: cmovnel %ecx, %eax
 ; SSE-NEXT: # kill: %AL %AL %EAX
 ; SSE-NEXT: retq
 ;
@@ -984,17 +893,10 @@
 ; AVX2-LABEL: test_v32i8:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: vpmovmskb %ymm0, %ecx
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: cmpl $-1, %ecx
+; AVX2-NEXT: cmovel %ecx, %eax
 ; AVX2-NEXT: # kill: %AL %AL %EAX
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
Index: llvm/trunk/test/CodeGen/X86/vector-compare-any_of.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-compare-any_of.ll
+++ llvm/trunk/test/CodeGen/X86/vector-compare-any_of.ll
@@ -49,27 +49,14 @@
 ; SSE-NEXT: movd %xmm0, %rax
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: test_v4f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vorpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vorpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v4f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vorpd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX-LABEL: test_v4f64:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vmovmskpd %ymm0, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: sbbq %rax, %rax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
 ;
 ; AVX512-LABEL: test_v4f64:
 ; AVX512: # BB#0:
@@ -98,11 +85,9 @@
 ; SSE-NEXT: cmpltpd %xmm1, %xmm3
 ; SSE-NEXT: cmpltpd %xmm0, %xmm2
 ; SSE-NEXT: packsswb %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT: por %xmm2, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: movmskps %xmm2, %eax
+; SSE-NEXT: negl %eax
+; SSE-NEXT: sbbl %eax, %eax
 ; SSE-NEXT: cltq
 ; SSE-NEXT: retq
 ;
@@ -111,11 +96,9 @@
 ; AVX-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vmovmskps %xmm0, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: sbbl %eax, %eax
 ; AVX-NEXT: cltq
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: retq
@@ -147,21 +130,17 @@
 ; SSE-LABEL: test_v4f32:
 ; SSE: # BB#0:
 ; SSE-NEXT: cmpltps %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: movmskps %xmm1, %eax
+; SSE-NEXT: negl %eax
+; SSE-NEXT: sbbl %eax, %eax
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_v4f32:
 ; AVX: # BB#0:
 ; AVX-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vmovmskps %xmm0, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: sbbl %eax, %eax
 ; AVX-NEXT: retq
 ;
 ; AVX512-LABEL: test_v4f32:
@@ -191,38 +170,19 @@
 ; SSE-NEXT: cmpltps %xmm1, %xmm3
 ; SSE-NEXT: cmpltps %xmm0, %xmm2
 ; SSE-NEXT: orps %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT: por %xmm2, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: movmskps %xmm2, %eax
+; SSE-NEXT: negl %eax
+; SSE-NEXT: sbbl %eax, %eax
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: test_v8f32:
-; AVX1: # BB#0:
-; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v8f32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX-LABEL: test_v8f32:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vmovmskps %ymm0, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: sbbl %eax, %eax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
 ;
 ; AVX512-LABEL: test_v8f32:
 ; AVX512: # BB#0:
@@ -255,15 +215,9 @@
 ; SSE-NEXT: cmpltps %xmm1, %xmm3
 ; SSE-NEXT: cmpltps %xmm0, %xmm2
 ; SSE-NEXT: packsswb %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT: por %xmm2, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: cwtl
+; SSE-NEXT: pmovmskb %xmm2, %eax
+; SSE-NEXT: negl %eax
+; SSE-NEXT: sbbl %eax, %eax
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_v8f32_legal:
@@ -271,14 +225,9 @@
 ; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: cwtl
+; AVX-NEXT: vpmovmskb %xmm0, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: sbbl %eax, %eax
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: retq
 ;
@@ -360,21 +309,18 @@
 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vmovmskpd %ymm0, %eax
+; AVX1-NEXT: negl %eax
+; AVX1-NEXT: sbbq %rax, %rax
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: test_v4i64:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vmovmskpd %ymm0, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: sbbq %rax, %rax
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -405,11 +351,9 @@
 ; SSE-NEXT: pcmpgtq %xmm3, %xmm1
 ; SSE-NEXT: pcmpgtq %xmm2, %xmm0
 ; SSE-NEXT: packsswb %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: movmskps %xmm0, %eax
+; SSE-NEXT: negl %eax
+; SSE-NEXT: sbbl %eax, %eax
 ; SSE-NEXT: cltq
 ; SSE-NEXT: retq
 ;
@@ -420,11 +364,9 @@
 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: vmovmskps %xmm0, %eax
+; AVX1-NEXT: negl %eax
+; AVX1-NEXT: sbbl %eax, %eax
 ; AVX1-NEXT: cltq
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -434,11 +376,9 @@
 ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: vmovmskps %xmm0, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: sbbl %eax, %eax
 ; AVX2-NEXT: cltq
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -470,21 +410,17 @@
 ; SSE-LABEL: test_v4i32:
 ; SSE: # BB#0:
 ; SSE-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: movmskps %xmm0, %eax
+; SSE-NEXT: negl %eax
+; SSE-NEXT: sbbl %eax, %eax
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_v4i32:
 ; AVX: # BB#0:
 ; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vmovmskps %xmm0, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: sbbl %eax, %eax
 ; AVX-NEXT: retq
 ;
 ; AVX512-LABEL: test_v4i32:
@@ -514,11 +450,9 @@
 ; SSE-NEXT: pcmpgtd %xmm3, %xmm1
 ; SSE-NEXT: pcmpgtd %xmm2, %xmm0
 ; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: movmskps %xmm0, %eax
+; SSE-NEXT: negl %eax
+; SSE-NEXT: sbbl %eax, %eax
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: test_v8i32:
@@ -528,25 +462,18 @@
 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: vmovmskps %ymm0, %eax
+; AVX1-NEXT: negl %eax
+; AVX1-NEXT: sbbl %eax, %eax
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: test_v8i32:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: vmovmskps %ymm0, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: sbbl %eax, %eax
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -581,15 +508,9 @@
 ; SSE-NEXT: pcmpgtd %xmm3, %xmm1
 ; SSE-NEXT: pcmpgtd %xmm2, %xmm0
 ; SSE-NEXT: packsswb %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: cwtl
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: negl %eax
+; SSE-NEXT: sbbl %eax, %eax
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: test_v8i32_legal:
@@ -599,14 +520,9 @@
 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: cwtl
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: negl %eax
+; AVX1-NEXT: sbbl %eax, %eax
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
@@ -615,14 +531,9 @@
 ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: cwtl
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: sbbl %eax, %eax
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -656,27 +567,18 @@
 ; SSE-LABEL: test_v8i16:
 ; SSE: # BB#0:
 ; SSE-NEXT: pcmpgtw %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: negl %eax
+; SSE-NEXT: sbbl %eax, %eax
 ; SSE-NEXT: # kill: %AX %AX %EAX
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_v8i16:
 ; AVX: # BB#0:
 ; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vpmovmskb %xmm0, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: sbbl %eax, %eax
 ; AVX-NEXT: # kill: %AX %AX %EAX
 ; AVX-NEXT: retq
 ;
@@ -711,14 +613,9 @@
 ; SSE-NEXT: pcmpgtw %xmm3, %xmm1
 ; SSE-NEXT: pcmpgtw %xmm2, %xmm0
 ; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: negl %eax
+; SSE-NEXT: sbbl %eax, %eax
 ; SSE-NEXT: # kill: %AX %AX %EAX
 ; SSE-NEXT: retq
 ;
@@ -744,15 +641,9 @@
 ; AVX2-LABEL: test_v16i16:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: sbbl %eax, %eax
 ; AVX2-NEXT: # kill: %AX %AX %EAX
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -792,18 +683,9 @@
 ; SSE-NEXT: pcmpgtw %xmm3, %xmm1
 ; SSE-NEXT: pcmpgtw %xmm2, %xmm0
 ; SSE-NEXT: packsswb %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pextrb $0, %xmm0, %eax
-; SSE-NEXT: movsbl %al, %eax
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: negl %eax
+; SSE-NEXT: sbbl %eax, %eax
 ; SSE-NEXT: # kill: %AX %AX %EAX
 ; SSE-NEXT: retq
 ;
@@ -814,16 +696,9 @@
 ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: movsbl %al, %eax
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: negl %eax
+; AVX1-NEXT: sbbl %eax, %eax
 ; AVX1-NEXT: # kill: %AX %AX %EAX
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -833,16 +708,9 @@
 ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: movsbl %al, %eax
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: sbbl %eax, %eax
 ; AVX2-NEXT: # kill: %AX %AX %EAX
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -882,32 +750,18 @@
 ; SSE-LABEL: test_v16i8:
 ; SSE: # BB#0:
 ; SSE-NEXT: pcmpgtb %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pextrb $0, %xmm0, %eax
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: negl %eax
+; SSE-NEXT: sbbl %eax, %eax
 ; SSE-NEXT: # kill: %AL %AL %EAX
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_v16i8:
 ; AVX: # BB#0:
 ; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: vpmovmskb %xmm0, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: sbbl %eax, %eax
 ; AVX-NEXT: # kill: %AL %AL %EAX
 ; AVX-NEXT: retq
 ;
@@ -946,17 +800,9 @@
 ; SSE-NEXT: pcmpgtb %xmm3, %xmm1
 ; SSE-NEXT: pcmpgtb %xmm2, %xmm0
 ; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pextrb $0, %xmm0, %eax
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: negl %eax
+; SSE-NEXT: sbbl %eax, %eax
 ; SSE-NEXT: # kill: %AL %AL %EAX
 ; SSE-NEXT: retq
 ;
@@ -984,17 +830,9 @@
 ; AVX2-LABEL: test_v32i8:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: sbbl %eax, %eax
 ; AVX2-NEXT: # kill: %AL %AL %EAX
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
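
Note (illustrative only, not part of the patch): the all_of/any_of shapes matched by
combineHorizontalPredicateResult are AND/OR reductions of a lanewise compare mask whose
elements are all sign bits. The following minimal C sketch, assuming Clang/GCC vector
extensions and hypothetical helper names all_of_gt/any_of_gt, shows source that produces
this kind of reduction (roughly corresponding to the test_v4i32 cases above):

    #include <stdint.h>

    typedef int32_t v4i32 __attribute__((vector_size(16)));

    /* all_of: -1 if every lane of a compares greater than b, else 0. */
    int32_t all_of_gt(v4i32 a, v4i32 b) {
      v4i32 m = a > b;                  /* each lane is 0 or -1 (all sign bits) */
      return m[0] & m[1] & m[2] & m[3]; /* horizontal AND reduction of the mask */
    }

    /* any_of: -1 if any lane of a compares greater than b, else 0. */
    int32_t any_of_gt(v4i32 a, v4i32 b) {
      v4i32 m = a > b;
      return m[0] | m[1] | m[2] | m[3]; /* horizontal OR reduction of the mask */
    }

With the combine, the all_of case is lowered as MOVMSK == ((1 << NumElts) - 1) (the
cmpl $15 / cmov sequences in the updated checks for 4-lane masks), and the any_of case
as MOVMSK != 0 (the negl/sbbl sequences), replacing the previous shuffle+AND/OR chains.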