diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -24339,12 +24339,12 @@
   return true;
 }
 
-// Helper function for comparing all bits of a vector against zero.
-static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
-                                  const APInt &Mask,
-                                  const X86Subtarget &Subtarget,
-                                  SelectionDAG &DAG, X86::CondCode &X86CC) {
-  EVT VT = V.getValueType();
+// Helper function for comparing all bits of two vectors.
+static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
+                                   ISD::CondCode CC, const APInt &Mask,
+                                   const X86Subtarget &Subtarget,
+                                   SelectionDAG &DAG, X86::CondCode &X86CC) {
+  EVT VT = LHS.getValueType();
   unsigned ScalarSize = VT.getScalarSizeInBits();
   if (Mask.getBitWidth() != ScalarSize) {
     assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
@@ -24372,8 +24372,8 @@
     if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT))
       return SDValue();
     return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
-                       DAG.getBitcast(IntVT, MaskBits(V)),
-                       DAG.getConstant(0, DL, IntVT));
+                       DAG.getBitcast(IntVT, MaskBits(LHS)),
+                       DAG.getBitcast(IntVT, MaskBits(RHS)));
   }
 
   // Without PTEST, a masked v2i64 or-reduction is not faster than
@@ -24385,36 +24385,51 @@
   // Split down to 128/256/512-bit vector.
   unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
-  while (VT.getSizeInBits() > TestSize) {
-    auto Split = DAG.SplitVector(V, DL);
-    VT = Split.first.getValueType();
-    V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
+  if (VT.getSizeInBits() > TestSize) {
+    // Convert to an ICMP_EQ(XOR(LHS,RHS),0) pattern.
+    SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
+    while (VT.getSizeInBits() > TestSize) {
+      auto Split = DAG.SplitVector(V, DL);
+      VT = Split.first.getValueType();
+      V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
+    }
+    LHS = V;
+    RHS = DAG.getConstant(0, DL, VT);
   }
 
   if (UseKORTEST && VT.is512BitVector()) {
-    V = DAG.getBitcast(MVT::v16i32, MaskBits(V));
-    V = DAG.getSetCC(DL, MVT::v16i1, V,
-                     getZeroVector(MVT::v16i32, Subtarget, DAG, DL),
-                     ISD::SETNE);
+    LHS = DAG.getBitcast(MVT::v16i32, MaskBits(LHS));
+    RHS = DAG.getBitcast(MVT::v16i32, MaskBits(RHS));
+    SDValue V = DAG.getSetCC(DL, MVT::v16i1, LHS, RHS, ISD::SETNE);
     return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
   }
 
   if (UsePTEST) {
     MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
-    V = DAG.getBitcast(TestVT, MaskBits(V));
+    LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
+    RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
+    SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
     return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
   }
 
   MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
-  V = DAG.getBitcast(MaskVT, MaskBits(V));
-  V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, V,
-                  getZeroVector(MaskVT, Subtarget, DAG, DL));
+  LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
+  RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
+  SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
   V = DAG.getNOT(DL, V, MaskVT);
   V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
   return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
                      DAG.getConstant(0, DL, MVT::i32));
 }
 
+static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
+                                  const APInt &Mask,
+                                  const X86Subtarget &Subtarget,
+                                  SelectionDAG &DAG, X86::CondCode &X86CC) {
+  SDValue Z = DAG.getConstant(0, DL, V.getValueType());
+  return LowerVectorAllEqual(DL, V, Z, CC, Mask, Subtarget, DAG, X86CC);
+}
+
 // Check whether an OR'd reduction tree is PTEST-able, or if we can fallback to
 // CMP(MOVMSK(PCMPEQB(X,0))).
 static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
@@ -24483,6 +24498,28 @@
     }
   }
 
+  // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
+  // TODO: Expand to icmp(bitcast(icmp_eq(X,Y)),-1) patterns.
+  if (Mask.isAllOnes()) {
+    assert(!Op.getValueType().isVector() &&
+           "Illegal vector type for reduction pattern");
+    SDValue Src = peekThroughBitcasts(Op);
+    if (Src.getOpcode() == ISD::SETCC &&
+        Src.getValueType().isFixedLengthVector() &&
+        Src.getValueType().getScalarType() == MVT::i1) {
+      ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
+      if (SrcCC == ISD::SETNE) {
+        SDValue LHS = Src.getOperand(0);
+        SDValue RHS = Src.getOperand(1);
+        EVT LHSVT = LHS.getValueType();
+        APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
+        if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask,
+                                            Subtarget, DAG, X86CC))
+          return V;
+      }
+    }
+  }
+
   return SDValue();
 }
diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
--- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
+++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
@@ -2175,23 +2175,14 @@
 ; X64-AVX2-NEXT: vzeroupper
 ; X64-AVX2-NEXT: retq
 ;
-; X64-AVX512BW-LABEL: length64_eq:
-; X64-AVX512BW: # %bb.0:
-; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT: vpcmpneqb (%rsi), %zmm0, %k0
-; X64-AVX512BW-NEXT: kortestq %k0, %k0
-; X64-AVX512BW-NEXT: setne %al
-; X64-AVX512BW-NEXT: vzeroupper
-; X64-AVX512BW-NEXT: retq
-;
-; X64-AVX512F-LABEL: length64_eq:
-; X64-AVX512F: # %bb.0:
-; X64-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT: vpcmpneqd (%rsi), %zmm0, %k0
-; X64-AVX512F-NEXT: kortestw %k0, %k0
-; X64-AVX512F-NEXT: setne %al
-; X64-AVX512F-NEXT: vzeroupper
-; X64-AVX512F-NEXT: retq
+; X64-AVX512-LABEL: length64_eq:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
+; X64-AVX512-NEXT: vpcmpneqd (%rsi), %zmm0, %k0
+; X64-AVX512-NEXT: kortestw %k0, %k0
+; X64-AVX512-NEXT: setne %al
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
 ;
 ; X64-MIC-AVX2-LABEL: length64_eq:
 ; X64-MIC-AVX2: # %bb.0:
@@ -2309,23 +2300,14 @@
 ; X64-AVX2-NEXT: vzeroupper
 ; X64-AVX2-NEXT: retq
 ;
-; X64-AVX512BW-LABEL: length64_eq_const:
-; X64-AVX512BW: # %bb.0:
-; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT: vpcmpneqb .L.str(%rip), %zmm0, %k0
-; X64-AVX512BW-NEXT: kortestq %k0, %k0
-; X64-AVX512BW-NEXT: sete %al
-; X64-AVX512BW-NEXT: vzeroupper
-; X64-AVX512BW-NEXT: retq
-;
-; X64-AVX512F-LABEL: length64_eq_const:
-; X64-AVX512F: # %bb.0:
-; X64-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT: vpcmpneqd 
.L.str(%rip), %zmm0, %k0 -; X64-AVX512F-NEXT: kortestw %k0, %k0 -; X64-AVX512F-NEXT: sete %al -; X64-AVX512F-NEXT: vzeroupper -; X64-AVX512F-NEXT: retq +; X64-AVX512-LABEL: length64_eq_const: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 +; X64-AVX512-NEXT: vpcmpneqd .L.str(%rip), %zmm0, %k0 +; X64-AVX512-NEXT: kortestw %k0, %k0 +; X64-AVX512-NEXT: sete %al +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq ; ; X64-MIC-AVX2-LABEL: length64_eq_const: ; X64-MIC-AVX2: # %bb.0: diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll --- a/llvm/test/CodeGen/X86/memcmp.ll +++ b/llvm/test/CodeGen/X86/memcmp.ll @@ -1868,23 +1868,14 @@ ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; -; X64-AVX512BW-LABEL: length64_eq: -; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-AVX512BW-NEXT: vpcmpneqb (%rsi), %zmm0, %k0 -; X64-AVX512BW-NEXT: kortestq %k0, %k0 -; X64-AVX512BW-NEXT: setne %al -; X64-AVX512BW-NEXT: vzeroupper -; X64-AVX512BW-NEXT: retq -; -; X64-AVX512F-LABEL: length64_eq: -; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-AVX512F-NEXT: vpcmpneqd (%rsi), %zmm0, %k0 -; X64-AVX512F-NEXT: kortestw %k0, %k0 -; X64-AVX512F-NEXT: setne %al -; X64-AVX512F-NEXT: vzeroupper -; X64-AVX512F-NEXT: retq +; X64-AVX512-LABEL: length64_eq: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 +; X64-AVX512-NEXT: vpcmpneqd (%rsi), %zmm0, %k0 +; X64-AVX512-NEXT: kortestw %k0, %k0 +; X64-AVX512-NEXT: setne %al +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq ; ; X64-MIC-AVX2-LABEL: length64_eq: ; X64-MIC-AVX2: # %bb.0: @@ -1978,23 +1969,14 @@ ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; -; X64-AVX512BW-LABEL: length64_eq_const: -; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-AVX512BW-NEXT: vpcmpneqb .L.str(%rip), %zmm0, %k0 -; X64-AVX512BW-NEXT: kortestq %k0, %k0 -; X64-AVX512BW-NEXT: sete %al -; X64-AVX512BW-NEXT: vzeroupper -; X64-AVX512BW-NEXT: retq -; -; X64-AVX512F-LABEL: length64_eq_const: -; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-AVX512F-NEXT: vpcmpneqd .L.str(%rip), %zmm0, %k0 -; X64-AVX512F-NEXT: kortestw %k0, %k0 -; X64-AVX512F-NEXT: sete %al -; X64-AVX512F-NEXT: vzeroupper -; X64-AVX512F-NEXT: retq +; X64-AVX512-LABEL: length64_eq_const: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 +; X64-AVX512-NEXT: vpcmpneqd .L.str(%rip), %zmm0, %k0 +; X64-AVX512-NEXT: kortestw %k0, %k0 +; X64-AVX512-NEXT: sete %al +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq ; ; X64-MIC-AVX2-LABEL: length64_eq_const: ; X64-MIC-AVX2: # %bb.0: diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll --- a/llvm/test/CodeGen/X86/movmsk-cmp.ll +++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll @@ -299,9 +299,8 @@ ; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; KNL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 -; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: testb %al, %al +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kortestw %k0, %k0 ; KNL-NEXT: sete %al ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -1175,36 +1174,25 @@ } define i1 @allzeros_v16i8_and1(<16 x i8> %arg) { -; SSE-LABEL: allzeros_v16i8_and1: -; SSE: # %bb.0: -; SSE-NEXT: psllw $7, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: testl %eax, %eax -; SSE-NEXT: sete %al -; SSE-NEXT: retq -; -; AVX1OR2-LABEL: allzeros_v16i8_and1: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: 
vpsllw $7, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax -; AVX1OR2-NEXT: testl %eax, %eax -; AVX1OR2-NEXT: sete %al -; AVX1OR2-NEXT: retq +; SSE2-LABEL: allzeros_v16i8_and1: +; SSE2: # %bb.0: +; SSE2-NEXT: psllw $7, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq ; -; KNL-LABEL: allzeros_v16i8_and1: -; KNL: # %bb.0: -; KNL-NEXT: vpsllw $7, %xmm0, %xmm0 -; KNL-NEXT: vpmovmskb %xmm0, %eax -; KNL-NEXT: testl %eax, %eax -; KNL-NEXT: sete %al -; KNL-NEXT: retq +; SSE41-LABEL: allzeros_v16i8_and1: +; SSE41: # %bb.0: +; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: sete %al +; SSE41-NEXT: retq ; -; SKX-LABEL: allzeros_v16i8_and1: -; SKX: # %bb.0: -; SKX-NEXT: vptestmb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 -; SKX-NEXT: kortestw %k0, %k0 -; SKX-NEXT: sete %al -; SKX-NEXT: retq +; AVX-LABEL: allzeros_v16i8_and1: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX-NEXT: sete %al +; AVX-NEXT: retq %tmp = and <16 x i8> %arg, %tmp1 = icmp ne <16 x i8> %tmp, zeroinitializer %tmp2 = bitcast <16 x i1> %tmp1 to i16 @@ -1266,51 +1254,28 @@ } define i1 @allzeros_v32i8_and1(<32 x i8> %arg) { -; SSE-LABEL: allzeros_v32i8_and1: -; SSE: # %bb.0: -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: psllw $7, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: testl %eax, %eax -; SSE-NEXT: sete %al -; SSE-NEXT: retq -; -; AVX1-LABEL: allzeros_v32i8_and1: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: testl %eax, %eax -; AVX1-NEXT: sete %al -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: allzeros_v32i8_and1: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0 -; AVX2-NEXT: vpmovmskb %ymm0, %eax -; AVX2-NEXT: testl %eax, %eax -; AVX2-NEXT: sete %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; SSE2-LABEL: allzeros_v32i8_and1: +; SSE2: # %bb.0: +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: psllw $7, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq ; -; KNL-LABEL: allzeros_v32i8_and1: -; KNL: # %bb.0: -; KNL-NEXT: vpsllw $7, %ymm0, %ymm0 -; KNL-NEXT: vpmovmskb %ymm0, %eax -; KNL-NEXT: testl %eax, %eax -; KNL-NEXT: sete %al -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq +; SSE41-LABEL: allzeros_v32i8_and1: +; SSE41: # %bb.0: +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: sete %al +; SSE41-NEXT: retq ; -; SKX-LABEL: allzeros_v32i8_and1: -; SKX: # %bb.0: -; SKX-NEXT: vptestmb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0 -; SKX-NEXT: kortestd %k0, %k0 -; SKX-NEXT: sete %al -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; AVX-LABEL: allzeros_v32i8_and1: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX-NEXT: sete %al +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %tmp = and <32 x i8> %arg, %tmp1 = icmp ne <32 x i8> %tmp, zeroinitializer %tmp2 = bitcast <32 x i1> %tmp1 to i32 @@ -1380,27 +1345,30 @@ } define i1 @allzeros_v64i8_and1(<64 x i8> %arg) { -; SSE-LABEL: allzeros_v64i8_and1: -; SSE: # %bb.0: -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: psllw $7, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: testl %eax, %eax -; SSE-NEXT: sete %al -; SSE-NEXT: retq +; SSE2-LABEL: allzeros_v64i8_and1: +; SSE2: # %bb.0: +; SSE2-NEXT: por %xmm3, %xmm1 +; 
SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: psllw $7, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq +; +; SSE41-LABEL: allzeros_v64i8_and1: +; SSE41: # %bb.0: +; SSE41-NEXT: por %xmm3, %xmm1 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: sete %al +; SSE41-NEXT: retq ; ; AVX1-LABEL: allzeros_v64i8_and1: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1408,31 +1376,18 @@ ; AVX2-LABEL: allzeros_v64i8_and1: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0 -; AVX2-NEXT: vpmovmskb %ymm0, %eax -; AVX2-NEXT: testl %eax, %eax +; AVX2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; KNL-LABEL: allzeros_v64i8_and1: -; KNL: # %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; KNL-NEXT: vpor %ymm1, %ymm0, %ymm0 -; KNL-NEXT: vpsllw $7, %ymm0, %ymm0 -; KNL-NEXT: vpmovmskb %ymm0, %eax -; KNL-NEXT: testl %eax, %eax -; KNL-NEXT: sete %al -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: allzeros_v64i8_and1: -; SKX: # %bb.0: -; SKX-NEXT: vptestmb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 -; SKX-NEXT: kortestq %k0, %k0 -; SKX-NEXT: sete %al -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; AVX512-LABEL: allzeros_v64i8_and1: +; AVX512: # %bb.0: +; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: sete %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %tmp = and <64 x i8> %arg, %tmp1 = icmp ne <64 x i8> %tmp, zeroinitializer %tmp2 = bitcast <64 x i1> %tmp1 to i64 @@ -1485,40 +1440,25 @@ } define i1 @allzeros_v8i16_and1(<8 x i16> %arg) { -; SSE-LABEL: allzeros_v8i16_and1: -; SSE: # %bb.0: -; SSE-NEXT: psllw $15, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA -; SSE-NEXT: sete %al -; SSE-NEXT: retq -; -; AVX1OR2-LABEL: allzeros_v8i16_and1: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpsllw $15, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax -; AVX1OR2-NEXT: testl $43690, %eax # imm = 0xAAAA -; AVX1OR2-NEXT: sete %al -; AVX1OR2-NEXT: retq +; SSE2-LABEL: allzeros_v8i16_and1: +; SSE2: # %bb.0: +; SSE2-NEXT: psllw $7, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq ; -; KNL-LABEL: allzeros_v8i16_and1: -; KNL: # %bb.0: -; KNL-NEXT: vpsllw $15, %xmm0, %xmm0 -; KNL-NEXT: vpsraw $15, %xmm0, %xmm0 -; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 -; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: testb %al, %al -; KNL-NEXT: sete %al -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq +; SSE41-LABEL: allzeros_v8i16_and1: +; SSE41: # %bb.0: +; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: sete %al +; SSE41-NEXT: retq ; -; SKX-LABEL: allzeros_v8i16_and1: -; SKX: # %bb.0: -; SKX-NEXT: vptestmw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 -; SKX-NEXT: kortestb %k0, %k0 
-; SKX-NEXT: sete %al -; SKX-NEXT: retq +; AVX-LABEL: allzeros_v8i16_and1: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX-NEXT: sete %al +; AVX-NEXT: retq %tmp = and <8 x i16> %arg, %tmp1 = icmp ne <8 x i16> %tmp, zeroinitializer %tmp2 = bitcast <8 x i1> %tmp1 to i8 @@ -1662,68 +1602,49 @@ } define i1 @allzeros_v32i16_and1(<32 x i16> %arg) { -; SSE-LABEL: allzeros_v32i16_and1: -; SSE: # %bb.0: -; SSE-NEXT: psllw $15, %xmm3 -; SSE-NEXT: psllw $15, %xmm2 -; SSE-NEXT: packsswb %xmm3, %xmm2 -; SSE-NEXT: psllw $15, %xmm1 -; SSE-NEXT: psllw $15, %xmm0 -; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: testl %eax, %eax -; SSE-NEXT: sete %al -; SSE-NEXT: retq +; SSE2-LABEL: allzeros_v32i16_and1: +; SSE2: # %bb.0: +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: psllw $7, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq +; +; SSE41-LABEL: allzeros_v32i16_and1: +; SSE41: # %bb.0: +; SSE41-NEXT: por %xmm3, %xmm1 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: sete %al +; SSE41-NEXT: retq ; ; AVX1-LABEL: allzeros_v32i16_and1: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpsllw $15, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw $15, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpsllw $15, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v32i16_and1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllw $15, %ymm1, %ymm1 -; AVX2-NEXT: vpsllw $15, %ymm0, %ymm0 -; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmovmskb %ymm0, %eax -; AVX2-NEXT: testl %eax, %eax +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; KNL-LABEL: allzeros_v32i16_and1: -; KNL: # %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; KNL-NEXT: vpor %ymm1, %ymm0, %ymm0 -; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 -; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kortestw %k0, %k0 -; KNL-NEXT: sete %al -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: allzeros_v32i16_and1: -; SKX: # %bb.0: -; SKX-NEXT: vptestmw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 -; SKX-NEXT: kortestd %k0, %k0 -; SKX-NEXT: sete %al -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; AVX512-LABEL: allzeros_v32i16_and1: +; AVX512: # %bb.0: +; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: sete %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %tmp = and <32 x i16> %arg, %tmp1 = icmp ne <32 x i16> %tmp, zeroinitializer %tmp2 = bitcast <32 x i1> %tmp1 to i32 @@ -1732,55 +1653,28 @@ } define i1 @allzeros_v16i16_and1(<16 x i16> %arg) { -; SSE-LABEL: allzeros_v16i16_and1: -; SSE: # %bb.0: -; SSE-NEXT: psllw $15, %xmm1 -; SSE-NEXT: psllw $15, %xmm0 -; SSE-NEXT: packsswb %xmm1, %xmm0 
-; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: testl %eax, %eax -; SSE-NEXT: sete %al -; SSE-NEXT: retq +; SSE2-LABEL: allzeros_v16i16_and1: +; SSE2: # %bb.0: +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: psllw $7, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq ; -; AVX1-LABEL: allzeros_v16i16_and1: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsllw $15, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: testl %eax, %eax -; AVX1-NEXT: sete %al -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq +; SSE41-LABEL: allzeros_v16i16_and1: +; SSE41: # %bb.0: +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: sete %al +; SSE41-NEXT: retq ; -; AVX2-LABEL: allzeros_v16i16_and1: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllw $15, %ymm0, %ymm0 -; AVX2-NEXT: vpmovmskb %ymm0, %eax -; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA -; AVX2-NEXT: sete %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; KNL-LABEL: allzeros_v16i16_and1: -; KNL: # %bb.0: -; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 -; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kortestw %k0, %k0 -; KNL-NEXT: sete %al -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: allzeros_v16i16_and1: -; SKX: # %bb.0: -; SKX-NEXT: vptestmw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0 -; SKX-NEXT: kortestw %k0, %k0 -; SKX-NEXT: sete %al -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; AVX-LABEL: allzeros_v16i16_and1: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX-NEXT: sete %al +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %tmp = and <16 x i16> %arg, %tmp1 = icmp ne <16 x i16> %tmp, zeroinitializer %tmp2 = bitcast <16 x i1> %tmp1 to i16 @@ -1830,38 +1724,39 @@ } define i1 @allzeros_v4i32_and1(<4 x i32> %arg) { -; SSE-LABEL: allzeros_v4i32_and1: -; SSE: # %bb.0: -; SSE-NEXT: pslld $31, %xmm0 -; SSE-NEXT: movmskps %xmm0, %eax -; SSE-NEXT: testl %eax, %eax -; SSE-NEXT: sete %al -; SSE-NEXT: retq +; SSE2-LABEL: allzeros_v4i32_and1: +; SSE2: # %bb.0: +; SSE2-NEXT: pslld $31, %xmm0 +; SSE2-NEXT: movmskps %xmm0, %eax +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq ; -; AVX1OR2-LABEL: allzeros_v4i32_and1: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX1OR2-NEXT: vmovmskps %xmm0, %eax -; AVX1OR2-NEXT: testl %eax, %eax -; AVX1OR2-NEXT: sete %al -; AVX1OR2-NEXT: retq +; SSE41-LABEL: allzeros_v4i32_and1: +; SSE41: # %bb.0: +; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: sete %al +; SSE41-NEXT: retq ; -; KNL-LABEL: allzeros_v4i32_and1: -; KNL: # %bb.0: -; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: testb $15, %al -; KNL-NEXT: sete %al -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq +; AVX1-LABEL: allzeros_v4i32_and1: +; AVX1: # %bb.0: +; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1-NEXT: sete %al +; AVX1-NEXT: retq ; -; SKX-LABEL: allzeros_v4i32_and1: -; SKX: # %bb.0: -; SKX-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k0 -; SKX-NEXT: kortestb %k0, %k0 -; SKX-NEXT: sete %al -; SKX-NEXT: retq +; AVX2-LABEL: allzeros_v4i32_and1: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = 
[1,1,1,1] +; AVX2-NEXT: vptest %xmm1, %xmm0 +; AVX2-NEXT: sete %al +; AVX2-NEXT: retq +; +; AVX512-LABEL: allzeros_v4i32_and1: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; AVX512-NEXT: vptest %xmm1, %xmm0 +; AVX512-NEXT: sete %al +; AVX512-NEXT: retq %tmp = and <4 x i32> %arg, %tmp1 = icmp ne <4 x i32> %tmp, zeroinitializer %tmp2 = bitcast <4 x i1> %tmp1 to i4 @@ -1926,53 +1821,44 @@ } define i1 @allzeros_v8i32_and1(<8 x i32> %arg) { -; SSE-LABEL: allzeros_v8i32_and1: -; SSE: # %bb.0: -; SSE-NEXT: pslld $31, %xmm1 -; SSE-NEXT: pslld $31, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA -; SSE-NEXT: sete %al -; SSE-NEXT: retq +; SSE2-LABEL: allzeros_v8i32_and1: +; SSE2: # %bb.0: +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pslld $31, %xmm0 +; SSE2-NEXT: movmskps %xmm0, %eax +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq +; +; SSE41-LABEL: allzeros_v8i32_and1: +; SSE41: # %bb.0: +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: sete %al +; SSE41-NEXT: retq ; ; AVX1-LABEL: allzeros_v8i32_and1: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX1-NEXT: vmovmskps %xmm0, %eax -; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v8i32_and1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 -; AVX2-NEXT: vmovmskps %ymm0, %eax -; AVX2-NEXT: testl %eax, %eax +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] +; AVX2-NEXT: vptest %ymm1, %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; KNL-LABEL: allzeros_v8i32_and1: -; KNL: # %bb.0: -; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: testb %al, %al -; KNL-NEXT: sete %al -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: allzeros_v8i32_and1: -; SKX: # %bb.0: -; SKX-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %k0 -; SKX-NEXT: kortestb %k0, %k0 -; SKX-NEXT: sete %al -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; AVX512-LABEL: allzeros_v8i32_and1: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] +; AVX512-NEXT: vptest %ymm1, %ymm0 +; AVX512-NEXT: sete %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %tmp = and <8 x i32> %arg, %tmp1 = icmp ne <8 x i32> %tmp, zeroinitializer %tmp2 = bitcast <8 x i1> %tmp1 to i8 @@ -2040,46 +1926,39 @@ } define i1 @allzeros_v16i32_and1(<16 x i32> %arg) { -; SSE-LABEL: allzeros_v16i32_and1: -; SSE: # %bb.0: -; SSE-NEXT: pslld $31, %xmm3 -; SSE-NEXT: pslld $31, %xmm2 -; SSE-NEXT: packssdw %xmm3, %xmm2 -; SSE-NEXT: pslld $31, %xmm1 -; SSE-NEXT: pslld $31, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packsswb %xmm2, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: testl %eax, %eax -; SSE-NEXT: sete %al -; SSE-NEXT: retq +; SSE2-LABEL: allzeros_v16i32_and1: +; SSE2: # %bb.0: +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pslld $31, %xmm0 +; SSE2-NEXT: movmskps %xmm0, %eax +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq +; +; SSE41-LABEL: allzeros_v16i32_and1: +; SSE41: # %bb.0: +; SSE41-NEXT: por %xmm3, %xmm1 +; 
SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: sete %al +; SSE41-NEXT: retq ; ; AVX1-LABEL: allzeros_v16i32_and1: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpslld $31, %xmm2, %xmm2 -; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpslld $31, %xmm2, %xmm2 -; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v16i32_and1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 -; AVX2-NEXT: vpsrad $31, %ymm1, %ymm1 -; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 -; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 -; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmovmskb %ymm0, %eax -; AVX2-NEXT: testl %eax, %eax +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] +; AVX2-NEXT: vptest %ymm1, %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2141,37 +2020,36 @@ } define i1 @allzeros_v2i64_and1(<2 x i64> %arg) { -; SSE-LABEL: allzeros_v2i64_and1: -; SSE: # %bb.0: -; SSE-NEXT: psllq $63, %xmm0 -; SSE-NEXT: movmskpd %xmm0, %eax -; SSE-NEXT: testl %eax, %eax -; SSE-NEXT: sete %al -; SSE-NEXT: retq +; SSE2-LABEL: allzeros_v2i64_and1: +; SSE2: # %bb.0: +; SSE2-NEXT: pslld $31, %xmm0 +; SSE2-NEXT: movmskps %xmm0, %eax +; SSE2-NEXT: testb $5, %al +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq +; +; SSE41-LABEL: allzeros_v2i64_and1: +; SSE41: # %bb.0: +; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: sete %al +; SSE41-NEXT: retq ; ; AVX1OR2-LABEL: allzeros_v2i64_and1: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpsllq $63, %xmm0, %xmm0 -; AVX1OR2-NEXT: vmovmskpd %xmm0, %eax -; AVX1OR2-NEXT: testl %eax, %eax +; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AVX1OR2-NEXT: sete %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: allzeros_v2i64_and1: ; KNL: # %bb.0: -; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1] -; KNL-NEXT: vptestmq %zmm1, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: testb $3, %al +; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; KNL-NEXT: sete %al -; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: allzeros_v2i64_and1: ; SKX: # %bb.0: -; SKX-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k0 -; SKX-NEXT: kortestb %k0, %k0 +; SKX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] +; SKX-NEXT: vptest %xmm1, %xmm0 ; SKX-NEXT: sete %al ; SKX-NEXT: retq %tmp = and <2 x i64> %arg, @@ -2238,53 +2116,44 @@ } define i1 @allzeros_v4i64_and1(<4 x i64> %arg) { -; SSE-LABEL: allzeros_v4i64_and1: -; SSE: # %bb.0: -; SSE-NEXT: psllq $63, %xmm1 -; SSE-NEXT: psllq $63, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: movmskps %xmm0, %eax -; SSE-NEXT: testl %eax, %eax -; SSE-NEXT: sete %al -; SSE-NEXT: retq +; SSE2-LABEL: allzeros_v4i64_and1: +; SSE2: # %bb.0: +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pslld $31, %xmm0 +; SSE2-NEXT: movmskps %xmm0, %eax +; SSE2-NEXT: testb $5, %al +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq +; +; SSE41-LABEL: allzeros_v4i64_and1: +; SSE41: # %bb.0: +; SSE41-NEXT: por %xmm1, %xmm0 +; 
SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: sete %al +; SSE41-NEXT: retq ; ; AVX1-LABEL: allzeros_v4i64_and1: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0 -; AVX1-NEXT: vmovmskpd %xmm0, %eax -; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v4i64_and1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $63, %ymm0, %ymm0 -; AVX2-NEXT: vmovmskpd %ymm0, %eax -; AVX2-NEXT: testl %eax, %eax +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] +; AVX2-NEXT: vptest %ymm1, %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; KNL-LABEL: allzeros_v4i64_and1: -; KNL: # %bb.0: -; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: testb $15, %al -; KNL-NEXT: sete %al -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: allzeros_v4i64_and1: -; SKX: # %bb.0: -; SKX-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k0 -; SKX-NEXT: kortestb %k0, %k0 -; SKX-NEXT: sete %al -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; AVX512-LABEL: allzeros_v4i64_and1: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] +; AVX512-NEXT: vptest %ymm1, %ymm0 +; AVX512-NEXT: sete %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %tmp = and <4 x i64> %arg, %tmp1 = icmp ne <4 x i64> %tmp, zeroinitializer %tmp2 = bitcast <4 x i1> %tmp1 to i4 @@ -2360,64 +2229,51 @@ } define i1 @allzeros_v8i64_and1(<8 x i64> %arg) { -; SSE-LABEL: allzeros_v8i64_and1: -; SSE: # %bb.0: -; SSE-NEXT: psllq $63, %xmm3 -; SSE-NEXT: psllq $63, %xmm2 -; SSE-NEXT: packssdw %xmm3, %xmm2 -; SSE-NEXT: psllq $63, %xmm1 -; SSE-NEXT: psllq $63, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packssdw %xmm2, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA -; SSE-NEXT: sete %al -; SSE-NEXT: retq +; SSE2-LABEL: allzeros_v8i64_and1: +; SSE2: # %bb.0: +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pslld $31, %xmm0 +; SSE2-NEXT: movmskps %xmm0, %eax +; SSE2-NEXT: testb $5, %al +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq +; +; SSE41-LABEL: allzeros_v8i64_and1: +; SSE41: # %bb.0: +; SSE41-NEXT: por %xmm3, %xmm1 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: sete %al +; SSE41-NEXT: retq ; ; AVX1-LABEL: allzeros_v8i64_and1: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpsllq $63, %xmm2, %xmm2 -; AVX1-NEXT: vpsllq $63, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpsllq $63, %xmm2, %xmm2 -; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovmskps %xmm0, %eax -; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v8i64_and1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $63, %ymm1, %ymm1 -; AVX2-NEXT: vpsllq $63, %ymm0, %ymm0 -; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovmskps %ymm0, %eax -; AVX2-NEXT: testl %eax, %eax +; 
AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] +; AVX2-NEXT: vptest %ymm1, %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; KNL-LABEL: allzeros_v8i64_and1: -; KNL: # %bb.0: -; KNL-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: testb %al, %al -; KNL-NEXT: sete %al -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: allzeros_v8i64_and1: -; SKX: # %bb.0: -; SKX-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k0 -; SKX-NEXT: kortestb %k0, %k0 -; SKX-NEXT: sete %al -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; AVX512-LABEL: allzeros_v8i64_and1: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1] +; AVX512-NEXT: vptestmd %zmm1, %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: sete %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %tmp = and <8 x i64> %arg, %tmp1 = icmp ne <8 x i64> %tmp, zeroinitializer %tmp2 = bitcast <8 x i1> %tmp1 to i8 @@ -2464,36 +2320,25 @@ } define i1 @allzeros_v16i8_and4(<16 x i8> %arg) { -; SSE-LABEL: allzeros_v16i8_and4: -; SSE: # %bb.0: -; SSE-NEXT: psllw $5, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: testl %eax, %eax -; SSE-NEXT: sete %al -; SSE-NEXT: retq -; -; AVX1OR2-LABEL: allzeros_v16i8_and4: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpsllw $5, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax -; AVX1OR2-NEXT: testl %eax, %eax -; AVX1OR2-NEXT: sete %al -; AVX1OR2-NEXT: retq +; SSE2-LABEL: allzeros_v16i8_and4: +; SSE2: # %bb.0: +; SSE2-NEXT: psllw $5, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq ; -; KNL-LABEL: allzeros_v16i8_and4: -; KNL: # %bb.0: -; KNL-NEXT: vpsllw $5, %xmm0, %xmm0 -; KNL-NEXT: vpmovmskb %xmm0, %eax -; KNL-NEXT: testl %eax, %eax -; KNL-NEXT: sete %al -; KNL-NEXT: retq +; SSE41-LABEL: allzeros_v16i8_and4: +; SSE41: # %bb.0: +; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: sete %al +; SSE41-NEXT: retq ; -; SKX-LABEL: allzeros_v16i8_and4: -; SKX: # %bb.0: -; SKX-NEXT: vptestmb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 -; SKX-NEXT: kortestw %k0, %k0 -; SKX-NEXT: sete %al -; SKX-NEXT: retq +; AVX-LABEL: allzeros_v16i8_and4: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX-NEXT: sete %al +; AVX-NEXT: retq %tmp = and <16 x i8> %arg, %tmp1 = icmp ne <16 x i8> %tmp, zeroinitializer %tmp2 = bitcast <16 x i1> %tmp1 to i16 @@ -2555,51 +2400,28 @@ } define i1 @allzeros_v32i8_and4(<32 x i8> %arg) { -; SSE-LABEL: allzeros_v32i8_and4: -; SSE: # %bb.0: -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: psllw $5, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: testl %eax, %eax -; SSE-NEXT: sete %al -; SSE-NEXT: retq -; -; AVX1-LABEL: allzeros_v32i8_and4: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: testl %eax, %eax -; AVX1-NEXT: sete %al -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: allzeros_v32i8_and4: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllw $5, %ymm0, %ymm0 -; AVX2-NEXT: vpmovmskb %ymm0, %eax -; AVX2-NEXT: testl %eax, %eax -; AVX2-NEXT: sete %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; SSE2-LABEL: allzeros_v32i8_and4: +; SSE2: # %bb.0: +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: psllw $5, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: sete 
%al +; SSE2-NEXT: retq ; -; KNL-LABEL: allzeros_v32i8_and4: -; KNL: # %bb.0: -; KNL-NEXT: vpsllw $5, %ymm0, %ymm0 -; KNL-NEXT: vpmovmskb %ymm0, %eax -; KNL-NEXT: testl %eax, %eax -; KNL-NEXT: sete %al -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq +; SSE41-LABEL: allzeros_v32i8_and4: +; SSE41: # %bb.0: +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: sete %al +; SSE41-NEXT: retq ; -; SKX-LABEL: allzeros_v32i8_and4: -; SKX: # %bb.0: -; SKX-NEXT: vptestmb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0 -; SKX-NEXT: kortestd %k0, %k0 -; SKX-NEXT: sete %al -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; AVX-LABEL: allzeros_v32i8_and4: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX-NEXT: sete %al +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %tmp = and <32 x i8> %arg, %tmp1 = icmp ne <32 x i8> %tmp, zeroinitializer %tmp2 = bitcast <32 x i1> %tmp1 to i32 @@ -2669,27 +2491,30 @@ } define i1 @allzeros_v64i8_and4(<64 x i8> %arg) { -; SSE-LABEL: allzeros_v64i8_and4: -; SSE: # %bb.0: -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: psllw $5, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: testl %eax, %eax -; SSE-NEXT: sete %al -; SSE-NEXT: retq +; SSE2-LABEL: allzeros_v64i8_and4: +; SSE2: # %bb.0: +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: psllw $5, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq +; +; SSE41-LABEL: allzeros_v64i8_and4: +; SSE41: # %bb.0: +; SSE41-NEXT: por %xmm3, %xmm1 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: sete %al +; SSE41-NEXT: retq ; ; AVX1-LABEL: allzeros_v64i8_and4: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2697,31 +2522,18 @@ ; AVX2-LABEL: allzeros_v64i8_and4: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsllw $5, %ymm0, %ymm0 -; AVX2-NEXT: vpmovmskb %ymm0, %eax -; AVX2-NEXT: testl %eax, %eax +; AVX2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; KNL-LABEL: allzeros_v64i8_and4: -; KNL: # %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; KNL-NEXT: vpor %ymm1, %ymm0, %ymm0 -; KNL-NEXT: vpsllw $5, %ymm0, %ymm0 -; KNL-NEXT: vpmovmskb %ymm0, %eax -; KNL-NEXT: testl %eax, %eax -; KNL-NEXT: sete %al -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: allzeros_v64i8_and4: -; SKX: # %bb.0: -; SKX-NEXT: vptestmb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 -; SKX-NEXT: kortestq %k0, %k0 -; SKX-NEXT: sete %al -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; AVX512-LABEL: allzeros_v64i8_and4: +; AVX512: # %bb.0: +; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: sete %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %tmp = and <64 x i8> %arg, %tmp1 = icmp ne <64 x i8> %tmp, zeroinitializer %tmp2 = bitcast <64 x i1> %tmp1 
to i64 @@ -2774,40 +2586,25 @@ } define i1 @allzeros_v8i16_and4(<8 x i16> %arg) { -; SSE-LABEL: allzeros_v8i16_and4: -; SSE: # %bb.0: -; SSE-NEXT: psllw $13, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA -; SSE-NEXT: sete %al -; SSE-NEXT: retq -; -; AVX1OR2-LABEL: allzeros_v8i16_and4: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpsllw $13, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax -; AVX1OR2-NEXT: testl $43690, %eax # imm = 0xAAAA -; AVX1OR2-NEXT: sete %al -; AVX1OR2-NEXT: retq +; SSE2-LABEL: allzeros_v8i16_and4: +; SSE2: # %bb.0: +; SSE2-NEXT: psllw $5, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq ; -; KNL-LABEL: allzeros_v8i16_and4: -; KNL: # %bb.0: -; KNL-NEXT: vpsllw $13, %xmm0, %xmm0 -; KNL-NEXT: vpsraw $15, %xmm0, %xmm0 -; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 -; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: testb %al, %al -; KNL-NEXT: sete %al -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq +; SSE41-LABEL: allzeros_v8i16_and4: +; SSE41: # %bb.0: +; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: sete %al +; SSE41-NEXT: retq ; -; SKX-LABEL: allzeros_v8i16_and4: -; SKX: # %bb.0: -; SKX-NEXT: vptestmw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 -; SKX-NEXT: kortestb %k0, %k0 -; SKX-NEXT: sete %al -; SKX-NEXT: retq +; AVX-LABEL: allzeros_v8i16_and4: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX-NEXT: sete %al +; AVX-NEXT: retq %tmp = and <8 x i16> %arg, %tmp1 = icmp ne <8 x i16> %tmp, zeroinitializer %tmp2 = bitcast <8 x i1> %tmp1 to i8 @@ -2951,68 +2748,49 @@ } define i1 @allzeros_v32i16_and4(<32 x i16> %arg) { -; SSE-LABEL: allzeros_v32i16_and4: -; SSE: # %bb.0: -; SSE-NEXT: psllw $13, %xmm3 -; SSE-NEXT: psllw $13, %xmm2 -; SSE-NEXT: packsswb %xmm3, %xmm2 -; SSE-NEXT: psllw $13, %xmm1 -; SSE-NEXT: psllw $13, %xmm0 -; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: testl %eax, %eax -; SSE-NEXT: sete %al -; SSE-NEXT: retq +; SSE2-LABEL: allzeros_v32i16_and4: +; SSE2: # %bb.0: +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: psllw $5, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq +; +; SSE41-LABEL: allzeros_v32i16_and4: +; SSE41: # %bb.0: +; SSE41-NEXT: por %xmm3, %xmm1 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: sete %al +; SSE41-NEXT: retq ; ; AVX1-LABEL: allzeros_v32i16_and4: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpsllw $13, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw $13, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpsllw $13, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw $13, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v32i16_and4: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllw $13, %ymm1, %ymm1 -; AVX2-NEXT: vpsllw $13, %ymm0, %ymm0 -; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmovmskb %ymm0, 
%eax -; AVX2-NEXT: testl %eax, %eax +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; KNL-LABEL: allzeros_v32i16_and4: -; KNL: # %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; KNL-NEXT: vpor %ymm1, %ymm0, %ymm0 -; KNL-NEXT: vpsllw $13, %ymm0, %ymm0 -; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kortestw %k0, %k0 -; KNL-NEXT: sete %al -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: allzeros_v32i16_and4: -; SKX: # %bb.0: -; SKX-NEXT: vptestmw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 -; SKX-NEXT: kortestd %k0, %k0 -; SKX-NEXT: sete %al -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; AVX512-LABEL: allzeros_v32i16_and4: +; AVX512: # %bb.0: +; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: sete %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %tmp = and <32 x i16> %arg, %tmp1 = icmp ne <32 x i16> %tmp, zeroinitializer %tmp2 = bitcast <32 x i1> %tmp1 to i32 @@ -3021,55 +2799,28 @@ } define i1 @allzeros_v16i16_and4(<16 x i16> %arg) { -; SSE-LABEL: allzeros_v16i16_and4: -; SSE: # %bb.0: -; SSE-NEXT: psllw $13, %xmm1 -; SSE-NEXT: psllw $13, %xmm0 -; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: testl %eax, %eax -; SSE-NEXT: sete %al -; SSE-NEXT: retq -; -; AVX1-LABEL: allzeros_v16i16_and4: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsllw $13, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $13, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: testl %eax, %eax -; AVX1-NEXT: sete %al -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: allzeros_v16i16_and4: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllw $13, %ymm0, %ymm0 -; AVX2-NEXT: vpmovmskb %ymm0, %eax -; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA -; AVX2-NEXT: sete %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; SSE2-LABEL: allzeros_v16i16_and4: +; SSE2: # %bb.0: +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: psllw $5, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq ; -; KNL-LABEL: allzeros_v16i16_and4: -; KNL: # %bb.0: -; KNL-NEXT: vpsllw $13, %ymm0, %ymm0 -; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kortestw %k0, %k0 -; KNL-NEXT: sete %al -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq +; SSE41-LABEL: allzeros_v16i16_and4: +; SSE41: # %bb.0: +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: sete %al +; SSE41-NEXT: retq ; -; SKX-LABEL: allzeros_v16i16_and4: -; SKX: # %bb.0: -; SKX-NEXT: vptestmw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0 -; SKX-NEXT: kortestw %k0, %k0 -; SKX-NEXT: sete %al -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; AVX-LABEL: allzeros_v16i16_and4: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX-NEXT: sete %al +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %tmp = and <16 x i16> %arg, %tmp1 = icmp ne <16 x i16> %tmp, zeroinitializer %tmp2 = bitcast <16 x i1> %tmp1 to i16 @@ -3078,79 +2829,80 @@ } define i1 @allones_v4i32_and4(<4 x i32> %arg) { -; SSE-LABEL: allones_v4i32_and4: -; SSE: # %bb.0: -; SSE-NEXT: pslld $29, %xmm0 -; SSE-NEXT: movmskps %xmm0, %eax -; SSE-NEXT: cmpb $15, %al -; 
SSE-NEXT: sete %al -; SSE-NEXT: retq -; -; AVX1OR2-LABEL: allones_v4i32_and4: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpslld $29, %xmm0, %xmm0 -; AVX1OR2-NEXT: vmovmskps %xmm0, %eax -; AVX1OR2-NEXT: cmpb $15, %al -; AVX1OR2-NEXT: sete %al -; AVX1OR2-NEXT: retq -; -; KNL-LABEL: allones_v4i32_and4: -; KNL: # %bb.0: -; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: testb $15, %al -; KNL-NEXT: sete %al -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: allones_v4i32_and4: -; SKX: # %bb.0: -; SKX-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: cmpb $15, %al -; SKX-NEXT: sete %al -; SKX-NEXT: retq - %tmp = and <4 x i32> %arg, - %tmp1 = icmp ne <4 x i32> %tmp, zeroinitializer - %tmp2 = bitcast <4 x i1> %tmp1 to i4 - %tmp3 = icmp eq i4 %tmp2, -1 - ret i1 %tmp3 -} - -define i1 @allzeros_v4i32_and4(<4 x i32> %arg) { -; SSE-LABEL: allzeros_v4i32_and4: +; SSE-LABEL: allones_v4i32_and4: ; SSE: # %bb.0: ; SSE-NEXT: pslld $29, %xmm0 ; SSE-NEXT: movmskps %xmm0, %eax -; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: cmpb $15, %al ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: allzeros_v4i32_and4: +; AVX1OR2-LABEL: allones_v4i32_and4: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vpslld $29, %xmm0, %xmm0 ; AVX1OR2-NEXT: vmovmskps %xmm0, %eax -; AVX1OR2-NEXT: testl %eax, %eax +; AVX1OR2-NEXT: cmpb $15, %al ; AVX1OR2-NEXT: sete %al ; AVX1OR2-NEXT: retq ; -; KNL-LABEL: allzeros_v4i32_and4: +; KNL-LABEL: allones_v4i32_and4: ; KNL: # %bb.0: ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 +; KNL-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb $15, %al ; KNL-NEXT: sete %al ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; -; SKX-LABEL: allzeros_v4i32_and4: +; SKX-LABEL: allones_v4i32_and4: ; SKX: # %bb.0: ; SKX-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k0 -; SKX-NEXT: kortestb %k0, %k0 +; SKX-NEXT: kmovd %k0, %eax +; SKX-NEXT: cmpb $15, %al ; SKX-NEXT: sete %al ; SKX-NEXT: retq + %tmp = and <4 x i32> %arg, + %tmp1 = icmp ne <4 x i32> %tmp, zeroinitializer + %tmp2 = bitcast <4 x i1> %tmp1 to i4 + %tmp3 = icmp eq i4 %tmp2, -1 + ret i1 %tmp3 +} + +define i1 @allzeros_v4i32_and4(<4 x i32> %arg) { +; SSE2-LABEL: allzeros_v4i32_and4: +; SSE2: # %bb.0: +; SSE2-NEXT: pslld $29, %xmm0 +; SSE2-NEXT: movmskps %xmm0, %eax +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq +; +; SSE41-LABEL: allzeros_v4i32_and4: +; SSE41: # %bb.0: +; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: sete %al +; SSE41-NEXT: retq +; +; AVX1-LABEL: allzeros_v4i32_and4: +; AVX1: # %bb.0: +; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1-NEXT: sete %al +; AVX1-NEXT: retq +; +; AVX2-LABEL: allzeros_v4i32_and4: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4] +; AVX2-NEXT: vptest %xmm1, %xmm0 +; AVX2-NEXT: sete %al +; AVX2-NEXT: retq +; +; AVX512-LABEL: allzeros_v4i32_and4: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4] +; AVX512-NEXT: vptest %xmm1, %xmm0 +; AVX512-NEXT: sete %al +; AVX512-NEXT: retq %tmp = and <4 x i32> %arg, %tmp1 = icmp ne <4 x i32> %tmp, zeroinitializer %tmp2 = bitcast <4 x i1> %tmp1 to i4 @@ -3215,53 +2967,44 @@ } define i1 @allzeros_v8i32_and4(<8 x i32> %arg) { -; SSE-LABEL: allzeros_v8i32_and4: 
-; SSE: # %bb.0: -; SSE-NEXT: pslld $29, %xmm1 -; SSE-NEXT: pslld $29, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA -; SSE-NEXT: sete %al -; SSE-NEXT: retq +; SSE2-LABEL: allzeros_v8i32_and4: +; SSE2: # %bb.0: +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pslld $29, %xmm0 +; SSE2-NEXT: movmskps %xmm0, %eax +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq +; +; SSE41-LABEL: allzeros_v8i32_and4: +; SSE41: # %bb.0: +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: sete %al +; SSE41-NEXT: retq ; ; AVX1-LABEL: allzeros_v8i32_and4: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpslld $29, %xmm0, %xmm0 -; AVX1-NEXT: vmovmskps %xmm0, %eax -; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v8i32_and4: ; AVX2: # %bb.0: -; AVX2-NEXT: vpslld $29, %ymm0, %ymm0 -; AVX2-NEXT: vmovmskps %ymm0, %eax -; AVX2-NEXT: testl %eax, %eax +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] +; AVX2-NEXT: vptest %ymm1, %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; KNL-LABEL: allzeros_v8i32_and4: -; KNL: # %bb.0: -; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: testb %al, %al -; KNL-NEXT: sete %al -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: allzeros_v8i32_and4: -; SKX: # %bb.0: -; SKX-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %k0 -; SKX-NEXT: kortestb %k0, %k0 -; SKX-NEXT: sete %al -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; AVX512-LABEL: allzeros_v8i32_and4: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] +; AVX512-NEXT: vptest %ymm1, %ymm0 +; AVX512-NEXT: sete %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %tmp = and <8 x i32> %arg, %tmp1 = icmp ne <8 x i32> %tmp, zeroinitializer %tmp2 = bitcast <8 x i1> %tmp1 to i8 @@ -3329,46 +3072,39 @@ } define i1 @allzeros_v16i32_and4(<16 x i32> %arg) { -; SSE-LABEL: allzeros_v16i32_and4: -; SSE: # %bb.0: -; SSE-NEXT: pslld $29, %xmm3 -; SSE-NEXT: pslld $29, %xmm2 -; SSE-NEXT: packssdw %xmm3, %xmm2 -; SSE-NEXT: pslld $29, %xmm1 -; SSE-NEXT: pslld $29, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packsswb %xmm2, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: testl %eax, %eax -; SSE-NEXT: sete %al -; SSE-NEXT: retq +; SSE2-LABEL: allzeros_v16i32_and4: +; SSE2: # %bb.0: +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pslld $29, %xmm0 +; SSE2-NEXT: movmskps %xmm0, %eax +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq +; +; SSE41-LABEL: allzeros_v16i32_and4: +; SSE41: # %bb.0: +; SSE41-NEXT: por %xmm3, %xmm1 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: sete %al +; SSE41-NEXT: retq ; ; AVX1-LABEL: allzeros_v16i32_and4: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpslld $29, %xmm2, %xmm2 -; AVX1-NEXT: vpslld $29, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpslld $29, %xmm2, %xmm2 -; AVX1-NEXT: vpslld $29, %xmm0, %xmm0 -; 
AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v16i32_and4: ; AVX2: # %bb.0: -; AVX2-NEXT: vpslld $29, %ymm1, %ymm1 -; AVX2-NEXT: vpsrad $31, %ymm1, %ymm1 -; AVX2-NEXT: vpslld $29, %ymm0, %ymm0 -; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 -; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmovmskb %ymm0, %eax -; AVX2-NEXT: testl %eax, %eax +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] +; AVX2-NEXT: vptest %ymm1, %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -3430,37 +3166,36 @@ } define i1 @allzeros_v2i64_and4(<2 x i64> %arg) { -; SSE-LABEL: allzeros_v2i64_and4: -; SSE: # %bb.0: -; SSE-NEXT: psllq $61, %xmm0 -; SSE-NEXT: movmskpd %xmm0, %eax -; SSE-NEXT: testl %eax, %eax -; SSE-NEXT: sete %al -; SSE-NEXT: retq +; SSE2-LABEL: allzeros_v2i64_and4: +; SSE2: # %bb.0: +; SSE2-NEXT: pslld $29, %xmm0 +; SSE2-NEXT: movmskps %xmm0, %eax +; SSE2-NEXT: testb $5, %al +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq +; +; SSE41-LABEL: allzeros_v2i64_and4: +; SSE41: # %bb.0: +; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: sete %al +; SSE41-NEXT: retq ; ; AVX1OR2-LABEL: allzeros_v2i64_and4: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpsllq $61, %xmm0, %xmm0 -; AVX1OR2-NEXT: vmovmskpd %xmm0, %eax -; AVX1OR2-NEXT: testl %eax, %eax +; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AVX1OR2-NEXT: sete %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: allzeros_v2i64_and4: ; KNL: # %bb.0: -; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4] -; KNL-NEXT: vptestmq %zmm1, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: testb $3, %al +; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; KNL-NEXT: sete %al -; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: allzeros_v2i64_and4: ; SKX: # %bb.0: -; SKX-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k0 -; SKX-NEXT: kortestb %k0, %k0 +; SKX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4] +; SKX-NEXT: vptest %xmm1, %xmm0 ; SKX-NEXT: sete %al ; SKX-NEXT: retq %tmp = and <2 x i64> %arg, @@ -3527,53 +3262,44 @@ } define i1 @allzeros_v4i64_and4(<4 x i64> %arg) { -; SSE-LABEL: allzeros_v4i64_and4: -; SSE: # %bb.0: -; SSE-NEXT: psllq $61, %xmm1 -; SSE-NEXT: psllq $61, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: movmskps %xmm0, %eax -; SSE-NEXT: testl %eax, %eax -; SSE-NEXT: sete %al -; SSE-NEXT: retq +; SSE2-LABEL: allzeros_v4i64_and4: +; SSE2: # %bb.0: +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pslld $29, %xmm0 +; SSE2-NEXT: movmskps %xmm0, %eax +; SSE2-NEXT: testb $5, %al +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq +; +; SSE41-LABEL: allzeros_v4i64_and4: +; SSE41: # %bb.0: +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: sete %al +; SSE41-NEXT: retq ; ; AVX1-LABEL: allzeros_v4i64_and4: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsllq $61, %xmm0, %xmm0 -; AVX1-NEXT: vmovmskpd %xmm0, %eax -; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v4i64_and4: ; AVX2: # 
-; AVX2-NEXT: vpsllq $61, %ymm0, %ymm0
-; AVX2-NEXT: vmovmskpd %ymm0, %eax
-; AVX2-NEXT: testl %eax, %eax
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4]
+; AVX2-NEXT: vptest %ymm1, %ymm0
 ; AVX2-NEXT: sete %al
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
-; KNL-LABEL: allzeros_v4i64_and4:
-; KNL: # %bb.0:
-; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: testb $15, %al
-; KNL-NEXT: sete %al
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: allzeros_v4i64_and4:
-; SKX: # %bb.0:
-; SKX-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k0
-; SKX-NEXT: kortestb %k0, %k0
-; SKX-NEXT: sete %al
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; AVX512-LABEL: allzeros_v4i64_and4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4]
+; AVX512-NEXT: vptest %ymm1, %ymm0
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
  %tmp = and <4 x i64> %arg, <i64 4, i64 4, i64 4, i64 4>
  %tmp1 = icmp ne <4 x i64> %tmp, zeroinitializer
  %tmp2 = bitcast <4 x i1> %tmp1 to i4
@@ -3649,64 +3375,51 @@
 }

 define i1 @allzeros_v8i64_and4(<8 x i64> %arg) {
-; SSE-LABEL: allzeros_v8i64_and4:
-; SSE: # %bb.0:
-; SSE-NEXT: psllq $61, %xmm3
-; SSE-NEXT: psllq $61, %xmm2
-; SSE-NEXT: packssdw %xmm3, %xmm2
-; SSE-NEXT: psllq $61, %xmm1
-; SSE-NEXT: psllq $61, %xmm0
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: packssdw %xmm2, %xmm0
-; SSE-NEXT: pmovmskb %xmm0, %eax
-; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA
-; SSE-NEXT: sete %al
-; SSE-NEXT: retq
+; SSE2-LABEL: allzeros_v8i64_and4:
+; SSE2: # %bb.0:
+; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pslld $29, %xmm0
+; SSE2-NEXT: movmskps %xmm0, %eax
+; SSE2-NEXT: testb $5, %al
+; SSE2-NEXT: sete %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: allzeros_v8i64_and4:
+; SSE41: # %bb.0:
+; SSE41-NEXT: por %xmm3, %xmm1
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: sete %al
+; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: allzeros_v8i64_and4:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpsllq $61, %xmm2, %xmm2
-; AVX1-NEXT: vpsllq $61, %xmm1, %xmm1
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsllq $61, %xmm2, %xmm2
-; AVX1-NEXT: vpsllq $61, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovmskps %xmm0, %eax
-; AVX1-NEXT: testl %eax, %eax
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0
 ; AVX1-NEXT: sete %al
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: allzeros_v8i64_and4:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $61, %ymm1, %ymm1
-; AVX2-NEXT: vpsllq $61, %ymm0, %ymm0
-; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovmskps %ymm0, %eax
-; AVX2-NEXT: testl %eax, %eax
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4]
+; AVX2-NEXT: vptest %ymm1, %ymm0
 ; AVX2-NEXT: sete %al
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
-; KNL-LABEL: allzeros_v8i64_and4:
-; KNL: # %bb.0:
-; KNL-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: testb %al, %al
-; KNL-NEXT: sete %al
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: allzeros_v8i64_and4:
-; SKX: # %bb.0:
-; SKX-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k0
-; SKX-NEXT: kortestb %k0, %k0
-; SKX-NEXT: sete %al
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; AVX512-LABEL: allzeros_v8i64_and4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4]
+; AVX512-NEXT: vptestmd %zmm1, %zmm0, %k0
+; AVX512-NEXT: kortestw %k0, %k0
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
  %tmp = and <8 x i64> %arg, <i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4>
  %tmp1 = icmp ne <8 x i64> %tmp, zeroinitializer
  %tmp2 = bitcast <8 x i1> %tmp1 to i8
@@ -4104,23 +3817,12 @@
 ; AVX1OR2-NEXT: setne %al
 ; AVX1OR2-NEXT: retq
 ;
-; KNL-LABEL: movmsk_or_v2i64:
-; KNL: # %bb.0:
-; KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL-NEXT: vpcmpneqq %zmm1, %zmm0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: testb $3, %al
-; KNL-NEXT: setne %al
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: movmsk_or_v2i64:
-; SKX: # %bb.0:
-; SKX-NEXT: vpcmpneqq %xmm1, %xmm0, %k0
-; SKX-NEXT: kortestb %k0, %k0
-; SKX-NEXT: setne %al
-; SKX-NEXT: retq
+; AVX512-LABEL: movmsk_or_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vptest %xmm0, %xmm0
+; AVX512-NEXT: setne %al
+; AVX512-NEXT: retq
  %cmp = icmp ne <2 x i64> %x, %y
  %e1 = extractelement <2 x i1> %cmp, i32 0
  %e2 = extractelement <2 x i1> %cmp, i32 1
diff --git a/llvm/test/CodeGen/X86/pr53419.ll b/llvm/test/CodeGen/X86/pr53419.ll
--- a/llvm/test/CodeGen/X86/pr53419.ll
+++ b/llvm/test/CodeGen/X86/pr53419.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X64,SSE,SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=X64,SSE,SSE42
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64,AVX
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,AVX
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefixes=X64,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefixes=X64
 ; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X86

 declare i1 @llvm.vector.reduce.and.v2i1(<2 x i1>)
@@ -88,61 +88,19 @@
 }

 define i1 @vector_version_v2i8(ptr align 1 %arg, ptr align 1 %arg1) {
-; SSE2-LABEL: vector_version_v2i8:
-; SSE2: # %bb.0: # %bb
-; SSE2-NEXT: movzwl (%rsi), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movzwl (%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSE2-NEXT: movmskpd %xmm0, %eax
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: sete %al
-; SSE2-NEXT: retq
-;
-; SSE42-LABEL: vector_version_v2i8:
-; SSE42: # %bb.0: # %bb
-; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT: psubq %xmm1, %xmm0
-; SSE42-NEXT: ptest %xmm0, %xmm0
-; SSE42-NEXT: sete %al
-; SSE42-NEXT: retq
-;
-; AVX-LABEL: vector_version_v2i8:
-; AVX: # %bb.0: # %bb
-; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vptest %xmm0, %xmm0
-; AVX-NEXT: sete %al
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: vector_version_v2i8:
-; AVX512: # %bb.0: # %bb
-; AVX512-NEXT: movzwl (%rsi), %eax
-; AVX512-NEXT: vmovd %eax, %xmm0
-; AVX512-NEXT: movzwl (%rdi), %eax
-; AVX512-NEXT: vmovd %eax, %xmm1
-; AVX512-NEXT: vpcmpneqb %xmm1, %xmm0, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: testb $3, %al
-; AVX512-NEXT: sete %al
-; AVX512-NEXT: retq
+; X64-LABEL: vector_version_v2i8:
+; X64: # %bb.0: # %bb
+; X64-NEXT: movzwl (%rsi), %eax
+; X64-NEXT: cmpw (%rdi), %ax
+; X64-NEXT: sete %al
+; X64-NEXT: retq
 ;
 ; X86-LABEL: vector_version_v2i8:
 ; X86: # %bb.0: # %bb
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X86-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X86-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; X86-NEXT: vptest %xmm0, %xmm0
+; X86-NEXT: movzwl (%ecx), %ecx
+; X86-NEXT: cmpw (%eax), %cx
 ; X86-NEXT: sete %al
 ; X86-NEXT: retl
 bb:
@@ -155,56 +113,19 @@
 }

 define i1 @vector_version_v4i8(ptr align 1 %arg, ptr align 1 %arg1) {
-; SSE2-LABEL: vector_version_v4i8:
-; SSE2: # %bb.0: # %bb
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: movmskps %xmm0, %eax
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: sete %al
-; SSE2-NEXT: retq
-;
-; SSE42-LABEL: vector_version_v4i8:
-; SSE42: # %bb.0: # %bb
-; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE42-NEXT: psubd %xmm1, %xmm0
-; SSE42-NEXT: ptest %xmm0, %xmm0
-; SSE42-NEXT: sete %al
-; SSE42-NEXT: retq
-;
-; AVX-LABEL: vector_version_v4i8:
-; AVX: # %bb.0: # %bb
-; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vptest %xmm0, %xmm0
-; AVX-NEXT: sete %al
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: vector_version_v4i8:
-; AVX512: # %bb.0: # %bb
-; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpcmpneqb %xmm1, %xmm0, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: testb $15, %al
-; AVX512-NEXT: sete %al
-; AVX512-NEXT: retq
+; X64-LABEL: vector_version_v4i8:
+; X64: # %bb.0: # %bb
+; X64-NEXT: movl (%rsi), %eax
+; X64-NEXT: cmpl (%rdi), %eax
+; X64-NEXT: sete %al
+; X64-NEXT: retq
 ;
 ; X86-LABEL: vector_version_v4i8:
 ; X86: # %bb.0: # %bb
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; X86-NEXT: vptest %xmm0, %xmm0
+; X86-NEXT: movl (%ecx), %ecx
+; X86-NEXT: cmpl (%eax), %ecx
 ; X86-NEXT: sete %al
 ; X86-NEXT: retl
 bb:
@@ -217,34 +138,12 @@
 }

 define i1 @vector_version_v8i8(ptr align 1 %arg, ptr align 1 %arg1) {
-; SSE-LABEL: vector_version_v8i8:
-; SSE: # %bb.0: # %bb
-; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT: pcmpeqb %xmm0, %xmm1
-; SSE-NEXT: pmovmskb %xmm1, %eax
-; SSE-NEXT: xorb $-1, %al
-; SSE-NEXT: sete %al
-; SSE-NEXT: retq
-;
-; AVX-LABEL: vector_version_v8i8:
-; AVX: # %bb.0: # %bb
-; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpmovmskb %xmm0, %eax
-; AVX-NEXT: xorb $-1, %al
-; AVX-NEXT: sete %al
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: vector_version_v8i8:
-; AVX512: # %bb.0: # %bb
-; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vpcmpneqb %xmm1, %xmm0, %k0
-; AVX512-NEXT: kortestb %k0, %k0
-; AVX512-NEXT: sete %al
-; AVX512-NEXT: retq
+; X64-LABEL: vector_version_v8i8:
+; X64: # %bb.0: # %bb
+; X64-NEXT: movq (%rsi), %rax
+; X64-NEXT: cmpq (%rdi), %rax
+; X64-NEXT: sete %al
+; X64-NEXT: retq
 ;
 ; X86-LABEL: vector_version_v8i8:
 ; X86: # %bb.0: # %bb
diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll
--- a/llvm/test/CodeGen/X86/setcc-wide-types.ll
+++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll
@@ -399,23 +399,14 @@
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
-; AVX512F-LABEL: ne_i512:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
-; AVX512F-NEXT: xorl %eax, %eax
-; AVX512F-NEXT: kortestw %k0, %k0
-; AVX512F-NEXT: setne %al
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: ne_i512:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpcmpneqb %zmm1, %zmm0, %k0
-; AVX512BW-NEXT: xorl %eax, %eax
-; AVX512BW-NEXT: kortestq %k0, %k0
-; AVX512BW-NEXT: setne %al
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: ne_i512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: kortestw %k0, %k0
+; AVX512-NEXT: setne %al
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
  %bcx = bitcast <8 x i64> %x to i512
  %bcy = bitcast <8 x i64> %y to i512
  %cmp = icmp ne i512 %bcx, %bcy
@@ -590,23 +581,14 @@
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
-; AVX512F-LABEL: eq_i512:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
-; AVX512F-NEXT: xorl %eax, %eax
-; AVX512F-NEXT: kortestw %k0, %k0
-; AVX512F-NEXT: sete %al
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: eq_i512:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpcmpneqb %zmm1, %zmm0, %k0
-; AVX512BW-NEXT: xorl %eax, %eax
-; AVX512BW-NEXT: kortestq %k0, %k0
-; AVX512BW-NEXT: sete %al
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: eq_i512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: kortestw %k0, %k0
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
  %bcx = bitcast <8 x i64> %x to i512
  %bcy = bitcast <8 x i64> %y to i512
  %cmp = icmp eq i512 %bcx, %bcy
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
@@ -132,9 +132,8 @@
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kortestw %k0, %k0
 ; AVX512F-NEXT: setne %al
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
@@ -301,35 +300,13 @@
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
-; AVX512F-LABEL: trunc_v8i32_v8i1:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb %al, %al
-; AVX512F-NEXT: setne %al
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_v8i32_v8i1:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpslld $31, %ymm0, %ymm0
-; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: testb %al, %al
-; AVX512BW-NEXT: setne %al
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_v8i32_v8i1:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
-; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
-; AVX512VL-NEXT: kmovd %k0, %eax
-; AVX512VL-NEXT: testb %al, %al
-; AVX512VL-NEXT: setne %al
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: trunc_v8i32_v8i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512-NEXT: vptest %ymm0, %ymm0
+; AVX512-NEXT: setne %al
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
  %a = trunc <8 x i32> %0 to <8 x i1>
  %b = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a)
  ret i1 %b
@@ -538,35 +515,14 @@
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
-; AVX512F-LABEL: trunc_v8i64_v8i1:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb %al, %al
-; AVX512F-NEXT: setne %al
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_v8i64_v8i1:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0
-; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: testb %al, %al
-; AVX512BW-NEXT: setne %al
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_v8i64_v8i1:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k0
-; AVX512VL-NEXT: kmovd %k0, %eax
-; AVX512VL-NEXT: testb %al, %al
-; AVX512VL-NEXT: setne %al
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: trunc_v8i64_v8i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512-NEXT: kortestw %k0, %k0
+; AVX512-NEXT: setne %al
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
  %a = trunc <8 x i64> %0 to <8 x i1>
  %b = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a)
  ret i1 %b
@@ -958,9 +914,8 @@
 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512F-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kortestw %k0, %k0
 ; AVX512F-NEXT: setne %al
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
@@ -1805,9 +1760,8 @@
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kortestw %k0, %k0
 ; AVX512F-NEXT: setne %al
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
@@ -883,7 +883,7 @@
 ;
 ; AVX2-LABEL: mask_v8i32:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
 ; AVX2-NEXT: vptest %ymm1, %ymm0
 ; AVX2-NEXT: sete %al
 ; AVX2-NEXT: vzeroupper
@@ -891,7 +891,7 @@
 ;
 ; AVX512-LABEL: mask_v8i32:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
 ; AVX512-NEXT: vptest %ymm1, %ymm0
 ; AVX512-NEXT: sete %al
 ; AVX512-NEXT: vzeroupper
@@ -921,28 +921,12 @@
 ; SSE41-NEXT: setne %al
 ; SSE41-NEXT: retq
 ;
-; AVX1-LABEL: trunc_v16i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0
-; AVX1-NEXT: setne %al
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_v16i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [71777214294589695,71777214294589695,71777214294589695,71777214294589695]
-; AVX2-NEXT: vptest %ymm1, %ymm0
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_v16i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [71777214294589695,71777214294589695,71777214294589695,71777214294589695]
-; AVX512-NEXT: vptest %ymm1, %ymm0
-; AVX512-NEXT: setne %al
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX-LABEL: trunc_v16i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0
+; AVX-NEXT: setne %al
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
  %1 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %a0)
  %2 = trunc i16 %1 to i8
  %3 = icmp ne i8 %2, 0
@@ -1043,24 +1027,18 @@
 ; AVX2-LABEL: PR44781:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vmovdqu (%rdi), %xmm0
-; AVX2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15]
+; AVX2-NEXT: vptest %xmm1, %xmm0
 ; AVX2-NEXT: sete %al
 ; AVX2-NEXT: retq
 ;
-; AVX512BW-LABEL: PR44781:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqu (%rdi), %xmm0
-; AVX512BW-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; AVX512BW-NEXT: sete %al
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: PR44781:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqu (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [64424509455,64424509455]
-; AVX512BWVL-NEXT: vptest %xmm1, %xmm0
-; AVX512BWVL-NEXT: sete %al
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: PR44781:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqu (%rdi), %xmm0
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15]
+; AVX512-NEXT: vptest %xmm1, %xmm0
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: retq
  %2 = load <4 x i32>, ptr %0, align 4
  %3 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %2)
  %4 = and i32 %3, 15