diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -40227,6 +40227,81 @@
   return SDValue();
 }
 
+// Attempt to simplify the MOVMSK input based on the comparison type.
+static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
+                                  SelectionDAG &DAG,
+                                  const X86Subtarget &Subtarget) {
+  // Only handle eq/ne against zero (any_of).
+  // TODO: Handle eq/ne against -1 (all_of) as well.
+  if (!(CC == X86::COND_E || CC == X86::COND_NE))
+    return SDValue();
+  if (EFLAGS.getValueType() != MVT::i32)
+    return SDValue();
+  unsigned CmpOpcode = EFLAGS.getOpcode();
+  if (CmpOpcode != X86ISD::CMP || !isNullConstant(EFLAGS.getOperand(1)))
+    return SDValue();
+
+  SDValue CmpOp = EFLAGS.getOperand(0);
+  unsigned CmpBits = CmpOp.getValueSizeInBits();
+
+  // Peek through any truncate.
+  if (CmpOp.getOpcode() == ISD::TRUNCATE)
+    CmpOp = CmpOp.getOperand(0);
+
+  // Bail if we don't find a MOVMSK.
+  if (CmpOp.getOpcode() != X86ISD::MOVMSK)
+    return SDValue();
+
+  SDValue Vec = CmpOp.getOperand(0);
+  MVT VecVT = Vec.getSimpleValueType();
+  assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
+         "Unexpected MOVMSK operand");
+
+  // See if we can avoid a PACKSS by calling MOVMSK on the sources.
+  // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
+  // sign bits prior to the comparison with zero unless we know that
+  // the vXi16 splats the sign bit down to the lower i8 half.
+  if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
+    SDValue VecOp0 = Vec.getOperand(0);
+    SDValue VecOp1 = Vec.getOperand(1);
+    bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
+    bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
+    // PMOVMSKB(PACKSSWB(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
+    if (CmpBits == 8 && VecOp1.isUndef()) {
+      SDLoc DL(EFLAGS);
+      SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
+      Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
+      Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
+      if (!SignExt0) {
+        Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
+                             DAG.getConstant(0xAAAA, DL, MVT::i16));
+      }
+      return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
+                         DAG.getConstant(0, DL, MVT::i16));
+    }
+    // PMOVMSKB(PACKSSWB(LO(X), HI(X)))
+    // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
+    if (CmpBits == 16 && Subtarget.hasInt256() &&
+        VecOp0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+        VecOp1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+        VecOp0.getOperand(0) == VecOp1.getOperand(0) &&
+        VecOp0.getConstantOperandAPInt(1) == 0 &&
+        VecOp1.getConstantOperandAPInt(1) == 8) {
+      SDLoc DL(EFLAGS);
+      SDValue Result = DAG.getBitcast(MVT::v32i8, VecOp0.getOperand(0));
+      Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
+      if (!SignExt0 || !SignExt1) {
+        Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
+                             DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
+      }
+      return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
+                         DAG.getConstant(0, DL, MVT::i32));
+    }
+  }
+
+  return SDValue();
+}
+
 /// Optimize an EFLAGS definition used according to the condition code \p CC
 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
 /// uses of chain values.
@@ -40243,6 +40318,9 @@
   if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
     return R;
 
+  if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
+    return R;
+
   return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
 }
 
diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll
--- a/llvm/test/CodeGen/X86/movmsk-cmp.ll
+++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll
@@ -359,17 +359,15 @@ define i1 @allzeros_v8i16_sign(<8 x i16> %arg) {
 ; SSE2-LABEL: allzeros_v8i16_sign:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: packsswb %xmm0, %xmm0
 ; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
 ; SSE2-NEXT: sete %al
 ; SSE2-NEXT: retq
 ;
 ; AVX-LABEL: allzeros_v8i16_sign:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: vpmovmskb %xmm0, %eax
-; AVX-NEXT: testb %al, %al
+; AVX-NEXT: testl $43690, %eax # imm = 0xAAAA
 ; AVX-NEXT: sete %al
 ; AVX-NEXT: retq
 ;
@@ -471,10 +469,8 @@
 ;
 ; AVX2-LABEL: allzeros_v16i16_sign:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: testw %ax, %ax
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; AVX2-NEXT: sete %al
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -764,9 +760,8 @@
 ; SSE2-LABEL: allzeros_v8i32_sign:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: packsswb %xmm0, %xmm0
 ; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
 ; SSE2-NEXT: sete %al
 ; SSE2-NEXT: retq
 ;
@@ -894,10 +889,8 @@
 ; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: testw %ax, %ax
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; AVX2-NEXT: sete %al
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -1075,9 +1068,8 @@
 ; SSE2-NEXT: packssdw %xmm3, %xmm2
 ; SSE2-NEXT: packssdw %xmm1, %xmm0
 ; SSE2-NEXT: packssdw %xmm2, %xmm0
-; SSE2-NEXT: packsswb %xmm0, %xmm0
 ; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
 ; SSE2-NEXT: sete %al
 ; SSE2-NEXT: retq
 ;
@@ -1536,18 +1528,16 @@
 ; SSE2-LABEL: allzeros_v8i16_and1:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: psllw $15, %xmm0
-; SSE2-NEXT: packsswb %xmm0, %xmm0
 ; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
 ; SSE2-NEXT: sete %al
 ; SSE2-NEXT: retq
 ;
 ; AVX-LABEL: allzeros_v8i16_and1:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: vpmovmskb %xmm0, %eax
-; AVX-NEXT: testb %al, %al
+; AVX-NEXT: testl $43690, %eax # imm = 0xAAAA
 ; AVX-NEXT: sete %al
 ; AVX-NEXT: retq
 ;
@@ -1822,10 +1812,8 @@
 ; AVX2-LABEL: allzeros_v16i16_and1:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpsllw $15, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: testw %ax, %ax
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; AVX2-NEXT: sete %al
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -2000,9 +1988,8 @@
 ; SSE2-NEXT: pslld $31, %xmm1
 ; SSE2-NEXT: pslld $31, %xmm0
 ; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: packsswb %xmm0, %xmm0
 ; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
 ; SSE2-NEXT: sete %al
 ; SSE2-NEXT: retq
 ;
@@ -2161,10 +2148,8 @@
 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: testw %ax, %ax
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; AVX2-NEXT: sete %al
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -2470,9 +2455,8 @@
 ; SSE2-NEXT: psllq $63, %xmm0
 ; SSE2-NEXT: packssdw %xmm1, %xmm0
 ; SSE2-NEXT: packssdw %xmm2, %xmm0
-; SSE2-NEXT: packsswb %xmm0, %xmm0
 ; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
 ; SSE2-NEXT: sete %al
 ; SSE2-NEXT: retq
 ;
@@ -2937,18 +2921,16 @@
 ; SSE2-LABEL: allzeros_v8i16_and4:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: psllw $13, %xmm0
-; SSE2-NEXT: packsswb %xmm0, %xmm0
 ; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
 ; SSE2-NEXT: sete %al
 ; SSE2-NEXT: retq
 ;
 ; AVX-LABEL: allzeros_v8i16_and4:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpsllw $13, %xmm0, %xmm0
-; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: vpmovmskb %xmm0, %eax
-; AVX-NEXT: testb %al, %al
+; AVX-NEXT: testl $43690, %eax # imm = 0xAAAA
 ; AVX-NEXT: sete %al
 ; AVX-NEXT: retq
 ;
@@ -3223,10 +3205,8 @@
 ; AVX2-LABEL: allzeros_v16i16_and4:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpsllw $13, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: testw %ax, %ax
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; AVX2-NEXT: sete %al
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -3401,9 +3381,8 @@
 ; SSE2-NEXT: pslld $29, %xmm1
 ; SSE2-NEXT: pslld $29, %xmm0
 ; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: packsswb %xmm0, %xmm0
 ; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
 ; SSE2-NEXT: sete %al
 ; SSE2-NEXT: retq
 ;
@@ -3562,10 +3541,8 @@
 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: testw %ax, %ax
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; AVX2-NEXT: sete %al
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -3871,9 +3848,8 @@
 ; SSE2-NEXT: psllq $61, %xmm0
 ; SSE2-NEXT: packssdw %xmm1, %xmm0
 ; SSE2-NEXT: packssdw %xmm2, %xmm0
-; SSE2-NEXT: packsswb %xmm0, %xmm0
 ; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
 ; SSE2-NEXT: sete %al
 ; SSE2-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-compare-any_of.ll b/llvm/test/CodeGen/X86/vector-compare-any_of.ll
--- a/llvm/test/CodeGen/X86/vector-compare-any_of.ll
+++ b/llvm/test/CodeGen/X86/vector-compare-any_of.ll
@@ -888,9 +888,8 @@
 ; SSE-NEXT: cmpneqps %xmm3, %xmm1
 ; SSE-NEXT: cmpneqps %xmm2, %xmm0
 ; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: packsswb %xmm0, %xmm0
 ; SSE-NEXT: pmovmskb %xmm0, %eax
-; SSE-NEXT: testb %al, %al
+; SSE-NEXT: testw %ax, %ax
 ; SSE-NEXT: setne %al
 ; SSE-NEXT: retq
 ;
@@ -996,18 +995,16 @@
 ; SSE-LABEL: bool_reduction_v8i16:
 ; SSE: # %bb.0:
 ; SSE-NEXT: pcmpgtw %xmm0, %xmm1
-; SSE-NEXT: packsswb %xmm1, %xmm1
 ; SSE-NEXT: pmovmskb %xmm1, %eax
-; SSE-NEXT: testb %al, %al
+; SSE-NEXT: testw %ax, %ax
 ; SSE-NEXT: setne %al
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: bool_reduction_v8i16:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: vpmovmskb %xmm0, %eax
-; AVX-NEXT: testb %al, %al
+; AVX-NEXT: testw %ax, %ax
 ; AVX-NEXT: setne %al
 ; AVX-NEXT: retq
 ;
@@ -1123,9 +1120,8 @@
 ; SSE-NEXT: pminud %xmm0, %xmm2
 ; SSE-NEXT: pcmpeqd %xmm0, %xmm2
 ; SSE-NEXT: packssdw %xmm3, %xmm2
-; SSE-NEXT: packsswb %xmm2, %xmm2
 ; SSE-NEXT: pmovmskb %xmm2, %eax
-; SSE-NEXT: testb %al, %al
+; SSE-NEXT: testw %ax, %ax
 ; SSE-NEXT: setne %al
 ; SSE-NEXT: retq
 ;
@@ -1200,10 +1196,8 @@
 ; AVX2-LABEL: bool_reduction_v16i16:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: testw %ax, %ax
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: testl %eax, %eax
 ; AVX2-NEXT: setne %al
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
@@ -116,9 +116,8 @@
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT: psllw $15, %xmm0
-; SSE2-NEXT: packsswb %xmm0, %xmm0
 ; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
 ; SSE2-NEXT: setne %al
 ; SSE2-NEXT: retq
 ;
@@ -126,9 +125,8 @@
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SSE41-NEXT: psllw $15, %xmm0
-; SSE41-NEXT: packsswb %xmm0, %xmm0
 ; SSE41-NEXT: pmovmskb %xmm0, %eax
-; SSE41-NEXT: testb %al, %al
+; SSE41-NEXT: testl $43690, %eax # imm = 0xAAAA
 ; SSE41-NEXT: setne %al
 ; SSE41-NEXT: retq
 ;
@@ -136,9 +134,8 @@
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: vpmovmskb %xmm0, %eax
-; AVX-NEXT: testb %al, %al
+; AVX-NEXT: testl $43690, %eax # imm = 0xAAAA
 ; AVX-NEXT: setne %al
 ; AVX-NEXT: retq
 ;
@@ -269,9 +266,8 @@
 ; SSE2-NEXT: psrad $16, %xmm0
 ; SSE2-NEXT: packssdw %xmm1, %xmm0
 ; SSE2-NEXT: psllw $15, %xmm0
-; SSE2-NEXT: packsswb %xmm0, %xmm0
 ; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
 ; SSE2-NEXT: setne %al
 ; SSE2-NEXT: retq
 ;
@@ -282,9 +278,8 @@
 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
 ; SSE41-NEXT: packusdw %xmm1, %xmm0
 ; SSE41-NEXT: psllw $15, %xmm0
-; SSE41-NEXT: packsswb %xmm0, %xmm0
 ; SSE41-NEXT: pmovmskb %xmm0, %eax
-; SSE41-NEXT: testb %al, %al
+; SSE41-NEXT: testl $43690, %eax # imm = 0xAAAA
 ; SSE41-NEXT: setne %al
 ; SSE41-NEXT: retq
 ;
@@ -296,9 +291,8 @@
 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: vpmovmskb %xmm0, %eax
-; AVX1-NEXT: testb %al, %al
+; AVX1-NEXT: testl $43690, %eax # imm = 0xAAAA
 ; AVX1-NEXT: setne %al
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -308,9 +302,8 @@
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: testb %al, %al
+; AVX2-NEXT: testl $43690, %eax # imm = 0xAAAA
 ; AVX2-NEXT: setne %al
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -505,9 +498,8 @@
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
 ; SSE2-NEXT: psllw $15, %xmm2
-; SSE2-NEXT: packsswb %xmm2, %xmm2
 ; SSE2-NEXT: pmovmskb %xmm2, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
 ; SSE2-NEXT: setne %al
 ; SSE2-NEXT: retq
 ;
@@ -522,9 +514,8 @@
 ; SSE41-NEXT: packusdw %xmm1, %xmm0
 ; SSE41-NEXT: packusdw %xmm2, %xmm0
 ; SSE41-NEXT: psllw $15, %xmm0
-; SSE41-NEXT: packsswb %xmm0, %xmm0
 ; SSE41-NEXT: pmovmskb %xmm0, %eax
-; SSE41-NEXT: testb %al, %al
+; SSE41-NEXT: testl $43690, %eax # imm = 0xAAAA
 ; SSE41-NEXT: setne %al
 ; SSE41-NEXT: retq
 ;
@@ -539,9 +530,8 @@
 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: vpmovmskb %xmm0, %eax
-; AVX1-NEXT: testb %al, %al
+; AVX1-NEXT: testl $43690, %eax # imm = 0xAAAA
 ; AVX1-NEXT: setne %al
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -554,9 +544,8 @@
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: testb %al, %al
+; AVX2-NEXT: testl $43690, %eax # imm = 0xAAAA
 ; AVX2-NEXT: setne %al
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -966,9 +955,8 @@
 ; SSE2-NEXT: pxor %xmm1, %xmm1
 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: packsswb %xmm0, %xmm0
 ; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
 ; SSE2-NEXT: setne %al
 ; SSE2-NEXT: retq
 ;
@@ -977,9 +965,8 @@
 ; SSE41-NEXT: pxor %xmm1, %xmm1
 ; SSE41-NEXT: pcmpeqb %xmm0, %xmm1
 ; SSE41-NEXT: pmovsxbw %xmm1, %xmm0
-; SSE41-NEXT: packsswb %xmm0, %xmm0
 ; SSE41-NEXT: pmovmskb %xmm0, %eax
-; SSE41-NEXT: testb %al, %al
+; SSE41-NEXT: testw %ax, %ax
 ; SSE41-NEXT: setne %al
 ; SSE41-NEXT: retq
 ;
@@ -988,9 +975,8 @@
 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpmovsxbw %xmm0, %xmm0
-; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: vpmovmskb %xmm0, %eax
-; AVX-NEXT: testb %al, %al
+; AVX-NEXT: testw %ax, %ax
 ; AVX-NEXT: setne %al
 ; AVX-NEXT: retq
 ;
@@ -1166,9 +1152,8 @@
 ; SSE-NEXT: pcmpeqd %xmm2, %xmm1
 ; SSE-NEXT: pcmpeqd %xmm2, %xmm0
 ; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: packsswb %xmm0, %xmm0
 ; SSE-NEXT: pmovmskb %xmm0, %eax
-; SSE-NEXT: testb %al, %al
+; SSE-NEXT: testw %ax, %ax
 ; SSE-NEXT: setne %al
 ; SSE-NEXT: retq
 ;
@@ -1257,10 +1242,8 @@
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: testw %ax, %ax
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: testl %eax, %eax
 ; AVX2-NEXT: setne %al
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -1393,9 +1376,8 @@
 ; SSE2-NEXT: pand %xmm0, %xmm1
 ; SSE2-NEXT: packssdw %xmm2, %xmm1
 ; SSE2-NEXT: packssdw %xmm3, %xmm1
-; SSE2-NEXT: packsswb %xmm1, %xmm1
 ; SSE2-NEXT: pmovmskb %xmm1, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
 ; SSE2-NEXT: setne %al
 ; SSE2-NEXT: retq
 ;
@@ -1409,9 +1391,8 @@
 ; SSE41-NEXT: pcmpeqq %xmm4, %xmm0
 ; SSE41-NEXT: packssdw %xmm1, %xmm0
 ; SSE41-NEXT: packssdw %xmm2, %xmm0
-; SSE41-NEXT: packsswb %xmm0, %xmm0
 ; SSE41-NEXT: pmovmskb %xmm0, %eax
-; SSE41-NEXT: testb %al, %al
+; SSE41-NEXT: testl $43690, %eax # imm = 0xAAAA
 ; SSE41-NEXT: setne %al
 ; SSE41-NEXT: retq
 ;
@@ -1518,10 +1499,8 @@
 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: testw %ax, %ax
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA
 ; AVX2-NEXT: setne %al
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
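
For reference (not part of the patch): the 0xAAAA / 0xAAAAAAAA masks work because, on little-endian x86, the sign bit of each i16 element lives in its high byte, i.e. in the odd-numbered lanes of the bitcast byte vector, so PMOVMSKB on the raw bytes masked with 0xAAAA tests the same "any sign bit set" condition as PMOVMSKB of the PACKSSWB result. The standalone C++ sketch below is only an illustration of that argument (hypothetical helper names, scalar emulation, not taken from the LLVM sources); it checks the v8i16 any_of case handled by the CmpBits == 8 path.

// emulate_movmsk_fold.cpp - scalar emulation of the any_of(v8i16) fold.
// Hypothetical illustration; not derived from the LLVM sources.
#include <cassert>
#include <cstdint>
#include <cstring>

// PMOVMSKB(PACKSSWB(X, undef)): sign bits of the 8 saturated bytes
// (only the low 8 mask bits matter, matching the CmpBits == 8 case).
static unsigned movmskPacked(const int16_t X[8]) {
  unsigned Mask = 0;
  for (int I = 0; I != 8; ++I) {
    int16_t V = X[I];
    int8_t Sat = V < -128 ? -128 : (V > 127 ? 127 : static_cast<int8_t>(V));
    Mask |= static_cast<unsigned>(Sat < 0) << I;
  }
  return Mask;
}

// PMOVMSKB(BITCAST_v16i8(X)): sign bits of the raw little-endian bytes.
static unsigned movmskRawBytes(const int16_t X[8]) {
  uint8_t Bytes[16];
  std::memcpy(Bytes, X, sizeof(Bytes));
  unsigned Mask = 0;
  for (int I = 0; I != 16; ++I)
    Mask |= static_cast<unsigned>(Bytes[I] >> 7) << I;
  return Mask;
}

int main() {
  const int16_t Tests[][8] = {
      {0, 1, 2, 3, 4, 5, 6, 7},
      {-1, 0, 0, 0, 0, 0, 0, 0},
      {0, 0, 0, 0, 0, 0, 0, -32768},
      {1000, -1000, 200, -200, 127, -128, 32767, 0},
  };
  for (const auto &T : Tests) {
    // The high byte of each i16 carries its sign bit, so masking the raw
    // byte mask with 0xAAAA keeps exactly those bits.
    bool AnyOfPacked = movmskPacked(T) != 0;
    bool AnyOfRaw = (movmskRawBytes(T) & 0xAAAA) != 0;
    assert(AnyOfPacked == AnyOfRaw);
  }
  return 0;
}

The CmpBits == 16 AVX2 path is the same argument applied to a v16i16 source with a 0xAAAAAAAA mask, and the SignExt0/SignExt1 checks let the AND be dropped when the inputs already splat their sign bit into both byte halves, which is why several of the updated tests use a plain testw/testl rather than the masked test.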