Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -23103,9 +23103,8 @@ SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0)); SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL); - SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT); SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT); - SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask); + SDValue Lo = Op0; SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift); SDValue HiZ; if (CurrVT.is512BitVector()) { Index: llvm/trunk/test/CodeGen/X86/combine-srl.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/combine-srl.ll +++ llvm/trunk/test/CodeGen/X86/combine-srl.ll @@ -347,50 +347,48 @@ ; SSE-LABEL: combine_vec_lshr_lzcnt_bit1: ; SSE: # %bb.0: ; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pshufb %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psrlw $4, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pshufb %xmm1, %xmm3 -; SSE-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: pshufb %xmm5, %xmm4 -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: paddb %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE-NEXT: psrlw $8, %xmm1 +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pshufb %xmm1, %xmm2 +; SSE-NEXT: pcmpeqb %xmm4, %xmm1 ; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: psrlw $8, %xmm3 -; SSE-NEXT: paddw %xmm1, %xmm3 -; SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; SSE-NEXT: paddb %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pcmpeqb %xmm4, %xmm2 +; SSE-NEXT: psrlw $8, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: psrlw $8, %xmm1 +; SSE-NEXT: paddw %xmm2, %xmm1 +; SSE-NEXT: pcmpeqw %xmm4, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: psrld $16, %xmm3 -; SSE-NEXT: paddd %xmm3, %xmm0 -; SSE-NEXT: psrld $5, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: psrld $5, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_lshr_lzcnt_bit1: ; AVX: # %bb.0: ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1 -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm3 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm4 -; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX-NEXT: vpand %xmm3, %xmm4, %xmm3 -; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm3, %xmm1 -; AVX-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 -; AVX-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX-NEXT: vpand %xmm3, %xmm1, %xmm3 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX-NEXT: vpand %xmm2, %xmm1, %xmm2 ; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 Index: llvm/trunk/test/CodeGen/X86/prefer-avx256-lzcnt.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/prefer-avx256-lzcnt.ll +++ llvm/trunk/test/CodeGen/X86/prefer-avx256-lzcnt.ll @@ -38,17 +38,15 @@ define <16 x i8> @testv16i8(<16 x i8> %in) { ; AVX256-LABEL: testv16i8: ; AVX256: # %bb.0: -; AVX256-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX256-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX256-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX256-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX256-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX256-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX256-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX256-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX256-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX256-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 -; AVX256-NEXT: vpand %xmm1, %xmm2, %xmm1 -; AVX256-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX256-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX256-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX256-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX256-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3 +; AVX256-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX256-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX256-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ; AVX256-NEXT: retq ; ; AVX512-LABEL: testv16i8: @@ -93,17 +91,15 @@ define <32 x i8> @testv32i8(<32 x i8> %in) { ; AVX256-LABEL: testv32i8: ; AVX256: # %bb.0: -; AVX256-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX256-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX256-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX256-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX256-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX256-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX256-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX256-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 -; AVX256-NEXT: vpand %ymm1, %ymm2, %ymm1 -; AVX256-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX256-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX256-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX256-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX256-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3 +; AVX256-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX256-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX256-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ; AVX256-NEXT: retq ; ; AVX512-LABEL: testv32i8: Index: llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll +++ llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll @@ -96,33 +96,30 @@ ; ; SSSE3-LABEL: testv2i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm1, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pshufb %xmm0, %xmm3 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pshufb %xmm1, %xmm3 -; SSSE3-NEXT: pcmpeqb %xmm2, %xmm1 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: paddb %xmm3, %xmm1 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pcmpeqb %xmm2, %xmm3 -; SSSE3-NEXT: psrlw $8, %xmm3 -; SSSE3-NEXT: pand %xmm1, %xmm3 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: paddb %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pcmpeqb %xmm4, %xmm2 +; SSSE3-NEXT: psrlw $8, %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm2 ; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: paddw %xmm3, %xmm1 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pcmpeqw %xmm2, %xmm3 -; SSSE3-NEXT: psrld $16, %xmm3 -; SSSE3-NEXT: pand %xmm1, %xmm3 +; SSSE3-NEXT: paddw %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pcmpeqw %xmm4, %xmm2 +; SSSE3-NEXT: psrld $16, %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm2 ; SSSE3-NEXT: psrld $16, %xmm1 -; SSSE3-NEXT: paddd %xmm3, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 +; SSSE3-NEXT: paddd %xmm2, %xmm1 +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm0 ; SSSE3-NEXT: psrlq $32, %xmm0 ; SSSE3-NEXT: pand %xmm1, %xmm0 ; SSSE3-NEXT: psrlq $32, %xmm1 @@ -132,33 +129,30 @@ ; ; SSE41-LABEL: testv2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm1, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pshufb %xmm0, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psrlw $4, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pshufb %xmm1, %xmm3 -; SSE41-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: paddb %xmm3, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pcmpeqb %xmm2, %xmm3 -; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: pand %xmm1, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pshufb %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqb %xmm4, %xmm1 +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: paddb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqb %xmm4, %xmm2 +; SSE41-NEXT: psrlw $8, %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: paddw %xmm3, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pcmpeqw %xmm2, %xmm3 -; SSE41-NEXT: psrld $16, %xmm3 -; SSE41-NEXT: pand %xmm1, %xmm3 +; SSE41-NEXT: paddw %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqw %xmm4, %xmm2 +; SSE41-NEXT: psrld $16, %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: paddd %xmm3, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: paddd %xmm2, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE41-NEXT: psrlq $32, %xmm0 ; SSE41-NEXT: pand %xmm1, %xmm0 ; SSE41-NEXT: psrlq $32, %xmm1 @@ -168,16 +162,14 @@ ; ; AVX-LABEL: testv2i64: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 ; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 ; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 ; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 @@ -198,16 +190,14 @@ ; ; AVX512VLBWDQ-LABEL: testv2i64: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX512VLBWDQ-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 ; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512VLBWDQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm2, %xmm2 @@ -241,33 +231,30 @@ ; ; X32-SSE-LABEL: testv2i64: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X32-SSE-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE-NEXT: pshufb %xmm1, %xmm4 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pshufb %xmm0, %xmm3 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 ; X32-SSE-NEXT: psrlw $4, %xmm1 -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 -; X32-SSE-NEXT: pshufb %xmm1, %xmm3 -; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1 -; X32-SSE-NEXT: pand %xmm4, %xmm1 -; X32-SSE-NEXT: paddb %xmm3, %xmm1 -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm3 -; X32-SSE-NEXT: psrlw $8, %xmm3 -; X32-SSE-NEXT: pand %xmm1, %xmm3 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X32-SSE-NEXT: pxor %xmm4, %xmm4 +; X32-SSE-NEXT: pshufb %xmm1, %xmm2 +; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm1 +; X32-SSE-NEXT: pand %xmm3, %xmm1 +; X32-SSE-NEXT: paddb %xmm2, %xmm1 +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm2 +; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: pand %xmm1, %xmm2 ; X32-SSE-NEXT: psrlw $8, %xmm1 -; X32-SSE-NEXT: paddw %xmm3, %xmm1 -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm3 -; X32-SSE-NEXT: psrld $16, %xmm3 -; X32-SSE-NEXT: pand %xmm1, %xmm3 +; X32-SSE-NEXT: paddw %xmm2, %xmm1 +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: pcmpeqw %xmm4, %xmm2 +; X32-SSE-NEXT: psrld $16, %xmm2 +; X32-SSE-NEXT: pand %xmm1, %xmm2 ; X32-SSE-NEXT: psrld $16, %xmm1 -; X32-SSE-NEXT: paddd %xmm3, %xmm1 -; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm0 +; X32-SSE-NEXT: paddd %xmm2, %xmm1 +; X32-SSE-NEXT: pcmpeqd %xmm4, %xmm0 ; X32-SSE-NEXT: psrlq $32, %xmm0 ; X32-SSE-NEXT: pand %xmm1, %xmm0 ; X32-SSE-NEXT: psrlq $32, %xmm1 @@ -362,33 +349,30 @@ ; ; SSSE3-LABEL: testv2i64u: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm1, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pshufb %xmm0, %xmm3 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pshufb %xmm1, %xmm3 -; SSSE3-NEXT: pcmpeqb %xmm2, %xmm1 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: paddb %xmm3, %xmm1 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pcmpeqb %xmm2, %xmm3 -; SSSE3-NEXT: psrlw $8, %xmm3 -; SSSE3-NEXT: pand %xmm1, %xmm3 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: paddb %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pcmpeqb %xmm4, %xmm2 +; SSSE3-NEXT: psrlw $8, %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm2 ; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: paddw %xmm3, %xmm1 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pcmpeqw %xmm2, %xmm3 -; SSSE3-NEXT: psrld $16, %xmm3 -; SSSE3-NEXT: pand %xmm1, %xmm3 +; SSSE3-NEXT: paddw %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pcmpeqw %xmm4, %xmm2 +; SSSE3-NEXT: psrld $16, %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm2 ; SSSE3-NEXT: psrld $16, %xmm1 -; SSSE3-NEXT: paddd %xmm3, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 +; SSSE3-NEXT: paddd %xmm2, %xmm1 +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm0 ; SSSE3-NEXT: psrlq $32, %xmm0 ; SSSE3-NEXT: pand %xmm1, %xmm0 ; SSSE3-NEXT: psrlq $32, %xmm1 @@ -398,33 +382,30 @@ ; ; SSE41-LABEL: testv2i64u: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm1, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pshufb %xmm0, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psrlw $4, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pshufb %xmm1, %xmm3 -; SSE41-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: paddb %xmm3, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pcmpeqb %xmm2, %xmm3 -; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: pand %xmm1, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pshufb %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqb %xmm4, %xmm1 +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: paddb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqb %xmm4, %xmm2 +; SSE41-NEXT: psrlw $8, %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: paddw %xmm3, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pcmpeqw %xmm2, %xmm3 -; SSE41-NEXT: psrld $16, %xmm3 -; SSE41-NEXT: pand %xmm1, %xmm3 +; SSE41-NEXT: paddw %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqw %xmm4, %xmm2 +; SSE41-NEXT: psrld $16, %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: paddd %xmm3, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: paddd %xmm2, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE41-NEXT: psrlq $32, %xmm0 ; SSE41-NEXT: pand %xmm1, %xmm0 ; SSE41-NEXT: psrlq $32, %xmm1 @@ -434,16 +415,14 @@ ; ; AVX-LABEL: testv2i64u: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 ; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 ; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 ; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 @@ -464,16 +443,14 @@ ; ; AVX512VLBWDQ-LABEL: testv2i64u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX512VLBWDQ-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 ; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512VLBWDQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm2, %xmm2 @@ -507,33 +484,30 @@ ; ; X32-SSE-LABEL: testv2i64u: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X32-SSE-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE-NEXT: pshufb %xmm1, %xmm4 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pshufb %xmm0, %xmm3 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 ; X32-SSE-NEXT: psrlw $4, %xmm1 -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 -; X32-SSE-NEXT: pshufb %xmm1, %xmm3 -; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1 -; X32-SSE-NEXT: pand %xmm4, %xmm1 -; X32-SSE-NEXT: paddb %xmm3, %xmm1 -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm3 -; X32-SSE-NEXT: psrlw $8, %xmm3 -; X32-SSE-NEXT: pand %xmm1, %xmm3 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X32-SSE-NEXT: pxor %xmm4, %xmm4 +; X32-SSE-NEXT: pshufb %xmm1, %xmm2 +; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm1 +; X32-SSE-NEXT: pand %xmm3, %xmm1 +; X32-SSE-NEXT: paddb %xmm2, %xmm1 +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm2 +; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: pand %xmm1, %xmm2 ; X32-SSE-NEXT: psrlw $8, %xmm1 -; X32-SSE-NEXT: paddw %xmm3, %xmm1 -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm3 -; X32-SSE-NEXT: psrld $16, %xmm3 -; X32-SSE-NEXT: pand %xmm1, %xmm3 +; X32-SSE-NEXT: paddw %xmm2, %xmm1 +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: pcmpeqw %xmm4, %xmm2 +; X32-SSE-NEXT: psrld $16, %xmm2 +; X32-SSE-NEXT: pand %xmm1, %xmm2 ; X32-SSE-NEXT: psrld $16, %xmm1 -; X32-SSE-NEXT: paddd %xmm3, %xmm1 -; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm0 +; X32-SSE-NEXT: paddd %xmm2, %xmm1 +; X32-SSE-NEXT: pcmpeqd %xmm4, %xmm0 ; X32-SSE-NEXT: psrlq $32, %xmm0 ; X32-SSE-NEXT: pand %xmm1, %xmm0 ; X32-SSE-NEXT: psrlq $32, %xmm1 @@ -632,27 +606,24 @@ ; ; SSSE3-LABEL: testv4i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm1, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pshufb %xmm0, %xmm3 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pshufb %xmm1, %xmm3 -; SSSE3-NEXT: pcmpeqb %xmm2, %xmm1 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: paddb %xmm3, %xmm1 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pcmpeqb %xmm2, %xmm3 -; SSSE3-NEXT: psrlw $8, %xmm3 -; SSSE3-NEXT: pand %xmm1, %xmm3 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: paddb %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pcmpeqb %xmm4, %xmm2 +; SSSE3-NEXT: psrlw $8, %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm2 ; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: paddw %xmm3, %xmm1 -; SSSE3-NEXT: pcmpeqw %xmm2, %xmm0 +; SSSE3-NEXT: paddw %xmm2, %xmm1 +; SSSE3-NEXT: pcmpeqw %xmm4, %xmm0 ; SSSE3-NEXT: psrld $16, %xmm0 ; SSSE3-NEXT: pand %xmm1, %xmm0 ; SSSE3-NEXT: psrld $16, %xmm1 @@ -662,27 +633,24 @@ ; ; SSE41-LABEL: testv4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm1, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pshufb %xmm0, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psrlw $4, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pshufb %xmm1, %xmm3 -; SSE41-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: paddb %xmm3, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pcmpeqb %xmm2, %xmm3 -; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: pand %xmm1, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pshufb %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqb %xmm4, %xmm1 +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: paddb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqb %xmm4, %xmm2 +; SSE41-NEXT: psrlw $8, %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: paddw %xmm3, %xmm1 -; SSE41-NEXT: pcmpeqw %xmm2, %xmm0 +; SSE41-NEXT: paddw %xmm2, %xmm1 +; SSE41-NEXT: pcmpeqw %xmm4, %xmm0 ; SSE41-NEXT: psrld $16, %xmm0 ; SSE41-NEXT: pand %xmm1, %xmm0 ; SSE41-NEXT: psrld $16, %xmm1 @@ -692,16 +660,14 @@ ; ; AVX-LABEL: testv4i32: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 ; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 ; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 ; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 @@ -717,16 +683,14 @@ ; ; AVX512VLBWDQ-LABEL: testv4i32: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX512VLBWDQ-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 ; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512VLBWDQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm2, %xmm2 @@ -755,27 +719,24 @@ ; ; X32-SSE-LABEL: testv4i32: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X32-SSE-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE-NEXT: pshufb %xmm1, %xmm4 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pshufb %xmm0, %xmm3 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 ; X32-SSE-NEXT: psrlw $4, %xmm1 -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 -; X32-SSE-NEXT: pshufb %xmm1, %xmm3 -; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1 -; X32-SSE-NEXT: pand %xmm4, %xmm1 -; X32-SSE-NEXT: paddb %xmm3, %xmm1 -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm3 -; X32-SSE-NEXT: psrlw $8, %xmm3 -; X32-SSE-NEXT: pand %xmm1, %xmm3 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X32-SSE-NEXT: pxor %xmm4, %xmm4 +; X32-SSE-NEXT: pshufb %xmm1, %xmm2 +; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm1 +; X32-SSE-NEXT: pand %xmm3, %xmm1 +; X32-SSE-NEXT: paddb %xmm2, %xmm1 +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm2 +; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: pand %xmm1, %xmm2 ; X32-SSE-NEXT: psrlw $8, %xmm1 -; X32-SSE-NEXT: paddw %xmm3, %xmm1 -; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; X32-SSE-NEXT: paddw %xmm2, %xmm1 +; X32-SSE-NEXT: pcmpeqw %xmm4, %xmm0 ; X32-SSE-NEXT: psrld $16, %xmm0 ; X32-SSE-NEXT: pand %xmm1, %xmm0 ; X32-SSE-NEXT: psrld $16, %xmm1 @@ -874,27 +835,24 @@ ; ; SSSE3-LABEL: testv4i32u: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm1, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pshufb %xmm0, %xmm3 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pshufb %xmm1, %xmm3 -; SSSE3-NEXT: pcmpeqb %xmm2, %xmm1 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: paddb %xmm3, %xmm1 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pcmpeqb %xmm2, %xmm3 -; SSSE3-NEXT: psrlw $8, %xmm3 -; SSSE3-NEXT: pand %xmm1, %xmm3 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: paddb %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pcmpeqb %xmm4, %xmm2 +; SSSE3-NEXT: psrlw $8, %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm2 ; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: paddw %xmm3, %xmm1 -; SSSE3-NEXT: pcmpeqw %xmm2, %xmm0 +; SSSE3-NEXT: paddw %xmm2, %xmm1 +; SSSE3-NEXT: pcmpeqw %xmm4, %xmm0 ; SSSE3-NEXT: psrld $16, %xmm0 ; SSSE3-NEXT: pand %xmm1, %xmm0 ; SSSE3-NEXT: psrld $16, %xmm1 @@ -904,27 +862,24 @@ ; ; SSE41-LABEL: testv4i32u: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm1, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pshufb %xmm0, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psrlw $4, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pshufb %xmm1, %xmm3 -; SSE41-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: paddb %xmm3, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pcmpeqb %xmm2, %xmm3 -; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: pand %xmm1, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pshufb %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqb %xmm4, %xmm1 +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: paddb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqb %xmm4, %xmm2 +; SSE41-NEXT: psrlw $8, %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: paddw %xmm3, %xmm1 -; SSE41-NEXT: pcmpeqw %xmm2, %xmm0 +; SSE41-NEXT: paddw %xmm2, %xmm1 +; SSE41-NEXT: pcmpeqw %xmm4, %xmm0 ; SSE41-NEXT: psrld $16, %xmm0 ; SSE41-NEXT: pand %xmm1, %xmm0 ; SSE41-NEXT: psrld $16, %xmm1 @@ -934,16 +889,14 @@ ; ; AVX-LABEL: testv4i32u: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 ; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 ; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 ; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 @@ -959,16 +912,14 @@ ; ; AVX512VLBWDQ-LABEL: testv4i32u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX512VLBWDQ-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 ; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512VLBWDQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm2, %xmm2 @@ -997,27 +948,24 @@ ; ; X32-SSE-LABEL: testv4i32u: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X32-SSE-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE-NEXT: pshufb %xmm1, %xmm4 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pshufb %xmm0, %xmm3 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 ; X32-SSE-NEXT: psrlw $4, %xmm1 -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 -; X32-SSE-NEXT: pshufb %xmm1, %xmm3 -; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1 -; X32-SSE-NEXT: pand %xmm4, %xmm1 -; X32-SSE-NEXT: paddb %xmm3, %xmm1 -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm3 -; X32-SSE-NEXT: psrlw $8, %xmm3 -; X32-SSE-NEXT: pand %xmm1, %xmm3 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X32-SSE-NEXT: pxor %xmm4, %xmm4 +; X32-SSE-NEXT: pshufb %xmm1, %xmm2 +; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm1 +; X32-SSE-NEXT: pand %xmm3, %xmm1 +; X32-SSE-NEXT: paddb %xmm2, %xmm1 +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm2 +; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: pand %xmm1, %xmm2 ; X32-SSE-NEXT: psrlw $8, %xmm1 -; X32-SSE-NEXT: paddw %xmm3, %xmm1 -; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; X32-SSE-NEXT: paddw %xmm2, %xmm1 +; X32-SSE-NEXT: pcmpeqw %xmm4, %xmm0 ; X32-SSE-NEXT: psrld $16, %xmm0 ; X32-SSE-NEXT: pand %xmm1, %xmm0 ; X32-SSE-NEXT: psrld $16, %xmm1 @@ -1104,21 +1052,18 @@ ; ; SSSE3-LABEL: testv8i16: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm1, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pshufb %xmm0, %xmm3 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pshufb %xmm1, %xmm3 -; SSSE3-NEXT: pcmpeqb %xmm2, %xmm1 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: paddb %xmm3, %xmm1 -; SSSE3-NEXT: pcmpeqb %xmm2, %xmm0 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: paddb %xmm2, %xmm1 +; SSSE3-NEXT: pcmpeqb %xmm4, %xmm0 ; SSSE3-NEXT: psrlw $8, %xmm0 ; SSSE3-NEXT: pand %xmm1, %xmm0 ; SSSE3-NEXT: psrlw $8, %xmm1 @@ -1128,21 +1073,18 @@ ; ; SSE41-LABEL: testv8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm1, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pshufb %xmm0, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psrlw $4, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pshufb %xmm1, %xmm3 -; SSE41-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: paddb %xmm3, %xmm1 -; SSE41-NEXT: pcmpeqb %xmm2, %xmm0 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pshufb %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqb %xmm4, %xmm1 +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: paddb %xmm2, %xmm1 +; SSE41-NEXT: pcmpeqb %xmm4, %xmm0 ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: pand %xmm1, %xmm0 ; SSE41-NEXT: psrlw $8, %xmm1 @@ -1152,16 +1094,14 @@ ; ; AVX-LABEL: testv8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 ; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 ; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 @@ -1172,16 +1112,14 @@ ; ; AVX512VLBWDQ-LABEL: testv8i16: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX512VLBWDQ-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 ; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512VLBWDQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 ; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm0, %xmm0 @@ -1210,21 +1148,18 @@ ; ; X32-SSE-LABEL: testv8i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X32-SSE-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE-NEXT: pshufb %xmm1, %xmm4 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pshufb %xmm0, %xmm3 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 ; X32-SSE-NEXT: psrlw $4, %xmm1 -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 -; X32-SSE-NEXT: pshufb %xmm1, %xmm3 -; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1 -; X32-SSE-NEXT: pand %xmm4, %xmm1 -; X32-SSE-NEXT: paddb %xmm3, %xmm1 -; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X32-SSE-NEXT: pxor %xmm4, %xmm4 +; X32-SSE-NEXT: pshufb %xmm1, %xmm2 +; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm1 +; X32-SSE-NEXT: pand %xmm3, %xmm1 +; X32-SSE-NEXT: paddb %xmm2, %xmm1 +; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm0 ; X32-SSE-NEXT: psrlw $8, %xmm0 ; X32-SSE-NEXT: pand %xmm1, %xmm0 ; X32-SSE-NEXT: psrlw $8, %xmm1 @@ -1310,21 +1245,18 @@ ; ; SSSE3-LABEL: testv8i16u: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm1, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pshufb %xmm0, %xmm3 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pshufb %xmm1, %xmm3 -; SSSE3-NEXT: pcmpeqb %xmm2, %xmm1 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: paddb %xmm3, %xmm1 -; SSSE3-NEXT: pcmpeqb %xmm2, %xmm0 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: paddb %xmm2, %xmm1 +; SSSE3-NEXT: pcmpeqb %xmm4, %xmm0 ; SSSE3-NEXT: psrlw $8, %xmm0 ; SSSE3-NEXT: pand %xmm1, %xmm0 ; SSSE3-NEXT: psrlw $8, %xmm1 @@ -1334,21 +1266,18 @@ ; ; SSE41-LABEL: testv8i16u: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm1, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pshufb %xmm0, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psrlw $4, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pshufb %xmm1, %xmm3 -; SSE41-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: paddb %xmm3, %xmm1 -; SSE41-NEXT: pcmpeqb %xmm2, %xmm0 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pshufb %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqb %xmm4, %xmm1 +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: paddb %xmm2, %xmm1 +; SSE41-NEXT: pcmpeqb %xmm4, %xmm0 ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: pand %xmm1, %xmm0 ; SSE41-NEXT: psrlw $8, %xmm1 @@ -1358,16 +1287,14 @@ ; ; AVX-LABEL: testv8i16u: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 ; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 ; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 @@ -1378,16 +1305,14 @@ ; ; AVX512VLBWDQ-LABEL: testv8i16u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX512VLBWDQ-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 ; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512VLBWDQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 ; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm0, %xmm0 @@ -1416,21 +1341,18 @@ ; ; X32-SSE-LABEL: testv8i16u: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X32-SSE-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE-NEXT: pshufb %xmm1, %xmm4 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pshufb %xmm0, %xmm3 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 ; X32-SSE-NEXT: psrlw $4, %xmm1 -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 -; X32-SSE-NEXT: pshufb %xmm1, %xmm3 -; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1 -; X32-SSE-NEXT: pand %xmm4, %xmm1 -; X32-SSE-NEXT: paddb %xmm3, %xmm1 -; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X32-SSE-NEXT: pxor %xmm4, %xmm4 +; X32-SSE-NEXT: pshufb %xmm1, %xmm2 +; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm1 +; X32-SSE-NEXT: pand %xmm3, %xmm1 +; X32-SSE-NEXT: paddb %xmm2, %xmm1 +; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm0 ; X32-SSE-NEXT: psrlw $8, %xmm0 ; X32-SSE-NEXT: pand %xmm1, %xmm0 ; X32-SSE-NEXT: psrlw $8, %xmm1 @@ -1510,68 +1432,58 @@ ; ; SSSE3-LABEL: testv16i8: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pand %xmm2, %xmm3 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: pshufb %xmm3, %xmm4 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pshufb %xmm0, %xmm2 ; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pcmpeqb %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpeqb %xmm0, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 ; SSSE3-NEXT: pshufb %xmm0, %xmm1 -; SSSE3-NEXT: paddb %xmm2, %xmm1 +; SSSE3-NEXT: paddb %xmm3, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pand %xmm2, %xmm3 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: pshufb %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: pshufb %xmm0, %xmm2 ; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pcmpeqb %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: pcmpeqb %xmm0, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 ; SSE41-NEXT: pshufb %xmm0, %xmm1 -; SSE41-NEXT: paddb %xmm2, %xmm1 +; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv16i8: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3 +; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ; AVX-NEXT: retq ; ; AVX512VLBWDQ-LABEL: testv16i8: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512VLBWDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 -; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm2, %xmm1 -; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512VLBWDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3 +; AVX512VLBWDQ-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ; AVX512VLBWDQ-NEXT: retq ; ; AVX512-LABEL: testv16i8: @@ -1585,19 +1497,16 @@ ; ; X32-SSE-LABEL: testv16i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: pand %xmm2, %xmm3 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X32-SSE-NEXT: movdqa %xmm1, %xmm4 -; X32-SSE-NEXT: pshufb %xmm3, %xmm4 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: pshufb %xmm0, %xmm2 ; X32-SSE-NEXT: psrlw $4, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 -; X32-SSE-NEXT: pcmpeqb %xmm0, %xmm2 -; X32-SSE-NEXT: pand %xmm4, %xmm2 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpeqb %xmm0, %xmm3 +; X32-SSE-NEXT: pand %xmm2, %xmm3 ; X32-SSE-NEXT: pshufb %xmm0, %xmm1 -; X32-SSE-NEXT: paddb %xmm2, %xmm1 +; X32-SSE-NEXT: paddb %xmm3, %xmm1 ; X32-SSE-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE-NEXT: retl %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 0) @@ -1673,68 +1582,58 @@ ; ; SSSE3-LABEL: testv16i8u: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pand %xmm2, %xmm3 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: pshufb %xmm3, %xmm4 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pshufb %xmm0, %xmm2 ; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pcmpeqb %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpeqb %xmm0, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 ; SSSE3-NEXT: pshufb %xmm0, %xmm1 -; SSSE3-NEXT: paddb %xmm2, %xmm1 +; SSSE3-NEXT: paddb %xmm3, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv16i8u: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pand %xmm2, %xmm3 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: pshufb %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: pshufb %xmm0, %xmm2 ; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pcmpeqb %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: pcmpeqb %xmm0, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 ; SSE41-NEXT: pshufb %xmm0, %xmm1 -; SSE41-NEXT: paddb %xmm2, %xmm1 +; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv16i8u: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3 +; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ; AVX-NEXT: retq ; ; AVX512VLBWDQ-LABEL: testv16i8u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512VLBWDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 -; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm2, %xmm1 -; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512VLBWDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3 +; AVX512VLBWDQ-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ; AVX512VLBWDQ-NEXT: retq ; ; AVX512-LABEL: testv16i8u: @@ -1748,19 +1647,16 @@ ; ; X32-SSE-LABEL: testv16i8u: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: pand %xmm2, %xmm3 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X32-SSE-NEXT: movdqa %xmm1, %xmm4 -; X32-SSE-NEXT: pshufb %xmm3, %xmm4 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: pshufb %xmm0, %xmm2 ; X32-SSE-NEXT: psrlw $4, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 -; X32-SSE-NEXT: pcmpeqb %xmm0, %xmm2 -; X32-SSE-NEXT: pand %xmm4, %xmm2 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpeqb %xmm0, %xmm3 +; X32-SSE-NEXT: pand %xmm2, %xmm3 ; X32-SSE-NEXT: pshufb %xmm0, %xmm1 -; X32-SSE-NEXT: paddb %xmm2, %xmm1 +; X32-SSE-NEXT: paddb %xmm3, %xmm1 ; X32-SSE-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE-NEXT: retl %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 -1) Index: llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll +++ llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll @@ -13,40 +13,38 @@ ; AVX1-LABEL: testv4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm6 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm6 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm6, %xmm7 -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm6 -; AVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm6 +; AVX1-NEXT: vpaddb %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm6 ; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6 -; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm6 -; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 -; AVX1-NEXT: vpaddw %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm6 +; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 +; AVX1-NEXT: vpaddw %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm2, %xmm6 ; AVX1-NEXT: vpsrld $16, %xmm6, %xmm6 -; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm6 -; AVX1-NEXT: vpsrld $16, %xmm5, %xmm5 -; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm6 +; AVX1-NEXT: vpsrld $16, %xmm4, %xmm4 +; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5 -; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm4 +; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm4 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm6 -; AVX1-NEXT: vpand %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm3, %xmm6 -; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpaddb %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpeqb %xmm1, %xmm5, %xmm6 +; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm4 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm4 @@ -67,16 +65,14 @@ ; ; AVX2-LABEL: testv4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 ; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 @@ -97,16 +93,14 @@ ; ; AVX512VL-LABEL: testv4i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 @@ -127,16 +121,14 @@ ; ; AVX512VLBWDQ-LABEL: testv4i64: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX512VLBWDQ-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VLBWDQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm2, %ymm2 @@ -169,16 +161,14 @@ ; ; X32-AVX-LABEL: testv4i64: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm4 -; X32-AVX-NEXT: vpand %ymm1, %ymm4, %ymm1 +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3 +; X32-AVX-NEXT: vpand {{\.LCPI.*}}, %ymm3, %ymm3 ; X32-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2 -; X32-AVX-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; X32-AVX-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 ; X32-AVX-NEXT: vpsrlw $8, %ymm2, %ymm2 @@ -205,40 +195,38 @@ ; AVX1-LABEL: testv4i64u: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm6 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm6 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm6, %xmm7 -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm6 -; AVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm6 +; AVX1-NEXT: vpaddb %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm6 ; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6 -; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm6 -; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 -; AVX1-NEXT: vpaddw %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm6 +; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 +; AVX1-NEXT: vpaddw %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm2, %xmm6 ; AVX1-NEXT: vpsrld $16, %xmm6, %xmm6 -; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm6 -; AVX1-NEXT: vpsrld $16, %xmm5, %xmm5 -; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm6 +; AVX1-NEXT: vpsrld $16, %xmm4, %xmm4 +; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5 -; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm4 +; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm4 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm6 -; AVX1-NEXT: vpand %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm3, %xmm6 -; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpaddb %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpeqb %xmm1, %xmm5, %xmm6 +; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm4 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm4 @@ -259,16 +247,14 @@ ; ; AVX2-LABEL: testv4i64u: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 ; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 @@ -289,16 +275,14 @@ ; ; AVX512VL-LABEL: testv4i64u: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 @@ -319,16 +303,14 @@ ; ; AVX512VLBWDQ-LABEL: testv4i64u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX512VLBWDQ-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VLBWDQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm2, %ymm2 @@ -361,16 +343,14 @@ ; ; X32-AVX-LABEL: testv4i64u: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm4 -; X32-AVX-NEXT: vpand %ymm1, %ymm4, %ymm1 +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3 +; X32-AVX-NEXT: vpand {{\.LCPI.*}}, %ymm3, %ymm3 ; X32-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2 -; X32-AVX-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; X32-AVX-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 ; X32-AVX-NEXT: vpsrlw $8, %ymm2, %ymm2 @@ -397,34 +377,32 @@ ; AVX1-LABEL: testv8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5 -; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpeqb %xmm6, %xmm5, %xmm7 +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm4, %xmm7 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 -; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm6, %xmm1, %xmm5 -; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 -; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpaddb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm1, %xmm4 +; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm4 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpaddw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqw %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpsrld $16, %xmm3, %xmm3 ; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm5 -; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm5 +; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm4, %xmm5 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpeqb %xmm6, %xmm0, %xmm3 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 @@ -441,16 +419,14 @@ ; ; AVX2-LABEL: testv8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 ; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 @@ -466,16 +442,14 @@ ; ; AVX512VL-LABEL: testv8i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 @@ -491,16 +465,14 @@ ; ; AVX512VLBWDQ-LABEL: testv8i32: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX512VLBWDQ-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VLBWDQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm2, %ymm2 @@ -528,16 +500,14 @@ ; ; X32-AVX-LABEL: testv8i32: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm4 -; X32-AVX-NEXT: vpand %ymm1, %ymm4, %ymm1 +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3 +; X32-AVX-NEXT: vpand {{\.LCPI.*}}, %ymm3, %ymm3 ; X32-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2 -; X32-AVX-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; X32-AVX-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 ; X32-AVX-NEXT: vpsrlw $8, %ymm2, %ymm2 @@ -559,34 +529,32 @@ ; AVX1-LABEL: testv8i32u: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5 -; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpeqb %xmm6, %xmm5, %xmm7 +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm4, %xmm7 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 -; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm6, %xmm1, %xmm5 -; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 -; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpaddb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm1, %xmm4 +; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm4 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpaddw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqw %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpsrld $16, %xmm3, %xmm3 ; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm5 -; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm5 +; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm4, %xmm5 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpeqb %xmm6, %xmm0, %xmm3 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 @@ -603,16 +571,14 @@ ; ; AVX2-LABEL: testv8i32u: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 ; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 @@ -628,16 +594,14 @@ ; ; AVX512VL-LABEL: testv8i32u: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 @@ -653,16 +617,14 @@ ; ; AVX512VLBWDQ-LABEL: testv8i32u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX512VLBWDQ-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VLBWDQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm2, %ymm2 @@ -690,16 +652,14 @@ ; ; X32-AVX-LABEL: testv8i32u: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm4 -; X32-AVX-NEXT: vpand %ymm1, %ymm4, %ymm1 +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3 +; X32-AVX-NEXT: vpand {{\.LCPI.*}}, %ymm3, %ymm3 ; X32-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2 -; X32-AVX-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; X32-AVX-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 ; X32-AVX-NEXT: vpsrlw $8, %ymm2, %ymm2 @@ -721,29 +681,27 @@ ; AVX1-LABEL: testv16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5 -; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpeqb %xmm6, %xmm5, %xmm7 +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm4, %xmm7 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 -; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpaddb %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqb %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm5 -; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm5 +; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm4, %xmm5 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpeqb %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 @@ -755,16 +713,14 @@ ; ; AVX2-LABEL: testv16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -775,16 +731,14 @@ ; ; AVX512VL-LABEL: testv16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -795,16 +749,14 @@ ; ; AVX512VLBWDQ-LABEL: testv16i16: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX512VLBWDQ-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VLBWDQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 ; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -823,16 +775,14 @@ ; ; X32-AVX-LABEL: testv16i16: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm4 -; X32-AVX-NEXT: vpand %ymm1, %ymm4, %ymm1 +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3 +; X32-AVX-NEXT: vpand {{\.LCPI.*}}, %ymm3, %ymm3 ; X32-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2 -; X32-AVX-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; X32-AVX-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 ; X32-AVX-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -848,29 +798,27 @@ ; AVX1-LABEL: testv16i16u: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5 -; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpeqb %xmm6, %xmm5, %xmm7 +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm4, %xmm7 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 -; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpaddb %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqb %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm5 -; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm5 +; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm4, %xmm5 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpeqb %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 @@ -882,16 +830,14 @@ ; ; AVX2-LABEL: testv16i16u: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -902,16 +848,14 @@ ; ; AVX512VL-LABEL: testv16i16u: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -922,16 +866,14 @@ ; ; AVX512VLBWDQ-LABEL: testv16i16u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX512VLBWDQ-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VLBWDQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 ; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -950,16 +892,14 @@ ; ; X32-AVX-LABEL: testv16i16u: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm4 -; X32-AVX-NEXT: vpand %ymm1, %ymm4, %ymm1 +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3 +; X32-AVX-NEXT: vpand {{\.LCPI.*}}, %ymm3, %ymm3 ; X32-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2 -; X32-AVX-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; X32-AVX-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 ; X32-AVX-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -975,71 +915,63 @@ ; AVX1-LABEL: testv32i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm6 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm2 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpaddb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: testv32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3 +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: testv32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 -; AVX512VL-NEXT: vpand %ymm1, %ymm2, %ymm1 -; AVX512VL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512VL-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3 +; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512VLBWDQ-LABEL: testv32i8: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512VLBWDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 -; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm2, %ymm1 -; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512VLBWDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3 +; AVX512VLBWDQ-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ; AVX512VLBWDQ-NEXT: retq ; ; AVX512-LABEL: testv32i8: @@ -1059,17 +991,15 @@ ; ; X32-AVX-LABEL: testv32i8: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 -; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X32-AVX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 -; X32-AVX-NEXT: vpand %ymm1, %ymm2, %ymm1 -; X32-AVX-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; X32-AVX-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; X32-AVX-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X32-AVX-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3 +; X32-AVX-NEXT: vpand %ymm3, %ymm2, %ymm2 +; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; X32-AVX-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ; X32-AVX-NEXT: retl %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %in, i1 0) ret <32 x i8> %out @@ -1079,71 +1009,63 @@ ; AVX1-LABEL: testv32i8u: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm6 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm2 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpaddb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: testv32i8u: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3 +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: testv32i8u: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 -; AVX512VL-NEXT: vpand %ymm1, %ymm2, %ymm1 -; AVX512VL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512VL-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3 +; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512VLBWDQ-LABEL: testv32i8u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512VLBWDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 -; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm2, %ymm1 -; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512VLBWDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3 +; AVX512VLBWDQ-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ; AVX512VLBWDQ-NEXT: retq ; ; AVX512-LABEL: testv32i8u: @@ -1163,17 +1085,15 @@ ; ; X32-AVX-LABEL: testv32i8u: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 -; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X32-AVX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 -; X32-AVX-NEXT: vpand %ymm1, %ymm2, %ymm1 -; X32-AVX-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; X32-AVX-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; X32-AVX-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X32-AVX-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3 +; X32-AVX-NEXT: vpand %ymm3, %ymm2, %ymm2 +; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; X32-AVX-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ; X32-AVX-NEXT: retl %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %in, i1 -1) ret <32 x i8> %out Index: llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll +++ llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll @@ -359,16 +359,15 @@ ; ; AVX512BW-LABEL: testv32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm4 -; AVX512BW-NEXT: vptestnmb %zmm1, %zmm4, %k0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vptestnmb %zmm4, %zmm3, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm5 ; AVX512BW-NEXT: vpandq %zmm5, %zmm2, %zmm2 -; AVX512BW-NEXT: vpandq %zmm1, %zmm4, %zmm1 -; AVX512BW-NEXT: vpshufb %zmm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 @@ -380,29 +379,27 @@ ; ; AVX512DQ-LABEL: testv32i16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm5 -; AVX512DQ-NEXT: vpand %ymm2, %ymm5, %ymm5 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3 +; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm5, %ymm7 +; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm7 ; AVX512DQ-NEXT: vpand %ymm7, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512DQ-NEXT: vpaddb %ymm5, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm4 +; AVX512DQ-NEXT: vpaddb %ymm4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm0 ; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpaddw %ymm0, %ymm3, %ymm0 -; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm5 -; AVX512DQ-NEXT: vpand %ymm2, %ymm5, %ymm2 -; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm2, %ymm5 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3 +; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm4 +; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm5 ; AVX512DQ-NEXT: vpand %ymm5, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpaddb %ymm2, %ymm3, %ymm2 ; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1 @@ -445,16 +442,15 @@ ; ; AVX512BW-LABEL: testv32i16u: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm4 -; AVX512BW-NEXT: vptestnmb %zmm1, %zmm4, %k0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vptestnmb %zmm4, %zmm3, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm5 ; AVX512BW-NEXT: vpandq %zmm5, %zmm2, %zmm2 -; AVX512BW-NEXT: vpandq %zmm1, %zmm4, %zmm1 -; AVX512BW-NEXT: vpshufb %zmm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 @@ -466,29 +462,27 @@ ; ; AVX512DQ-LABEL: testv32i16u: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm5 -; AVX512DQ-NEXT: vpand %ymm2, %ymm5, %ymm5 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3 +; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm5, %ymm7 +; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm7 ; AVX512DQ-NEXT: vpand %ymm7, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512DQ-NEXT: vpaddb %ymm5, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm4 +; AVX512DQ-NEXT: vpaddb %ymm4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm0 ; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpaddw %ymm0, %ymm3, %ymm0 -; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm5 -; AVX512DQ-NEXT: vpand %ymm2, %ymm5, %ymm2 -; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm2, %ymm5 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3 +; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm4 +; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm5 ; AVX512DQ-NEXT: vpand %ymm5, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpaddb %ymm2, %ymm3, %ymm2 ; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1 @@ -555,40 +549,37 @@ ; ; AVX512BW-LABEL: testv64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vptestnmb %zmm3, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm4 ; AVX512BW-NEXT: vpandq %zmm4, %zmm2, %zmm2 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 +; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: testv64i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3 ; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm6 ; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512DQ-NEXT: vpaddb %ymm0, %ymm3, %ymm0 -; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3 ; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm2 -; AVX512DQ-NEXT: vpand %ymm2, %ymm3, %ymm2 -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512DQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm4 +; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vpaddb %ymm1, %ymm3, %ymm1 ; AVX512DQ-NEXT: retq %out = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %in, i1 0) ret <64 x i8> %out @@ -649,40 +640,37 @@ ; ; AVX512BW-LABEL: testv64i8u: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vptestnmb %zmm3, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm4 ; AVX512BW-NEXT: vpandq %zmm4, %zmm2, %zmm2 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 +; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: testv64i8u: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3 ; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm6 ; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512DQ-NEXT: vpaddb %ymm0, %ymm3, %ymm0 -; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3 ; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm2 -; AVX512DQ-NEXT: vpand %ymm2, %ymm3, %ymm2 -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512DQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm4 +; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vpaddb %ymm1, %ymm3, %ymm1 ; AVX512DQ-NEXT: retq %out = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %in, i1 -1) ret <64 x i8> %out