Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -864,6 +864,13 @@
     }
   }
 
+  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
+    setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
+    setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
+    // ISD::CTLZ v4i32 - scalarization is faster.
+    // ISD::CTLZ v2i64 - scalarization is faster.
+  }
+
   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
     for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
       setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
@@ -932,6 +939,8 @@
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
+    bool HasInt256 = Subtarget.hasInt256();
+
     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
@@ -998,14 +1007,24 @@
       setOperationAction(ISD::CTTZ, VT, Custom);
     }
 
+    // ISD::CTLZ v8i32/v4i64 - scalarization is faster without AVX2
+    // as we end up splitting the 256-bit vectors.
+    for (auto VT : { MVT::v32i8, MVT::v16i16 }) {
+      setOperationAction(ISD::CTLZ, VT, Custom);
+    }
+
+    if (HasInt256) {
+      for (auto VT : { MVT::v8i32, MVT::v4i64 }) {
+        setOperationAction(ISD::CTLZ, VT, Custom);
+      }
+    }
+
     if (Subtarget.hasAnyFMA()) {
       for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
                        MVT::v2f64, MVT::v4f64 })
         setOperationAction(ISD::FMA, VT, Legal);
     }
 
-    bool HasInt256 = Subtarget.hasInt256();
-
     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
       setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
       setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
@@ -18738,7 +18757,105 @@
   return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
 }
 
-static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
+// Lower CTLZ using a PSHUFB lookup table implementation.
+static SDValue LowerVectorCTLZInRegLUT(SDValue Op, SDLoc DL,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  MVT SVT = VT.getScalarType();
+  int NumElts = VT.getVectorNumElements();
+  int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
+  MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
+
+  // Per-nibble leading zero PSHUFB lookup table.
+  const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
+                       /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
+                       /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
+                       /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
+
+  SmallVector<SDValue, 64> LUTVec;
+  for (int i = 0; i < NumBytes; ++i)
+    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
+  SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, CurrVT, LUTVec);
+
+  // Begin by bitcasting the input to a byte vector, then split those bytes
+  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of
+  // them. If the hi input nibble is zero then we add both results together,
+  // otherwise we just take the hi result (by masking the lo result to zero
+  // before the add).
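+  // For example, for an input byte 0x2A the hi nibble is 0x2 (LUT count 2)
+  // and non-zero, so the lo result is masked away: ctlz(0x2A) = 2. For 0x0A
+  // the hi nibble is zero, so the counts add: LUT[0] + LUT[0xA] = 4 + 0 = 4.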
+  SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
+  SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
+
+  SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
+  SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
+  SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
+  SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
+  SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
+
+  Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
+  Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
+  Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
+  SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
+
+  // Merge the result back from vXi8 to VT, working on the lo/hi halves
+  // of the current vector width in the same way we did for the nibbles.
+  // If the upper half of the input element is zero then add the halves'
+  // leading zero counts together, otherwise just use the upper half's.
+  // Double the width of the result until we are at target width.
+  while (CurrVT != VT) {
+    int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
+    int CurrNumElts = CurrVT.getVectorNumElements();
+    MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
+    MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
+    SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
+
+    // Check if the upper half of the input element is zero.
+    SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
+                               DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
+    HiZ = DAG.getBitcast(NextVT, HiZ);
+
+    // Move the upper/lower halves to the lower bits as we'll be extending to
+    // NextVT. Mask the lower result to zero if HiZ is true and add the
+    // results together.
+    SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
+    SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
+    SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
+    R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
+    Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
+    CurrVT = NextVT;
+  }
+
+  return Res;
+}
+
+static SDValue LowerVectorCTLZ(SDValue Op, SDLoc DL,
+                               const X86Subtarget &Subtarget,
+                               SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  SDValue Op0 = Op.getOperand(0);
+
+  if (Subtarget.hasAVX512())
+    return LowerVectorCTLZ_AVX512(Op, DAG);
+
+  // Decompose 256-bit ops into smaller 128-bit ops.
+  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
+    unsigned NumElems = VT.getVectorNumElements();
+
+    // Extract each 128-bit vector, perform ctlz and concat the result.
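+    // e.g. without AVX2, a v32i8 ctlz is lowered as one v16i8 ctlz per xmm
+    // half, and the two halves are concatenated back into a ymm result.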
+    SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
+    SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
+
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
+                       DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS),
+                       DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS));
+  }
+
+  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
+  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
+}
+
+static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
+                         SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
   MVT OpVT = VT;
   unsigned NumBits = VT.getSizeInBits();
@@ -18746,7 +18863,7 @@
   unsigned Opc = Op.getOpcode();
 
   if (VT.isVector())
-    return LowerVectorCTLZ_AVX512(Op, DAG);
+    return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
 
   Op = Op.getOperand(0);
   if (VT == MVT::i8) {
@@ -21275,7 +21392,7 @@
   case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
   case ISD::FLT_ROUNDS_:       return LowerFLT_ROUNDS_(Op, DAG);
   case ISD::CTLZ:
-  case ISD::CTLZ_ZERO_UNDEF:   return LowerCTLZ(Op, DAG);
+  case ISD::CTLZ_ZERO_UNDEF:   return LowerCTLZ(Op, Subtarget, DAG);
   case ISD::CTTZ:
   case ISD::CTTZ_ZERO_UNDEF:   return LowerCTTZ(Op, DAG);
   case ISD::MUL:               return LowerMUL(Op, Subtarget, DAG);
Index: test/CodeGen/X86/vector-lzcnt-128.ll
===================================================================
--- test/CodeGen/X86/vector-lzcnt-128.ll
+++ test/CodeGen/X86/vector-lzcnt-128.ll
@@ -706,145 +706,70 @@
 ;
 ; SSSE3-LABEL: testv8i16:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pextrw $7, %xmm0, %eax
-; SSSE3-NEXT:    bsrw %ax, %cx
-; SSSE3-NEXT:    movw $31, %ax
-; SSSE3-NEXT:    cmovew %ax, %cx
-; SSSE3-NEXT:    xorl $15, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm1
-; SSSE3-NEXT:    pextrw $3, %xmm0, %ecx
-; SSSE3-NEXT:    bsrw %cx, %cx
-; SSSE3-NEXT:    cmovew %ax, %cx
-; SSSE3-NEXT:    xorl $15, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm2
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT:    pextrw $5, %xmm0, %ecx
-; SSSE3-NEXT:    bsrw %cx, %cx
-; SSSE3-NEXT:    cmovew %ax, %cx
-; SSSE3-NEXT:    xorl $15, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm3
-; SSSE3-NEXT:    pextrw $1, %xmm0, %ecx
-; SSSE3-NEXT:    bsrw %cx, %cx
-; SSSE3-NEXT:    cmovew %ax, %cx
-; SSSE3-NEXT:    xorl $15, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm1
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSSE3-NEXT:    pextrw $6, %xmm0, %ecx
-; SSSE3-NEXT:    bsrw %cx, %cx
-; SSSE3-NEXT:    cmovew %ax, %cx
-; SSSE3-NEXT:    xorl $15, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm2
-; SSSE3-NEXT:    pextrw $2, %xmm0, %ecx
-; SSSE3-NEXT:    bsrw %cx, %cx
-; SSSE3-NEXT:    cmovew %ax, %cx
-; SSSE3-NEXT:    xorl $15, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm3
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSSE3-NEXT:    pextrw $4, %xmm0, %ecx
-; SSSE3-NEXT:    bsrw %cx, %cx
-; SSSE3-NEXT:    cmovew %ax, %cx
-; SSSE3-NEXT:    xorl $15, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm2
-; SSSE3-NEXT:    movd %xmm0, %ecx
-; SSSE3-NEXT:    bsrw %cx, %cx
-; SSSE3-NEXT:    cmovew %ax, %cx
-; SSSE3-NEXT:    xorl $15, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm0
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    movdqa {{.*#+}} 
xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pshufb %xmm1, %xmm4 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: psrlw $4, %xmm1 +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm3 +; SSSE3-NEXT: pcmpeqb %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: paddb %xmm3, %xmm1 +; SSSE3-NEXT: pcmpeqb %xmm2, %xmm0 +; SSSE3-NEXT: psrlw $8, %xmm0 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: psrlw $8, %xmm1 +; SSSE3-NEXT: paddw %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv8i16: ; SSE41: # BB#0: -; SSE41-NEXT: pextrw $1, %xmm0, %eax -; SSE41-NEXT: bsrw %ax, %cx -; SSE41-NEXT: movw $31, %ax -; SSE41-NEXT: cmovew %ax, %cx -; SSE41-NEXT: xorl $15, %ecx -; SSE41-NEXT: movd %xmm0, %edx -; SSE41-NEXT: bsrw %dx, %dx -; SSE41-NEXT: cmovew %ax, %dx -; SSE41-NEXT: xorl $15, %edx -; SSE41-NEXT: movd %edx, %xmm1 -; SSE41-NEXT: pinsrw $1, %ecx, %xmm1 -; SSE41-NEXT: pextrw $2, %xmm0, %ecx -; SSE41-NEXT: bsrw %cx, %cx -; SSE41-NEXT: cmovew %ax, %cx -; SSE41-NEXT: xorl $15, %ecx -; SSE41-NEXT: pinsrw $2, %ecx, %xmm1 -; SSE41-NEXT: pextrw $3, %xmm0, %ecx -; SSE41-NEXT: bsrw %cx, %cx -; SSE41-NEXT: cmovew %ax, %cx -; SSE41-NEXT: xorl $15, %ecx -; SSE41-NEXT: pinsrw $3, %ecx, %xmm1 -; SSE41-NEXT: pextrw $4, %xmm0, %ecx -; SSE41-NEXT: bsrw %cx, %cx -; SSE41-NEXT: cmovew %ax, %cx -; SSE41-NEXT: xorl $15, %ecx -; SSE41-NEXT: pinsrw $4, %ecx, %xmm1 -; SSE41-NEXT: pextrw $5, %xmm0, %ecx -; SSE41-NEXT: bsrw %cx, %cx -; SSE41-NEXT: cmovew %ax, %cx -; SSE41-NEXT: xorl $15, %ecx -; SSE41-NEXT: pinsrw $5, %ecx, %xmm1 -; SSE41-NEXT: pextrw $6, %xmm0, %ecx -; SSE41-NEXT: bsrw %cx, %cx -; SSE41-NEXT: cmovew %ax, %cx -; SSE41-NEXT: xorl $15, %ecx -; SSE41-NEXT: pinsrw $6, %ecx, %xmm1 -; SSE41-NEXT: pextrw $7, %xmm0, %ecx -; SSE41-NEXT: bsrw %cx, %cx -; SSE41-NEXT: cmovew %ax, %cx -; SSE41-NEXT: xorl $15, %ecx -; SSE41-NEXT: pinsrw $7, %ecx, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pshufb %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $4, %xmm1 +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pshufb %xmm1, %xmm3 +; SSE41-NEXT: pcmpeqb %xmm2, %xmm1 +; SSE41-NEXT: pand %xmm4, %xmm1 +; SSE41-NEXT: paddb %xmm3, %xmm1 +; SSE41-NEXT: pcmpeqb %xmm2, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: paddw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv8i16: ; AVX: # BB#0: -; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: bsrw %ax, %cx -; AVX-NEXT: movw $31, %ax -; AVX-NEXT: cmovew %ax, %cx -; AVX-NEXT: xorl $15, %ecx -; AVX-NEXT: vmovd %xmm0, %edx -; AVX-NEXT: bsrw %dx, %dx -; AVX-NEXT: cmovew %ax, %dx -; AVX-NEXT: xorl $15, %edx -; AVX-NEXT: vmovd %edx, %xmm1 -; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $2, %xmm0, %ecx -; AVX-NEXT: bsrw %cx, %cx -; AVX-NEXT: cmovew %ax, %cx -; AVX-NEXT: xorl $15, %ecx -; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $3, %xmm0, %ecx -; AVX-NEXT: bsrw %cx, %cx -; AVX-NEXT: 
cmovew %ax, %cx -; AVX-NEXT: xorl $15, %ecx -; AVX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $4, %xmm0, %ecx -; AVX-NEXT: bsrw %cx, %cx -; AVX-NEXT: cmovew %ax, %cx -; AVX-NEXT: xorl $15, %ecx -; AVX-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $5, %xmm0, %ecx -; AVX-NEXT: bsrw %cx, %cx -; AVX-NEXT: cmovew %ax, %cx -; AVX-NEXT: xorl $15, %ecx -; AVX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $6, %xmm0, %ecx -; AVX-NEXT: bsrw %cx, %cx -; AVX-NEXT: cmovew %ax, %cx -; AVX-NEXT: xorl $15, %ecx -; AVX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $7, %xmm0, %ecx -; AVX-NEXT: bsrw %cx, %cx -; AVX-NEXT: cmovew %ax, %cx -; AVX-NEXT: xorl $15, %ecx -; AVX-NEXT: vpinsrw $7, %ecx, %xmm1, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; ; AVX512VLCD-LABEL: testv8i16: @@ -865,47 +790,25 @@ ; ; X32-SSE-LABEL: testv8i16: ; X32-SSE: # BB#0: -; X32-SSE-NEXT: pextrw $1, %xmm0, %eax -; X32-SSE-NEXT: bsrw %ax, %cx -; X32-SSE-NEXT: movw $31, %ax -; X32-SSE-NEXT: cmovew %ax, %cx -; X32-SSE-NEXT: xorl $15, %ecx -; X32-SSE-NEXT: movd %xmm0, %edx -; X32-SSE-NEXT: bsrw %dx, %dx -; X32-SSE-NEXT: cmovew %ax, %dx -; X32-SSE-NEXT: xorl $15, %edx -; X32-SSE-NEXT: movd %edx, %xmm1 -; X32-SSE-NEXT: pinsrw $1, %ecx, %xmm1 -; X32-SSE-NEXT: pextrw $2, %xmm0, %ecx -; X32-SSE-NEXT: bsrw %cx, %cx -; X32-SSE-NEXT: cmovew %ax, %cx -; X32-SSE-NEXT: xorl $15, %ecx -; X32-SSE-NEXT: pinsrw $2, %ecx, %xmm1 -; X32-SSE-NEXT: pextrw $3, %xmm0, %ecx -; X32-SSE-NEXT: bsrw %cx, %cx -; X32-SSE-NEXT: cmovew %ax, %cx -; X32-SSE-NEXT: xorl $15, %ecx -; X32-SSE-NEXT: pinsrw $3, %ecx, %xmm1 -; X32-SSE-NEXT: pextrw $4, %xmm0, %ecx -; X32-SSE-NEXT: bsrw %cx, %cx -; X32-SSE-NEXT: cmovew %ax, %cx -; X32-SSE-NEXT: xorl $15, %ecx -; X32-SSE-NEXT: pinsrw $4, %ecx, %xmm1 -; X32-SSE-NEXT: pextrw $5, %xmm0, %ecx -; X32-SSE-NEXT: bsrw %cx, %cx -; X32-SSE-NEXT: cmovew %ax, %cx -; X32-SSE-NEXT: xorl $15, %ecx -; X32-SSE-NEXT: pinsrw $5, %ecx, %xmm1 -; X32-SSE-NEXT: pextrw $6, %xmm0, %ecx -; X32-SSE-NEXT: bsrw %cx, %cx -; X32-SSE-NEXT: cmovew %ax, %cx -; X32-SSE-NEXT: xorl $15, %ecx -; X32-SSE-NEXT: pinsrw $6, %ecx, %xmm1 -; X32-SSE-NEXT: pextrw $7, %xmm0, %ecx -; X32-SSE-NEXT: bsrw %cx, %cx -; X32-SSE-NEXT: cmovew %ax, %cx -; X32-SSE-NEXT: xorl $15, %ecx -; X32-SSE-NEXT: pinsrw $7, %ecx, %xmm1 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-SSE-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE-NEXT: pand %xmm2, %xmm1 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pshufb %xmm1, %xmm4 +; X32-SSE-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE-NEXT: psrlw $4, %xmm1 +; X32-SSE-NEXT: pand %xmm2, %xmm1 +; X32-SSE-NEXT: pxor %xmm2, %xmm2 +; X32-SSE-NEXT: pshufb %xmm1, %xmm3 +; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1 +; X32-SSE-NEXT: pand 
%xmm4, %xmm1 +; X32-SSE-NEXT: paddb %xmm3, %xmm1 +; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm0 +; X32-SSE-NEXT: psrlw $8, %xmm0 +; X32-SSE-NEXT: pand %xmm1, %xmm0 +; X32-SSE-NEXT: psrlw $8, %xmm1 +; X32-SSE-NEXT: paddw %xmm0, %xmm1 ; X32-SSE-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE-NEXT: retl %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 0) @@ -1001,118 +904,70 @@ ; ; SSSE3-LABEL: testv8i16u: ; SSSE3: # BB#0: -; SSSE3-NEXT: pextrw $7, %xmm0, %eax -; SSSE3-NEXT: bsrw %ax, %ax -; SSSE3-NEXT: xorl $15, %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: pextrw $3, %xmm0, %eax -; SSSE3-NEXT: bsrw %ax, %ax -; SSSE3-NEXT: xorl $15, %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSSE3-NEXT: pextrw $5, %xmm0, %eax -; SSSE3-NEXT: bsrw %ax, %ax -; SSSE3-NEXT: xorl $15, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: pextrw $1, %xmm0, %eax -; SSSE3-NEXT: bsrw %ax, %ax -; SSSE3-NEXT: xorl $15, %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSSE3-NEXT: pextrw $6, %xmm0, %eax -; SSSE3-NEXT: bsrw %ax, %ax -; SSSE3-NEXT: xorl $15, %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: pextrw $2, %xmm0, %eax -; SSSE3-NEXT: bsrw %ax, %ax -; SSSE3-NEXT: xorl $15, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSSE3-NEXT: pextrw $4, %xmm0, %eax -; SSSE3-NEXT: bsrw %ax, %ax -; SSSE3-NEXT: xorl $15, %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: movd %xmm0, %eax -; SSSE3-NEXT: bsrw %ax, %ax -; SSSE3-NEXT: xorl $15, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pshufb %xmm1, %xmm4 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: psrlw $4, %xmm1 +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm3 +; SSSE3-NEXT: pcmpeqb %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: paddb %xmm3, %xmm1 +; SSSE3-NEXT: pcmpeqb %xmm2, %xmm0 +; SSSE3-NEXT: psrlw $8, %xmm0 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: psrlw $8, %xmm1 +; SSSE3-NEXT: paddw %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv8i16u: ; SSE41: # BB#0: -; SSE41-NEXT: pextrw $1, %xmm0, %eax -; SSE41-NEXT: bsrw %ax, %ax -; SSE41-NEXT: xorl $15, %eax -; SSE41-NEXT: movd %xmm0, %ecx -; SSE41-NEXT: bsrw %cx, %cx -; SSE41-NEXT: xorl $15, %ecx -; SSE41-NEXT: movd %ecx, %xmm1 -; SSE41-NEXT: pinsrw $1, %eax, %xmm1 -; SSE41-NEXT: pextrw $2, %xmm0, %eax -; SSE41-NEXT: bsrw %ax, %ax -; SSE41-NEXT: xorl $15, %eax -; SSE41-NEXT: pinsrw $2, %eax, %xmm1 -; SSE41-NEXT: pextrw $3, %xmm0, %eax -; SSE41-NEXT: bsrw %ax, %ax -; SSE41-NEXT: xorl $15, %eax -; SSE41-NEXT: pinsrw $3, %eax, %xmm1 -; SSE41-NEXT: pextrw 
$4, %xmm0, %eax -; SSE41-NEXT: bsrw %ax, %ax -; SSE41-NEXT: xorl $15, %eax -; SSE41-NEXT: pinsrw $4, %eax, %xmm1 -; SSE41-NEXT: pextrw $5, %xmm0, %eax -; SSE41-NEXT: bsrw %ax, %ax -; SSE41-NEXT: xorl $15, %eax -; SSE41-NEXT: pinsrw $5, %eax, %xmm1 -; SSE41-NEXT: pextrw $6, %xmm0, %eax -; SSE41-NEXT: bsrw %ax, %ax -; SSE41-NEXT: xorl $15, %eax -; SSE41-NEXT: pinsrw $6, %eax, %xmm1 -; SSE41-NEXT: pextrw $7, %xmm0, %eax -; SSE41-NEXT: bsrw %ax, %ax -; SSE41-NEXT: xorl $15, %eax -; SSE41-NEXT: pinsrw $7, %eax, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pshufb %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $4, %xmm1 +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pshufb %xmm1, %xmm3 +; SSE41-NEXT: pcmpeqb %xmm2, %xmm1 +; SSE41-NEXT: pand %xmm4, %xmm1 +; SSE41-NEXT: paddb %xmm3, %xmm1 +; SSE41-NEXT: pcmpeqb %xmm2, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: paddw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv8i16u: ; AVX: # BB#0: -; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: bsrw %ax, %ax -; AVX-NEXT: xorl $15, %eax -; AVX-NEXT: vmovd %xmm0, %ecx -; AVX-NEXT: bsrw %cx, %cx -; AVX-NEXT: xorl $15, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: bsrw %ax, %ax -; AVX-NEXT: xorl $15, %eax -; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: bsrw %ax, %ax -; AVX-NEXT: xorl $15, %eax -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $4, %xmm0, %eax -; AVX-NEXT: bsrw %ax, %ax -; AVX-NEXT: xorl $15, %eax -; AVX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $5, %xmm0, %eax -; AVX-NEXT: bsrw %ax, %ax -; AVX-NEXT: xorl $15, %eax -; AVX-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $6, %xmm0, %eax -; AVX-NEXT: bsrw %ax, %ax -; AVX-NEXT: xorl $15, %eax -; AVX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $7, %xmm0, %eax -; AVX-NEXT: bsrw %ax, %ax -; AVX-NEXT: xorl $15, %eax -; AVX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; ; AVX512VLCD-LABEL: testv8i16u: @@ -1133,38 +988,25 @@ ; ; X32-SSE-LABEL: testv8i16u: ; X32-SSE: # BB#0: -; X32-SSE-NEXT: pextrw $1, %xmm0, %eax -; X32-SSE-NEXT: bsrw %ax, %ax -; X32-SSE-NEXT: xorl $15, %eax -; X32-SSE-NEXT: movd %xmm0, %ecx -; X32-SSE-NEXT: bsrw %cx, %cx -; X32-SSE-NEXT: xorl $15, %ecx -; X32-SSE-NEXT: movd %ecx, %xmm1 -; X32-SSE-NEXT: pinsrw $1, %eax, %xmm1 -; X32-SSE-NEXT: pextrw $2, 
%xmm0, %eax -; X32-SSE-NEXT: bsrw %ax, %ax -; X32-SSE-NEXT: xorl $15, %eax -; X32-SSE-NEXT: pinsrw $2, %eax, %xmm1 -; X32-SSE-NEXT: pextrw $3, %xmm0, %eax -; X32-SSE-NEXT: bsrw %ax, %ax -; X32-SSE-NEXT: xorl $15, %eax -; X32-SSE-NEXT: pinsrw $3, %eax, %xmm1 -; X32-SSE-NEXT: pextrw $4, %xmm0, %eax -; X32-SSE-NEXT: bsrw %ax, %ax -; X32-SSE-NEXT: xorl $15, %eax -; X32-SSE-NEXT: pinsrw $4, %eax, %xmm1 -; X32-SSE-NEXT: pextrw $5, %xmm0, %eax -; X32-SSE-NEXT: bsrw %ax, %ax -; X32-SSE-NEXT: xorl $15, %eax -; X32-SSE-NEXT: pinsrw $5, %eax, %xmm1 -; X32-SSE-NEXT: pextrw $6, %xmm0, %eax -; X32-SSE-NEXT: bsrw %ax, %ax -; X32-SSE-NEXT: xorl $15, %eax -; X32-SSE-NEXT: pinsrw $6, %eax, %xmm1 -; X32-SSE-NEXT: pextrw $7, %xmm0, %eax -; X32-SSE-NEXT: bsrw %ax, %ax -; X32-SSE-NEXT: xorl $15, %eax -; X32-SSE-NEXT: pinsrw $7, %eax, %xmm1 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-SSE-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE-NEXT: pand %xmm2, %xmm1 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pshufb %xmm1, %xmm4 +; X32-SSE-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE-NEXT: psrlw $4, %xmm1 +; X32-SSE-NEXT: pand %xmm2, %xmm1 +; X32-SSE-NEXT: pxor %xmm2, %xmm2 +; X32-SSE-NEXT: pshufb %xmm1, %xmm3 +; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1 +; X32-SSE-NEXT: pand %xmm4, %xmm1 +; X32-SSE-NEXT: paddb %xmm3, %xmm1 +; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm0 +; X32-SSE-NEXT: psrlw $8, %xmm0 +; X32-SSE-NEXT: pand %xmm1, %xmm0 +; X32-SSE-NEXT: psrlw $8, %xmm1 +; X32-SSE-NEXT: paddw %xmm0, %xmm1 ; X32-SSE-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE-NEXT: retl %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 -1) @@ -1384,278 +1226,53 @@ ; ; SSSE3-LABEL: testv16i8: ; SSSE3: # BB#0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSSE3-NEXT: bsrl %eax, %ecx -; SSSE3-NEXT: movl $15, %eax -; SSSE3-NEXT: cmovel %eax, %ecx -; SSSE3-NEXT: xorl $7, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSSE3-NEXT: bsrl %ecx, %ecx -; SSSE3-NEXT: cmovel %eax, %ecx -; SSSE3-NEXT: xorl $7, %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: bsrl %edx, %ecx -; SSSE3-NEXT: cmovel %eax, %ecx -; SSSE3-NEXT: xorl $7, %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSSE3-NEXT: bsrl %ebp, %ebp -; SSSE3-NEXT: cmovel %eax, %ebp -; SSSE3-NEXT: xorl $7, %ebp -; SSSE3-NEXT: movd %ebp, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: bsrl %edi, 
%edi -; SSSE3-NEXT: cmovel %eax, %edi -; SSSE3-NEXT: xorl $7, %edi -; SSSE3-NEXT: movd %edi, %xmm1 -; SSSE3-NEXT: bsrl %ecx, %ecx -; SSSE3-NEXT: cmovel %eax, %ecx -; SSSE3-NEXT: xorl $7, %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSSE3-NEXT: bsrl %esi, %ecx -; SSSE3-NEXT: cmovel %eax, %ecx -; SSSE3-NEXT: xorl $7, %ecx -; SSSE3-NEXT: movd %ecx, %xmm3 -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSSE3-NEXT: bsrl %ecx, %ecx -; SSSE3-NEXT: cmovel %eax, %ecx -; SSSE3-NEXT: xorl $7, %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: bsrl %ebx, %ecx -; SSSE3-NEXT: cmovel %eax, %ecx -; SSSE3-NEXT: xorl $7, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: bsrl %edx, %ecx -; SSSE3-NEXT: cmovel %eax, %ecx -; SSSE3-NEXT: xorl $7, %ecx -; SSSE3-NEXT: movd %ecx, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSSE3-NEXT: bsrl %r11d, %ecx -; SSSE3-NEXT: cmovel %eax, %ecx -; SSSE3-NEXT: xorl $7, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: bsrl %esi, %ecx -; SSSE3-NEXT: cmovel %eax, %ecx -; SSSE3-NEXT: xorl $7, %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSSE3-NEXT: bsrl %r9d, %ecx -; SSSE3-NEXT: cmovel %eax, %ecx -; SSSE3-NEXT: xorl $7, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: bsrl %r10d, %ecx -; SSSE3-NEXT: cmovel %eax, %ecx -; SSSE3-NEXT: xorl $7, %ecx -; SSSE3-NEXT: movd %ecx, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSSE3-NEXT: bsrl %r8d, %ecx -; SSSE3-NEXT: cmovel %eax, %ecx -; SSSE3-NEXT: xorl $7, %ecx -; SSSE3-NEXT: movd %ecx, %xmm4 -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSSE3-NEXT: bsrl %ecx, %ecx -; SSSE3-NEXT: cmovel %eax, %ecx -; SSSE3-NEXT: xorl $7, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pshufb %xmm3, %xmm4 +; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpeqb %xmm0, %xmm2 +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: pshufb %xmm0, %xmm1 +; SSSE3-NEXT: paddb %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv16i8: ; SSE41: # BB#0: -; SSE41-NEXT: pextrb $1, %xmm0, %eax -; SSE41-NEXT: bsrl %eax, %ecx -; SSE41-NEXT: movl $15, %eax -; SSE41-NEXT: cmovel %eax, %ecx -; SSE41-NEXT: xorl $7, %ecx -; SSE41-NEXT: pextrb $0, %xmm0, %edx -; SSE41-NEXT: bsrl %edx, %edx -; SSE41-NEXT: cmovel %eax, %edx -; SSE41-NEXT: xorl $7, %edx -; SSE41-NEXT: movd %edx, %xmm1 -; SSE41-NEXT: pinsrb $1, %ecx, %xmm1 -; SSE41-NEXT: pextrb $2, %xmm0, %ecx -; SSE41-NEXT: bsrl %ecx, %ecx -; SSE41-NEXT: cmovel %eax, %ecx -; SSE41-NEXT: xorl $7, %ecx -; SSE41-NEXT: pinsrb $2, %ecx, %xmm1 -; SSE41-NEXT: pextrb $3, %xmm0, %ecx -; SSE41-NEXT: bsrl %ecx, %ecx -; SSE41-NEXT: cmovel %eax, %ecx -; SSE41-NEXT: xorl $7, %ecx -; SSE41-NEXT: pinsrb $3, %ecx, %xmm1 -; SSE41-NEXT: pextrb $4, %xmm0, %ecx -; SSE41-NEXT: bsrl %ecx, %ecx -; SSE41-NEXT: cmovel %eax, %ecx -; SSE41-NEXT: xorl $7, %ecx -; SSE41-NEXT: pinsrb $4, %ecx, %xmm1 -; SSE41-NEXT: pextrb $5, %xmm0, %ecx -; SSE41-NEXT: bsrl %ecx, %ecx -; SSE41-NEXT: cmovel %eax, %ecx -; SSE41-NEXT: xorl $7, %ecx -; SSE41-NEXT: pinsrb $5, %ecx, %xmm1 -; SSE41-NEXT: pextrb $6, %xmm0, %ecx -; SSE41-NEXT: bsrl %ecx, %ecx -; SSE41-NEXT: cmovel %eax, %ecx -; SSE41-NEXT: xorl $7, %ecx -; SSE41-NEXT: pinsrb $6, %ecx, %xmm1 -; SSE41-NEXT: pextrb $7, %xmm0, %ecx -; SSE41-NEXT: bsrl %ecx, %ecx -; SSE41-NEXT: cmovel %eax, %ecx -; SSE41-NEXT: xorl $7, %ecx -; SSE41-NEXT: pinsrb $7, %ecx, %xmm1 -; SSE41-NEXT: pextrb $8, %xmm0, %ecx -; SSE41-NEXT: bsrl %ecx, %ecx -; SSE41-NEXT: cmovel %eax, %ecx -; SSE41-NEXT: xorl $7, %ecx -; SSE41-NEXT: pinsrb $8, %ecx, %xmm1 -; SSE41-NEXT: pextrb $9, %xmm0, %ecx -; SSE41-NEXT: bsrl %ecx, %ecx -; SSE41-NEXT: cmovel %eax, %ecx -; SSE41-NEXT: xorl $7, %ecx -; SSE41-NEXT: pinsrb $9, %ecx, %xmm1 -; SSE41-NEXT: pextrb $10, %xmm0, %ecx -; SSE41-NEXT: bsrl %ecx, %ecx -; SSE41-NEXT: cmovel %eax, %ecx -; SSE41-NEXT: xorl $7, %ecx -; SSE41-NEXT: pinsrb $10, %ecx, %xmm1 -; SSE41-NEXT: pextrb $11, %xmm0, %ecx -; SSE41-NEXT: bsrl %ecx, %ecx -; SSE41-NEXT: cmovel %eax, %ecx -; SSE41-NEXT: xorl $7, %ecx -; SSE41-NEXT: pinsrb $11, %ecx, %xmm1 -; SSE41-NEXT: pextrb $12, %xmm0, %ecx -; SSE41-NEXT: bsrl %ecx, %ecx -; SSE41-NEXT: cmovel %eax, %ecx -; SSE41-NEXT: xorl $7, %ecx -; SSE41-NEXT: pinsrb $12, %ecx, %xmm1 -; SSE41-NEXT: pextrb $13, %xmm0, %ecx -; SSE41-NEXT: bsrl %ecx, %ecx -; SSE41-NEXT: cmovel %eax, %ecx -; SSE41-NEXT: xorl $7, %ecx -; SSE41-NEXT: pinsrb $13, %ecx, %xmm1 -; SSE41-NEXT: pextrb $14, %xmm0, %ecx -; SSE41-NEXT: bsrl %ecx, %ecx -; SSE41-NEXT: cmovel %eax, %ecx -; SSE41-NEXT: xorl $7, %ecx -; SSE41-NEXT: pinsrb $14, %ecx, %xmm1 -; SSE41-NEXT: pextrb $15, %xmm0, %ecx -; SSE41-NEXT: bsrl %ecx, %ecx -; SSE41-NEXT: cmovel %eax, %ecx -; SSE41-NEXT: xorl $7, %ecx -; SSE41-NEXT: pinsrb $15, %ecx, %xmm1 +; 
SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: pshufb %xmm3, %xmm4 +; SSE41-NEXT: psrlw $4, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pcmpeqb %xmm0, %xmm2 +; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: pshufb %xmm0, %xmm1 +; SSE41-NEXT: paddb %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv16i8: ; AVX: # BB#0: -; AVX-NEXT: vpextrb $1, %xmm0, %eax -; AVX-NEXT: bsrl %eax, %ecx -; AVX-NEXT: movl $15, %eax -; AVX-NEXT: cmovel %eax, %ecx -; AVX-NEXT: xorl $7, %ecx -; AVX-NEXT: vpextrb $0, %xmm0, %edx -; AVX-NEXT: bsrl %edx, %edx -; AVX-NEXT: cmovel %eax, %edx -; AVX-NEXT: xorl $7, %edx -; AVX-NEXT: vmovd %edx, %xmm1 -; AVX-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $2, %xmm0, %ecx -; AVX-NEXT: bsrl %ecx, %ecx -; AVX-NEXT: cmovel %eax, %ecx -; AVX-NEXT: xorl $7, %ecx -; AVX-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $3, %xmm0, %ecx -; AVX-NEXT: bsrl %ecx, %ecx -; AVX-NEXT: cmovel %eax, %ecx -; AVX-NEXT: xorl $7, %ecx -; AVX-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $4, %xmm0, %ecx -; AVX-NEXT: bsrl %ecx, %ecx -; AVX-NEXT: cmovel %eax, %ecx -; AVX-NEXT: xorl $7, %ecx -; AVX-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $5, %xmm0, %ecx -; AVX-NEXT: bsrl %ecx, %ecx -; AVX-NEXT: cmovel %eax, %ecx -; AVX-NEXT: xorl $7, %ecx -; AVX-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $6, %xmm0, %ecx -; AVX-NEXT: bsrl %ecx, %ecx -; AVX-NEXT: cmovel %eax, %ecx -; AVX-NEXT: xorl $7, %ecx -; AVX-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $7, %xmm0, %ecx -; AVX-NEXT: bsrl %ecx, %ecx -; AVX-NEXT: cmovel %eax, %ecx -; AVX-NEXT: xorl $7, %ecx -; AVX-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $8, %xmm0, %ecx -; AVX-NEXT: bsrl %ecx, %ecx -; AVX-NEXT: cmovel %eax, %ecx -; AVX-NEXT: xorl $7, %ecx -; AVX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $9, %xmm0, %ecx -; AVX-NEXT: bsrl %ecx, %ecx -; AVX-NEXT: cmovel %eax, %ecx -; AVX-NEXT: xorl $7, %ecx -; AVX-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $10, %xmm0, %ecx -; AVX-NEXT: bsrl %ecx, %ecx -; AVX-NEXT: cmovel %eax, %ecx -; AVX-NEXT: xorl $7, %ecx -; AVX-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $11, %xmm0, %ecx -; AVX-NEXT: bsrl %ecx, %ecx -; AVX-NEXT: cmovel %eax, %ecx -; AVX-NEXT: xorl $7, %ecx -; AVX-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $12, %xmm0, %ecx -; AVX-NEXT: bsrl %ecx, %ecx -; AVX-NEXT: cmovel %eax, %ecx -; AVX-NEXT: xorl $7, %ecx -; AVX-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $13, %xmm0, %ecx -; AVX-NEXT: bsrl %ecx, %ecx -; AVX-NEXT: cmovel %eax, %ecx -; AVX-NEXT: xorl $7, %ecx -; AVX-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $14, %xmm0, %ecx -; AVX-NEXT: bsrl %ecx, %ecx -; AVX-NEXT: cmovel %eax, %ecx -; AVX-NEXT: xorl $7, %ecx -; AVX-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $15, %xmm0, %ecx -; AVX-NEXT: bsrl %ecx, %ecx -; AVX-NEXT: cmovel %eax, %ecx -; AVX-NEXT: xorl $7, %ecx -; AVX-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX-NEXT: 
vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: testv16i8: @@ -1668,87 +1285,19 @@ ; ; X32-SSE-LABEL: testv16i8: ; X32-SSE: # BB#0: -; X32-SSE-NEXT: pextrb $1, %xmm0, %eax -; X32-SSE-NEXT: bsrl %eax, %ecx -; X32-SSE-NEXT: movl $15, %eax -; X32-SSE-NEXT: cmovel %eax, %ecx -; X32-SSE-NEXT: xorl $7, %ecx -; X32-SSE-NEXT: pextrb $0, %xmm0, %edx -; X32-SSE-NEXT: bsrl %edx, %edx -; X32-SSE-NEXT: cmovel %eax, %edx -; X32-SSE-NEXT: xorl $7, %edx -; X32-SSE-NEXT: movd %edx, %xmm1 -; X32-SSE-NEXT: pinsrb $1, %ecx, %xmm1 -; X32-SSE-NEXT: pextrb $2, %xmm0, %ecx -; X32-SSE-NEXT: bsrl %ecx, %ecx -; X32-SSE-NEXT: cmovel %eax, %ecx -; X32-SSE-NEXT: xorl $7, %ecx -; X32-SSE-NEXT: pinsrb $2, %ecx, %xmm1 -; X32-SSE-NEXT: pextrb $3, %xmm0, %ecx -; X32-SSE-NEXT: bsrl %ecx, %ecx -; X32-SSE-NEXT: cmovel %eax, %ecx -; X32-SSE-NEXT: xorl $7, %ecx -; X32-SSE-NEXT: pinsrb $3, %ecx, %xmm1 -; X32-SSE-NEXT: pextrb $4, %xmm0, %ecx -; X32-SSE-NEXT: bsrl %ecx, %ecx -; X32-SSE-NEXT: cmovel %eax, %ecx -; X32-SSE-NEXT: xorl $7, %ecx -; X32-SSE-NEXT: pinsrb $4, %ecx, %xmm1 -; X32-SSE-NEXT: pextrb $5, %xmm0, %ecx -; X32-SSE-NEXT: bsrl %ecx, %ecx -; X32-SSE-NEXT: cmovel %eax, %ecx -; X32-SSE-NEXT: xorl $7, %ecx -; X32-SSE-NEXT: pinsrb $5, %ecx, %xmm1 -; X32-SSE-NEXT: pextrb $6, %xmm0, %ecx -; X32-SSE-NEXT: bsrl %ecx, %ecx -; X32-SSE-NEXT: cmovel %eax, %ecx -; X32-SSE-NEXT: xorl $7, %ecx -; X32-SSE-NEXT: pinsrb $6, %ecx, %xmm1 -; X32-SSE-NEXT: pextrb $7, %xmm0, %ecx -; X32-SSE-NEXT: bsrl %ecx, %ecx -; X32-SSE-NEXT: cmovel %eax, %ecx -; X32-SSE-NEXT: xorl $7, %ecx -; X32-SSE-NEXT: pinsrb $7, %ecx, %xmm1 -; X32-SSE-NEXT: pextrb $8, %xmm0, %ecx -; X32-SSE-NEXT: bsrl %ecx, %ecx -; X32-SSE-NEXT: cmovel %eax, %ecx -; X32-SSE-NEXT: xorl $7, %ecx -; X32-SSE-NEXT: pinsrb $8, %ecx, %xmm1 -; X32-SSE-NEXT: pextrb $9, %xmm0, %ecx -; X32-SSE-NEXT: bsrl %ecx, %ecx -; X32-SSE-NEXT: cmovel %eax, %ecx -; X32-SSE-NEXT: xorl $7, %ecx -; X32-SSE-NEXT: pinsrb $9, %ecx, %xmm1 -; X32-SSE-NEXT: pextrb $10, %xmm0, %ecx -; X32-SSE-NEXT: bsrl %ecx, %ecx -; X32-SSE-NEXT: cmovel %eax, %ecx -; X32-SSE-NEXT: xorl $7, %ecx -; X32-SSE-NEXT: pinsrb $10, %ecx, %xmm1 -; X32-SSE-NEXT: pextrb $11, %xmm0, %ecx -; X32-SSE-NEXT: bsrl %ecx, %ecx -; X32-SSE-NEXT: cmovel %eax, %ecx -; X32-SSE-NEXT: xorl $7, %ecx -; X32-SSE-NEXT: pinsrb $11, %ecx, %xmm1 -; X32-SSE-NEXT: pextrb $12, %xmm0, %ecx -; X32-SSE-NEXT: bsrl %ecx, %ecx -; X32-SSE-NEXT: cmovel %eax, %ecx -; X32-SSE-NEXT: xorl $7, %ecx -; X32-SSE-NEXT: pinsrb $12, %ecx, %xmm1 -; X32-SSE-NEXT: pextrb $13, %xmm0, %ecx -; X32-SSE-NEXT: bsrl %ecx, %ecx -; X32-SSE-NEXT: cmovel %eax, %ecx -; X32-SSE-NEXT: xorl $7, %ecx -; X32-SSE-NEXT: pinsrb $13, %ecx, %xmm1 -; X32-SSE-NEXT: pextrb $14, %xmm0, %ecx -; X32-SSE-NEXT: bsrl %ecx, %ecx -; X32-SSE-NEXT: cmovel %eax, %ecx -; X32-SSE-NEXT: xorl $7, %ecx -; X32-SSE-NEXT: pinsrb $14, %ecx, %xmm1 -; X32-SSE-NEXT: pextrb $15, %xmm0, %ecx -; X32-SSE-NEXT: bsrl %ecx, %ecx -; X32-SSE-NEXT: cmovel %eax, %ecx -; X32-SSE-NEXT: xorl $7, %ecx -; X32-SSE-NEXT: pinsrb $15, %ecx, %xmm1 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-SSE-NEXT: movdqa %xmm0, %xmm3 +; X32-SSE-NEXT: pand %xmm2, %xmm3 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = 
[4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-SSE-NEXT: movdqa %xmm1, %xmm4 +; X32-SSE-NEXT: pshufb %xmm3, %xmm4 +; X32-SSE-NEXT: psrlw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: pxor %xmm2, %xmm2 +; X32-SSE-NEXT: pcmpeqb %xmm0, %xmm2 +; X32-SSE-NEXT: pand %xmm4, %xmm2 +; X32-SSE-NEXT: pshufb %xmm0, %xmm1 +; X32-SSE-NEXT: paddb %xmm2, %xmm1 ; X32-SSE-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE-NEXT: retl %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 0) @@ -1930,225 +1479,53 @@ ; ; SSSE3-LABEL: testv16i8u: ; SSSE3: # BB#0: -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSSE3-NEXT: bsrl %eax, %eax -; SSSE3-NEXT: xorl $7, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSSE3-NEXT: bsrl %esi, %esi -; SSSE3-NEXT: xorl $7, %esi -; SSSE3-NEXT: movd %esi, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: bsrl %eax, %eax -; SSSE3-NEXT: xorl $7, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx -; SSSE3-NEXT: bsrl %ebx, %ebx -; SSSE3-NEXT: xorl $7, %ebx -; SSSE3-NEXT: movd %ebx, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSSE3-NEXT: bsrl %edx, %edx -; SSSE3-NEXT: xorl $7, %edx -; SSSE3-NEXT: movd %edx, %xmm0 -; SSSE3-NEXT: bsrl %esi, %edx -; SSSE3-NEXT: xorl $7, %edx -; SSSE3-NEXT: movd %edx, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSSE3-NEXT: bsrl %ecx, %ecx -; SSSE3-NEXT: xorl $7, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSSE3-NEXT: bsrl %edx, %edx -; SSSE3-NEXT: xorl $7, %edx -; SSSE3-NEXT: movd %edx, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSSE3-NEXT: bsrl %edi, %edx -; SSSE3-NEXT: xorl $7, %edx -; SSSE3-NEXT: movd %edx, %xmm0 -; SSSE3-NEXT: bsrl %eax, %eax -; SSSE3-NEXT: xorl $7, %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: bsrl %r10d, %eax -; SSSE3-NEXT: xorl $7, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: bsrl %ecx, %eax -; SSSE3-NEXT: xorl $7, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSSE3-NEXT: bsrl %r9d, %eax -; SSSE3-NEXT: xorl $7, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: bsrl %r11d, %eax -; SSSE3-NEXT: xorl $7, %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: bsrl %r8d, %eax -; SSSE3-NEXT: xorl $7, %eax -; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSSE3-NEXT: bsrl %eax, %eax -; SSSE3-NEXT: xorl $7, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pshufb %xmm3, %xmm4 +; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpeqb %xmm0, %xmm2 +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: pshufb %xmm0, %xmm1 +; SSSE3-NEXT: paddb %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv16i8u: ; SSE41: # BB#0: -; SSE41-NEXT: pextrb $1, %xmm0, %eax -; SSE41-NEXT: bsrl %eax, %eax -; SSE41-NEXT: xorl $7, %eax -; SSE41-NEXT: pextrb $0, %xmm0, %ecx -; SSE41-NEXT: bsrl %ecx, %ecx -; SSE41-NEXT: xorl $7, %ecx -; SSE41-NEXT: movd %ecx, %xmm1 -; SSE41-NEXT: pinsrb $1, %eax, %xmm1 -; SSE41-NEXT: pextrb $2, %xmm0, %eax -; SSE41-NEXT: bsrl %eax, %eax -; SSE41-NEXT: xorl $7, %eax -; SSE41-NEXT: pinsrb $2, %eax, %xmm1 -; SSE41-NEXT: pextrb $3, %xmm0, %eax -; SSE41-NEXT: bsrl %eax, %eax -; SSE41-NEXT: xorl $7, %eax -; SSE41-NEXT: pinsrb $3, %eax, %xmm1 -; SSE41-NEXT: pextrb $4, %xmm0, %eax -; SSE41-NEXT: bsrl %eax, %eax -; SSE41-NEXT: xorl $7, %eax -; SSE41-NEXT: pinsrb $4, %eax, %xmm1 -; SSE41-NEXT: pextrb $5, %xmm0, %eax -; SSE41-NEXT: bsrl %eax, %eax -; SSE41-NEXT: xorl $7, %eax -; SSE41-NEXT: pinsrb $5, %eax, %xmm1 -; SSE41-NEXT: pextrb $6, %xmm0, %eax -; SSE41-NEXT: bsrl %eax, %eax -; SSE41-NEXT: xorl $7, %eax -; SSE41-NEXT: pinsrb $6, %eax, %xmm1 -; SSE41-NEXT: pextrb $7, %xmm0, %eax 
-; SSE41-NEXT: bsrl %eax, %eax -; SSE41-NEXT: xorl $7, %eax -; SSE41-NEXT: pinsrb $7, %eax, %xmm1 -; SSE41-NEXT: pextrb $8, %xmm0, %eax -; SSE41-NEXT: bsrl %eax, %eax -; SSE41-NEXT: xorl $7, %eax -; SSE41-NEXT: pinsrb $8, %eax, %xmm1 -; SSE41-NEXT: pextrb $9, %xmm0, %eax -; SSE41-NEXT: bsrl %eax, %eax -; SSE41-NEXT: xorl $7, %eax -; SSE41-NEXT: pinsrb $9, %eax, %xmm1 -; SSE41-NEXT: pextrb $10, %xmm0, %eax -; SSE41-NEXT: bsrl %eax, %eax -; SSE41-NEXT: xorl $7, %eax -; SSE41-NEXT: pinsrb $10, %eax, %xmm1 -; SSE41-NEXT: pextrb $11, %xmm0, %eax -; SSE41-NEXT: bsrl %eax, %eax -; SSE41-NEXT: xorl $7, %eax -; SSE41-NEXT: pinsrb $11, %eax, %xmm1 -; SSE41-NEXT: pextrb $12, %xmm0, %eax -; SSE41-NEXT: bsrl %eax, %eax -; SSE41-NEXT: xorl $7, %eax -; SSE41-NEXT: pinsrb $12, %eax, %xmm1 -; SSE41-NEXT: pextrb $13, %xmm0, %eax -; SSE41-NEXT: bsrl %eax, %eax -; SSE41-NEXT: xorl $7, %eax -; SSE41-NEXT: pinsrb $13, %eax, %xmm1 -; SSE41-NEXT: pextrb $14, %xmm0, %eax -; SSE41-NEXT: bsrl %eax, %eax -; SSE41-NEXT: xorl $7, %eax -; SSE41-NEXT: pinsrb $14, %eax, %xmm1 -; SSE41-NEXT: pextrb $15, %xmm0, %eax -; SSE41-NEXT: bsrl %eax, %eax -; SSE41-NEXT: xorl $7, %eax -; SSE41-NEXT: pinsrb $15, %eax, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: pshufb %xmm3, %xmm4 +; SSE41-NEXT: psrlw $4, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pcmpeqb %xmm0, %xmm2 +; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: pshufb %xmm0, %xmm1 +; SSE41-NEXT: paddb %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv16i8u: ; AVX: # BB#0: -; AVX-NEXT: vpextrb $1, %xmm0, %eax -; AVX-NEXT: bsrl %eax, %eax -; AVX-NEXT: xorl $7, %eax -; AVX-NEXT: vpextrb $0, %xmm0, %ecx -; AVX-NEXT: bsrl %ecx, %ecx -; AVX-NEXT: xorl $7, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $2, %xmm0, %eax -; AVX-NEXT: bsrl %eax, %eax -; AVX-NEXT: xorl $7, %eax -; AVX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $3, %xmm0, %eax -; AVX-NEXT: bsrl %eax, %eax -; AVX-NEXT: xorl $7, %eax -; AVX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $4, %xmm0, %eax -; AVX-NEXT: bsrl %eax, %eax -; AVX-NEXT: xorl $7, %eax -; AVX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $5, %xmm0, %eax -; AVX-NEXT: bsrl %eax, %eax -; AVX-NEXT: xorl $7, %eax -; AVX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $6, %xmm0, %eax -; AVX-NEXT: bsrl %eax, %eax -; AVX-NEXT: xorl $7, %eax -; AVX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $7, %xmm0, %eax -; AVX-NEXT: bsrl %eax, %eax -; AVX-NEXT: xorl $7, %eax -; AVX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $8, %xmm0, %eax -; AVX-NEXT: bsrl %eax, %eax -; AVX-NEXT: xorl $7, %eax -; AVX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $9, %xmm0, %eax -; AVX-NEXT: bsrl %eax, %eax -; AVX-NEXT: xorl $7, %eax -; AVX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $10, %xmm0, %eax -; AVX-NEXT: bsrl %eax, %eax -; AVX-NEXT: xorl $7, %eax -; AVX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $11, %xmm0, %eax -; AVX-NEXT: bsrl %eax, %eax -; AVX-NEXT: xorl $7, %eax -; AVX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $12, %xmm0, %eax -; AVX-NEXT: bsrl %eax, %eax -; AVX-NEXT: xorl $7, %eax 
-; AVX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $13, %xmm0, %eax -; AVX-NEXT: bsrl %eax, %eax -; AVX-NEXT: xorl $7, %eax -; AVX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $14, %xmm0, %eax -; AVX-NEXT: bsrl %eax, %eax -; AVX-NEXT: xorl $7, %eax -; AVX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $15, %xmm0, %eax -; AVX-NEXT: bsrl %eax, %eax -; AVX-NEXT: xorl $7, %eax -; AVX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: testv16i8u: @@ -2161,70 +1538,19 @@ ; ; X32-SSE-LABEL: testv16i8u: ; X32-SSE: # BB#0: -; X32-SSE-NEXT: pextrb $1, %xmm0, %eax -; X32-SSE-NEXT: bsrl %eax, %eax -; X32-SSE-NEXT: xorl $7, %eax -; X32-SSE-NEXT: pextrb $0, %xmm0, %ecx -; X32-SSE-NEXT: bsrl %ecx, %ecx -; X32-SSE-NEXT: xorl $7, %ecx -; X32-SSE-NEXT: movd %ecx, %xmm1 -; X32-SSE-NEXT: pinsrb $1, %eax, %xmm1 -; X32-SSE-NEXT: pextrb $2, %xmm0, %eax -; X32-SSE-NEXT: bsrl %eax, %eax -; X32-SSE-NEXT: xorl $7, %eax -; X32-SSE-NEXT: pinsrb $2, %eax, %xmm1 -; X32-SSE-NEXT: pextrb $3, %xmm0, %eax -; X32-SSE-NEXT: bsrl %eax, %eax -; X32-SSE-NEXT: xorl $7, %eax -; X32-SSE-NEXT: pinsrb $3, %eax, %xmm1 -; X32-SSE-NEXT: pextrb $4, %xmm0, %eax -; X32-SSE-NEXT: bsrl %eax, %eax -; X32-SSE-NEXT: xorl $7, %eax -; X32-SSE-NEXT: pinsrb $4, %eax, %xmm1 -; X32-SSE-NEXT: pextrb $5, %xmm0, %eax -; X32-SSE-NEXT: bsrl %eax, %eax -; X32-SSE-NEXT: xorl $7, %eax -; X32-SSE-NEXT: pinsrb $5, %eax, %xmm1 -; X32-SSE-NEXT: pextrb $6, %xmm0, %eax -; X32-SSE-NEXT: bsrl %eax, %eax -; X32-SSE-NEXT: xorl $7, %eax -; X32-SSE-NEXT: pinsrb $6, %eax, %xmm1 -; X32-SSE-NEXT: pextrb $7, %xmm0, %eax -; X32-SSE-NEXT: bsrl %eax, %eax -; X32-SSE-NEXT: xorl $7, %eax -; X32-SSE-NEXT: pinsrb $7, %eax, %xmm1 -; X32-SSE-NEXT: pextrb $8, %xmm0, %eax -; X32-SSE-NEXT: bsrl %eax, %eax -; X32-SSE-NEXT: xorl $7, %eax -; X32-SSE-NEXT: pinsrb $8, %eax, %xmm1 -; X32-SSE-NEXT: pextrb $9, %xmm0, %eax -; X32-SSE-NEXT: bsrl %eax, %eax -; X32-SSE-NEXT: xorl $7, %eax -; X32-SSE-NEXT: pinsrb $9, %eax, %xmm1 -; X32-SSE-NEXT: pextrb $10, %xmm0, %eax -; X32-SSE-NEXT: bsrl %eax, %eax -; X32-SSE-NEXT: xorl $7, %eax -; X32-SSE-NEXT: pinsrb $10, %eax, %xmm1 -; X32-SSE-NEXT: pextrb $11, %xmm0, %eax -; X32-SSE-NEXT: bsrl %eax, %eax -; X32-SSE-NEXT: xorl $7, %eax -; X32-SSE-NEXT: pinsrb $11, %eax, %xmm1 -; X32-SSE-NEXT: pextrb $12, %xmm0, %eax -; X32-SSE-NEXT: bsrl %eax, %eax -; X32-SSE-NEXT: xorl $7, %eax -; X32-SSE-NEXT: pinsrb $12, %eax, %xmm1 -; X32-SSE-NEXT: pextrb $13, %xmm0, %eax -; X32-SSE-NEXT: bsrl %eax, %eax -; X32-SSE-NEXT: xorl $7, %eax -; X32-SSE-NEXT: pinsrb $13, %eax, %xmm1 -; X32-SSE-NEXT: pextrb $14, %xmm0, %eax -; X32-SSE-NEXT: bsrl %eax, %eax -; X32-SSE-NEXT: xorl $7, %eax -; X32-SSE-NEXT: pinsrb $14, %eax, %xmm1 -; X32-SSE-NEXT: pextrb $15, %xmm0, %eax -; X32-SSE-NEXT: bsrl %eax, %eax -; X32-SSE-NEXT: xorl $7, %eax -; X32-SSE-NEXT: pinsrb $15, %eax, %xmm1 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-SSE-NEXT: movdqa %xmm0, %xmm3 +; X32-SSE-NEXT: 
pand %xmm2, %xmm3 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-SSE-NEXT: movdqa %xmm1, %xmm4 +; X32-SSE-NEXT: pshufb %xmm3, %xmm4 +; X32-SSE-NEXT: psrlw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: pxor %xmm2, %xmm2 +; X32-SSE-NEXT: pcmpeqb %xmm0, %xmm2 +; X32-SSE-NEXT: pand %xmm4, %xmm2 +; X32-SSE-NEXT: pshufb %xmm0, %xmm1 +; X32-SSE-NEXT: paddb %xmm2, %xmm1 ; X32-SSE-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE-NEXT: retl %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 -1) Index: test/CodeGen/X86/vector-lzcnt-256.ll =================================================================== --- test/CodeGen/X86/vector-lzcnt-256.ll +++ test/CodeGen/X86/vector-lzcnt-256.ll @@ -35,30 +35,32 @@ ; ; AVX2-LABEL: testv4i64: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: bsrq %rax, %rax -; AVX2-NEXT: movl $127, %ecx -; AVX2-NEXT: cmoveq %rcx, %rax -; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: bsrq %rax, %rax -; AVX2-NEXT: cmoveq %rcx, %rax -; AVX2-NEXT: vmovq %rax, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: bsrq %rax, %rax -; AVX2-NEXT: cmoveq %rcx, %rax -; AVX2-NEXT: vmovq %rax, %xmm3 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: bsrq %rax, %rax -; AVX2-NEXT: cmoveq %rcx, %rax -; AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2 +; AVX2-NEXT: vpsrld $16, %ymm2, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VLCD-LABEL: testv4i64: @@ -101,25 +103,32 @@ ; ; AVX2-LABEL: testv4i64u: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: bsrq %rax, %rax -; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: bsrq %rax, %rax -; AVX2-NEXT: vmovq %rax, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: bsrq %rax, %rax -; AVX2-NEXT: vmovq %rax, %xmm3 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: bsrq %rax, %rax -; 
AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2 +; AVX2-NEXT: vpsrld $16, %ymm2, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VLCD-LABEL: testv4i64u: @@ -181,44 +190,27 @@ ; ; AVX2-LABEL: testv8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrd $1, %xmm1, %eax -; AVX2-NEXT: bsrl %eax, %ecx -; AVX2-NEXT: movl $63, %eax -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: vmovd %xmm1, %edx -; AVX2-NEXT: bsrl %edx, %edx -; AVX2-NEXT: cmovel %eax, %edx -; AVX2-NEXT: vmovd %edx, %xmm2 -; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrd $2, %xmm1, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrd $3, %xmm1, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrd $1, %xmm0, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: vmovd %xmm0, %edx -; AVX2-NEXT: bsrl %edx, %edx -; AVX2-NEXT: cmovel %eax, %edx -; AVX2-NEXT: vmovd %edx, %xmm3 -; AVX2-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3 -; AVX2-NEXT: vpextrd $2, %xmm0, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: vpinsrd $2, %ecx, %xmm3, %xmm3 -; AVX2-NEXT: vpextrd $3, %xmm0, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: vpinsrd $3, %ecx, %xmm3, %xmm0 -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX2-NEXT: vpsrlw $8, 
%ymm2, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VLCD-LABEL: testv8i32: @@ -271,35 +263,27 @@ ; ; AVX2-LABEL: testv8i32u: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrd $1, %xmm1, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: vmovd %xmm1, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm2 -; AVX2-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrd $2, %xmm1, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrd $3, %xmm1, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrd $1, %xmm0, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: vmovd %xmm0, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm3 -; AVX2-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; AVX2-NEXT: vpextrd $2, %xmm0, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 -; AVX2-NEXT: vpextrd $3, %xmm0, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0 -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VLCD-LABEL: testv8i32u: @@ -320,149 +304,56 @@ ; AVX1-LABEL: testv16i16: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpextrw $1, %xmm1, %eax -; AVX1-NEXT: bsrw %ax, %cx -; AVX1-NEXT: movw $31, %ax -; AVX1-NEXT: cmovew %ax, %cx -; AVX1-NEXT: vmovd %xmm1, %edx -; AVX1-NEXT: bsrw %dx, %dx -; AVX1-NEXT: cmovew %ax, %dx -; AVX1-NEXT: vmovd %edx, %xmm2 -; AVX1-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $2, %xmm1, %ecx -; AVX1-NEXT: bsrw %cx, %cx -; AVX1-NEXT: cmovew %ax, %cx -; AVX1-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $3, %xmm1, %ecx -; AVX1-NEXT: bsrw %cx, %cx -; AVX1-NEXT: cmovew %ax, %cx -; AVX1-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $4, %xmm1, %ecx -; AVX1-NEXT: bsrw %cx, %cx -; AVX1-NEXT: cmovew %ax, %cx -; AVX1-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $5, %xmm1, %ecx -; AVX1-NEXT: bsrw %cx, %cx -; AVX1-NEXT: cmovew %ax, %cx -; AVX1-NEXT: vpinsrw $5, %ecx, %xmm2, 
%xmm2 -; AVX1-NEXT: vpextrw $6, %xmm1, %ecx -; AVX1-NEXT: bsrw %cx, %cx -; AVX1-NEXT: cmovew %ax, %cx -; AVX1-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $7, %xmm1, %ecx -; AVX1-NEXT: bsrw %cx, %cx -; AVX1-NEXT: cmovew %ax, %cx -; AVX1-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrw $1, %xmm0, %ecx -; AVX1-NEXT: bsrw %cx, %cx -; AVX1-NEXT: cmovew %ax, %cx -; AVX1-NEXT: vmovd %xmm0, %edx -; AVX1-NEXT: bsrw %dx, %dx -; AVX1-NEXT: cmovew %ax, %dx -; AVX1-NEXT: vmovd %edx, %xmm3 -; AVX1-NEXT: vpinsrw $1, %ecx, %xmm3, %xmm3 -; AVX1-NEXT: vpextrw $2, %xmm0, %ecx -; AVX1-NEXT: bsrw %cx, %cx -; AVX1-NEXT: cmovew %ax, %cx -; AVX1-NEXT: vpinsrw $2, %ecx, %xmm3, %xmm3 -; AVX1-NEXT: vpextrw $3, %xmm0, %ecx -; AVX1-NEXT: bsrw %cx, %cx -; AVX1-NEXT: cmovew %ax, %cx -; AVX1-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 -; AVX1-NEXT: vpextrw $4, %xmm0, %ecx -; AVX1-NEXT: bsrw %cx, %cx -; AVX1-NEXT: cmovew %ax, %cx -; AVX1-NEXT: vpinsrw $4, %ecx, %xmm3, %xmm3 -; AVX1-NEXT: vpextrw $5, %xmm0, %ecx -; AVX1-NEXT: bsrw %cx, %cx -; AVX1-NEXT: cmovew %ax, %cx -; AVX1-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 -; AVX1-NEXT: vpextrw $6, %xmm0, %ecx -; AVX1-NEXT: bsrw %cx, %cx -; AVX1-NEXT: cmovew %ax, %cx -; AVX1-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; AVX1-NEXT: vpextrw $7, %xmm0, %ecx -; AVX1-NEXT: bsrw %cx, %cx -; AVX1-NEXT: cmovew %ax, %cx -; AVX1-NEXT: vpinsrw $7, %ecx, %xmm3, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5 +; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm5, %xmm7 +; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm5 +; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: testv16i16: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrw $1, %xmm1, %eax -; AVX2-NEXT: bsrw %ax, %cx -; AVX2-NEXT: movw $31, %ax -; AVX2-NEXT: cmovew %ax, %cx -; AVX2-NEXT: vmovd %xmm1, %edx -; AVX2-NEXT: bsrw %dx, %dx -; AVX2-NEXT: cmovew %ax, %dx -; AVX2-NEXT: vmovd %edx, %xmm2 -; AVX2-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $2, %xmm1, %ecx -; AVX2-NEXT: bsrw %cx, %cx -; AVX2-NEXT: cmovew %ax, %cx -; AVX2-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $3, %xmm1, %ecx -; AVX2-NEXT: bsrw %cx, %cx -; AVX2-NEXT: cmovew %ax, %cx -; AVX2-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $4, 
%xmm1, %ecx -; AVX2-NEXT: bsrw %cx, %cx -; AVX2-NEXT: cmovew %ax, %cx -; AVX2-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $5, %xmm1, %ecx -; AVX2-NEXT: bsrw %cx, %cx -; AVX2-NEXT: cmovew %ax, %cx -; AVX2-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $6, %xmm1, %ecx -; AVX2-NEXT: bsrw %cx, %cx -; AVX2-NEXT: cmovew %ax, %cx -; AVX2-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $7, %xmm1, %ecx -; AVX2-NEXT: bsrw %cx, %cx -; AVX2-NEXT: cmovew %ax, %cx -; AVX2-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrw $1, %xmm0, %ecx -; AVX2-NEXT: bsrw %cx, %cx -; AVX2-NEXT: cmovew %ax, %cx -; AVX2-NEXT: vmovd %xmm0, %edx -; AVX2-NEXT: bsrw %dx, %dx -; AVX2-NEXT: cmovew %ax, %dx -; AVX2-NEXT: vmovd %edx, %xmm3 -; AVX2-NEXT: vpinsrw $1, %ecx, %xmm3, %xmm3 -; AVX2-NEXT: vpextrw $2, %xmm0, %ecx -; AVX2-NEXT: bsrw %cx, %cx -; AVX2-NEXT: cmovew %ax, %cx -; AVX2-NEXT: vpinsrw $2, %ecx, %xmm3, %xmm3 -; AVX2-NEXT: vpextrw $3, %xmm0, %ecx -; AVX2-NEXT: bsrw %cx, %cx -; AVX2-NEXT: cmovew %ax, %cx -; AVX2-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 -; AVX2-NEXT: vpextrw $4, %xmm0, %ecx -; AVX2-NEXT: bsrw %cx, %cx -; AVX2-NEXT: cmovew %ax, %cx -; AVX2-NEXT: vpinsrw $4, %ecx, %xmm3, %xmm3 -; AVX2-NEXT: vpextrw $5, %xmm0, %ecx -; AVX2-NEXT: bsrw %cx, %cx -; AVX2-NEXT: cmovew %ax, %cx -; AVX2-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 -; AVX2-NEXT: vpextrw $6, %xmm0, %ecx -; AVX2-NEXT: bsrw %cx, %cx -; AVX2-NEXT: cmovew %ax, %cx -; AVX2-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; AVX2-NEXT: vpextrw $7, %xmm0, %ecx -; AVX2-NEXT: bsrw %cx, %cx -; AVX2-NEXT: cmovew %ax, %cx -; AVX2-NEXT: vpinsrw $7, %ecx, %xmm3, %xmm0 -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX2-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: testv16i16: @@ -480,115 +371,56 @@ ; AVX1-LABEL: testv16i16u: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpextrw $1, %xmm1, %eax -; AVX1-NEXT: bsrw %ax, %ax -; AVX1-NEXT: vmovd %xmm1, %ecx -; AVX1-NEXT: bsrw %cx, %cx -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $2, %xmm1, %eax -; AVX1-NEXT: bsrw %ax, %ax -; AVX1-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $3, %xmm1, %eax -; AVX1-NEXT: bsrw %ax, %ax -; AVX1-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $4, %xmm1, %eax -; AVX1-NEXT: bsrw %ax, %ax -; AVX1-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $5, %xmm1, %eax -; AVX1-NEXT: bsrw %ax, %ax -; AVX1-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $6, %xmm1, %eax -; AVX1-NEXT: bsrw %ax, %ax -; AVX1-NEXT: vpinsrw $6, %eax, 
%xmm2, %xmm2 -; AVX1-NEXT: vpextrw $7, %xmm1, %eax -; AVX1-NEXT: bsrw %ax, %ax -; AVX1-NEXT: vpinsrw $7, %eax, %xmm2, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrw $1, %xmm0, %eax -; AVX1-NEXT: bsrw %ax, %ax -; AVX1-NEXT: vmovd %xmm0, %ecx -; AVX1-NEXT: bsrw %cx, %cx -; AVX1-NEXT: vmovd %ecx, %xmm3 -; AVX1-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 -; AVX1-NEXT: vpextrw $2, %xmm0, %eax -; AVX1-NEXT: bsrw %ax, %ax -; AVX1-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; AVX1-NEXT: vpextrw $3, %xmm0, %eax -; AVX1-NEXT: bsrw %ax, %ax -; AVX1-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 -; AVX1-NEXT: vpextrw $4, %xmm0, %eax -; AVX1-NEXT: bsrw %ax, %ax -; AVX1-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; AVX1-NEXT: vpextrw $5, %xmm0, %eax -; AVX1-NEXT: bsrw %ax, %ax -; AVX1-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 -; AVX1-NEXT: vpextrw $6, %xmm0, %eax -; AVX1-NEXT: bsrw %ax, %ax -; AVX1-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 -; AVX1-NEXT: vpextrw $7, %xmm0, %eax -; AVX1-NEXT: bsrw %ax, %ax -; AVX1-NEXT: vpinsrw $7, %eax, %xmm3, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5 +; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm5, %xmm7 +; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm5 +; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: testv16i16u: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrw $1, %xmm1, %eax -; AVX2-NEXT: bsrw %ax, %ax -; AVX2-NEXT: vmovd %xmm1, %ecx -; AVX2-NEXT: bsrw %cx, %cx -; AVX2-NEXT: vmovd %ecx, %xmm2 -; AVX2-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $2, %xmm1, %eax -; AVX2-NEXT: bsrw %ax, %ax -; AVX2-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $3, %xmm1, %eax -; AVX2-NEXT: bsrw %ax, %ax -; AVX2-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $4, %xmm1, %eax -; AVX2-NEXT: bsrw %ax, %ax -; AVX2-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $5, %xmm1, %eax -; AVX2-NEXT: bsrw %ax, %ax -; AVX2-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $6, %xmm1, %eax -; AVX2-NEXT: bsrw %ax, %ax -; AVX2-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $7, %xmm1, %eax -; AVX2-NEXT: bsrw %ax, %ax -; AVX2-NEXT: vpinsrw $7, %eax, %xmm2, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrw $1, %xmm0, 
%eax -; AVX2-NEXT: bsrw %ax, %ax -; AVX2-NEXT: vmovd %xmm0, %ecx -; AVX2-NEXT: bsrw %cx, %cx -; AVX2-NEXT: vmovd %ecx, %xmm3 -; AVX2-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 -; AVX2-NEXT: vpextrw $2, %xmm0, %eax -; AVX2-NEXT: bsrw %ax, %ax -; AVX2-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; AVX2-NEXT: vpextrw $3, %xmm0, %eax -; AVX2-NEXT: bsrw %ax, %ax -; AVX2-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 -; AVX2-NEXT: vpextrw $4, %xmm0, %eax -; AVX2-NEXT: bsrw %ax, %ax -; AVX2-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; AVX2-NEXT: vpextrw $5, %xmm0, %eax -; AVX2-NEXT: bsrw %ax, %ax -; AVX2-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 -; AVX2-NEXT: vpextrw $6, %xmm0, %eax -; AVX2-NEXT: bsrw %ax, %ax -; AVX2-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 -; AVX2-NEXT: vpextrw $7, %xmm0, %eax -; AVX2-NEXT: bsrw %ax, %ax -; AVX2-NEXT: vpinsrw $7, %eax, %xmm3, %xmm0 -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX2-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: testv16i16u: @@ -606,335 +438,41 @@ ; AVX1-LABEL: testv32i8: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpextrb $1, %xmm1, %eax -; AVX1-NEXT: bsrl %eax, %ecx -; AVX1-NEXT: movl $15, %eax -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpextrb $0, %xmm1, %edx -; AVX1-NEXT: bsrl %edx, %edx -; AVX1-NEXT: cmovel %eax, %edx -; AVX1-NEXT: xorl $7, %edx -; AVX1-NEXT: vmovd %edx, %xmm2 -; AVX1-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $2, %xmm1, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $3, %xmm1, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $4, %xmm1, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $5, %xmm1, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $6, %xmm1, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $7, %xmm1, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $8, %xmm1, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $9, %xmm1, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel 
%eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $10, %xmm1, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $11, %xmm1, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $12, %xmm1, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $13, %xmm1, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $14, %xmm1, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $15, %xmm1, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm1 -; AVX1-NEXT: vpextrb $1, %xmm0, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpextrb $0, %xmm0, %edx -; AVX1-NEXT: bsrl %edx, %edx -; AVX1-NEXT: cmovel %eax, %edx -; AVX1-NEXT: xorl $7, %edx -; AVX1-NEXT: vmovd %edx, %xmm2 -; AVX1-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $2, %xmm0, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $3, %xmm0, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $4, %xmm0, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $5, %xmm0, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $6, %xmm0, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $7, %xmm0, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $8, %xmm0, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $9, %xmm0, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $10, %xmm0, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $11, %xmm0, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $12, %xmm0, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $13, %xmm0, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 -; 
AVX1-NEXT: vpextrb $14, %xmm0, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $15, %xmm0, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: cmovel %eax, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm0 +; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm6 +; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: testv32i8: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrb $1, %xmm1, %eax -; AVX2-NEXT: bsrl %eax, %ecx -; AVX2-NEXT: movl $15, %eax -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpextrb $0, %xmm1, %edx -; AVX2-NEXT: bsrl %edx, %edx -; AVX2-NEXT: cmovel %eax, %edx -; AVX2-NEXT: xorl $7, %edx -; AVX2-NEXT: vmovd %edx, %xmm2 -; AVX2-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $2, %xmm1, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $3, %xmm1, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $4, %xmm1, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $5, %xmm1, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $6, %xmm1, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $7, %xmm1, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $8, %xmm1, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $9, %xmm1, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $10, %xmm1, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $11, %xmm1, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $12, %xmm1, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; 
AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $13, %xmm1, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $14, %xmm1, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $15, %xmm1, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm1 -; AVX2-NEXT: vpextrb $1, %xmm0, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpextrb $0, %xmm0, %edx -; AVX2-NEXT: bsrl %edx, %edx -; AVX2-NEXT: cmovel %eax, %edx -; AVX2-NEXT: xorl $7, %edx -; AVX2-NEXT: vmovd %edx, %xmm2 -; AVX2-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $2, %xmm0, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $3, %xmm0, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $4, %xmm0, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $5, %xmm0, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $6, %xmm0, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $7, %xmm0, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $8, %xmm0, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $9, %xmm0, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $10, %xmm0, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $11, %xmm0, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $12, %xmm0, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $13, %xmm0, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $14, %xmm0, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $15, %xmm0, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: cmovel %eax, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VLCD-LABEL: testv32i8: @@ -974,269 +512,41 @@ ; AVX1-LABEL: testv32i8u: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpextrb $1, %xmm1, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpextrb $0, %xmm1, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $2, %xmm1, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $3, %xmm1, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $4, %xmm1, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $5, %xmm1, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $6, %xmm1, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $7, %xmm1, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $8, %xmm1, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $9, %xmm1, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $10, %xmm1, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $11, %xmm1, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $13, %xmm1, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $14, %xmm1, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $15, %xmm1, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1 -; AVX1-NEXT: vpextrb $1, %xmm0, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpextrb $0, %xmm0, %ecx -; AVX1-NEXT: bsrl %ecx, %ecx -; AVX1-NEXT: xorl $7, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $2, %xmm0, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $3, %xmm0, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $4, %xmm0, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $4, 
%eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $5, %xmm0, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $6, %xmm0, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $7, %xmm0, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $8, %xmm0, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $9, %xmm0, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $10, %xmm0, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $11, %xmm0, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $12, %xmm0, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $13, %xmm0, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $14, %xmm0, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $15, %xmm0, %eax -; AVX1-NEXT: bsrl %eax, %eax -; AVX1-NEXT: xorl $7, %eax -; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm6 +; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: testv32i8u: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrb $1, %xmm1, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpextrb $0, %xmm1, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm2 -; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $2, %xmm1, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $3, %xmm1, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $4, %xmm1, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $5, %xmm1, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $6, %xmm1, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $6, %eax, 
%xmm2, %xmm2 -; AVX2-NEXT: vpextrb $7, %xmm1, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $9, %xmm1, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $10, %xmm1, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $11, %xmm1, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $13, %xmm1, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $14, %xmm1, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $15, %xmm1, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1 -; AVX2-NEXT: vpextrb $1, %xmm0, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpextrb $0, %xmm0, %ecx -; AVX2-NEXT: bsrl %ecx, %ecx -; AVX2-NEXT: xorl $7, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm2 -; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $2, %xmm0, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $3, %xmm0, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $4, %xmm0, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $5, %xmm0, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $6, %xmm0, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $7, %xmm0, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $8, %xmm0, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $9, %xmm0, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $10, %xmm0, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $11, %xmm0, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $12, %xmm0, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $13, %xmm0, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $14, %xmm0, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $15, %xmm0, %eax -; AVX2-NEXT: bsrl %eax, %eax -; AVX2-NEXT: xorl $7, %eax -; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 -; 
AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VLCD-LABEL: testv32i8u: