Index: lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -106,7 +106,8 @@ SDValue ExpandStore(SDValue Op); SDValue ExpandFNEG(SDValue Op); SDValue ExpandBITREVERSE(SDValue Op); - SDValue ExpandCTLZ_CTTZ_ZERO_UNDEF(SDValue Op); + SDValue ExpandCTLZ(SDValue Op); + SDValue ExpandCTTZ_ZERO_UNDEF(SDValue Op); /// \brief Implements vector promotion. /// @@ -693,9 +694,11 @@ return UnrollVSETCC(Op); case ISD::BITREVERSE: return ExpandBITREVERSE(Op); + case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: + return ExpandCTLZ(Op); case ISD::CTTZ_ZERO_UNDEF: - return ExpandCTLZ_CTTZ_ZERO_UNDEF(Op); + return ExpandCTTZ_ZERO_UNDEF(Op); default: return DAG.UnrollVectorOp(Op.getNode()); } @@ -1022,7 +1025,48 @@ return DAG.UnrollVectorOp(Op.getNode()); } -SDValue VectorLegalizer::ExpandCTLZ_CTTZ_ZERO_UNDEF(SDValue Op) { +SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) { + EVT VT = Op.getValueType(); + + // If the non-ZERO_UNDEF version is supported we can use that instead. + if (Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF && + TLI.isOperationLegalOrCustom(ISD::CTLZ, VT)) { + SDLoc DL(Op); + return DAG.getNode(ISD::CTLZ, DL, Op.getValueType(), Op.getOperand(0)); + } + + // If CTPOP is available we can lower with a CTPOP based method: + // u16 ctlz(u16 x) { + // x |= (x >> 1); + // x |= (x >> 2); + // x |= (x >> 4); + // x |= (x >> 8); + // return ctpop(~x); + // } + // Ref: "Hacker's Delight" by Henry Warren + if (TLI.isOperationLegalOrCustom(ISD::CTPOP, VT) && + TLI.isOperationLegalOrCustom(ISD::SRL, VT) && + TLI.isOperationLegalOrCustomOrPromote(ISD::OR, VT) && + TLI.isOperationLegalOrCustomOrPromote(ISD::XOR, VT)) { + SDLoc DL(Op); + SDValue Res = Op.getOperand(0); + unsigned NumBitsPerElt = VT.getScalarSizeInBits(); + EVT ShiftTy = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); + + for (unsigned i = 1; i != NumBitsPerElt; i *= 2) + Res = DAG.getNode( + ISD::OR, DL, VT, Res, + DAG.getNode(ISD::SRL, DL, VT, Res, DAG.getConstant(i, DL, ShiftTy))); + + Res = DAG.getNOT(DL, Res, VT); + return DAG.getNode(ISD::CTPOP, DL, VT, Res); + } + + // Otherwise go ahead and unroll. + return DAG.UnrollVectorOp(Op.getNode()); +} + +SDValue VectorLegalizer::ExpandCTTZ_ZERO_UNDEF(SDValue Op) { // If the non-ZERO_UNDEF version is supported we can use that instead. unsigned Opc = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ? ISD::CTLZ : ISD::CTTZ; if (TLI.isOperationLegalOrCustom(Opc, Op.getValueType())) { Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -1145,7 +1145,10 @@ { ISD::BSWAP, MVT::v2i64, 7 }, { ISD::BSWAP, MVT::v4i32, 7 }, { ISD::BSWAP, MVT::v8i16, 7 }, - /* ISD::CTLZ - currently scalarized pre-SSSE3 */ + { ISD::CTLZ, MVT::v2i64, 25 }, + { ISD::CTLZ, MVT::v4i32, 26 }, + { ISD::CTLZ, MVT::v8i16, 20 }, + { ISD::CTLZ, MVT::v16i8, 17 }, { ISD::CTPOP, MVT::v2i64, 12 }, { ISD::CTPOP, MVT::v4i32, 15 }, { ISD::CTPOP, MVT::v8i16, 13 }, Index: test/Analysis/CostModel/X86/ctbits-cost.ll =================================================================== --- test/Analysis/CostModel/X86/ctbits-cost.ll +++ test/Analysis/CostModel/X86/ctbits-cost.ll @@ -209,7 +209,7 @@ define <2 x i64> @var_ctlz_v2i64(<2 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v2i64': -; SSE2: Found an estimated cost of 6 for instruction: %ctlz +; SSE2: Found an estimated cost of 25 for instruction: %ctlz ; SSE42: Found an estimated cost of 23 for instruction: %ctlz ; AVX: Found an estimated cost of 23 for instruction: %ctlz %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 0) @@ -218,7 +218,7 @@ define <2 x i64> @var_ctlz_v2i64u(<2 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v2i64u': -; SSE2: Found an estimated cost of 6 for instruction: %ctlz +; SSE2: Found an estimated cost of 25 for instruction: %ctlz ; SSE42: Found an estimated cost of 23 for instruction: %ctlz ; AVX: Found an estimated cost of 23 for instruction: %ctlz %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 1) @@ -227,7 +227,7 @@ define <4 x i64> @var_ctlz_v4i64(<4 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v4i64': -; SSE2: Found an estimated cost of 12 for instruction: %ctlz +; SSE2: Found an estimated cost of 50 for instruction: %ctlz ; SSE42: Found an estimated cost of 46 for instruction: %ctlz ; AVX1: Found an estimated cost of 46 for instruction: %ctlz ; AVX2: Found an estimated cost of 23 for instruction: %ctlz @@ -237,7 +237,7 @@ define <4 x i64> @var_ctlz_v4i64u(<4 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v4i64u': -; SSE2: Found an estimated cost of 12 for instruction: %ctlz +; SSE2: Found an estimated cost of 50 for instruction: %ctlz ; SSE42: Found an estimated cost of 46 for instruction: %ctlz ; AVX1: Found an estimated cost of 46 for instruction: %ctlz ; AVX2: Found an estimated cost of 23 for instruction: %ctlz @@ -247,7 +247,7 @@ define <4 x i32> @var_ctlz_v4i32(<4 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v4i32': -; SSE2: Found an estimated cost of 12 for instruction: %ctlz +; SSE2: Found an estimated cost of 26 for instruction: %ctlz ; SSE42: Found an estimated cost of 18 for instruction: %ctlz ; AVX: Found an estimated cost of 18 for instruction: %ctlz %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 0) @@ -256,7 +256,7 @@ define <4 x i32> @var_ctlz_v4i32u(<4 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v4i32u': -; SSE2: Found an estimated cost of 12 for instruction: %ctlz +; SSE2: Found an estimated cost of 26 for instruction: %ctlz ; SSE42: Found an estimated cost of 18 for instruction: %ctlz ; AVX: Found an estimated cost of 18 for instruction: %ctlz %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 1) @@ -265,7 +265,7 @@ define <8 x i32> @var_ctlz_v8i32(<8 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v8i32': -; SSE2: Found an estimated cost of 24 for instruction: %ctlz +; SSE2: Found an estimated cost of 52 for instruction: %ctlz ; SSE42: Found an estimated cost of 36 for instruction: %ctlz ; AVX1: Found an estimated cost of 36 for instruction: %ctlz ; AVX2: Found an estimated cost of 18 for instruction: %ctlz @@ -275,7 +275,7 @@ define <8 x i32> @var_ctlz_v8i32u(<8 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v8i32u': -; SSE2: Found an estimated cost of 24 for instruction: %ctlz +; SSE2: Found an estimated cost of 52 for instruction: %ctlz ; SSE42: Found an estimated cost of 36 for instruction: %ctlz ; AVX1: Found an estimated cost of 36 for instruction: %ctlz ; AVX2: Found an estimated cost of 18 for instruction: %ctlz @@ -285,7 +285,7 @@ define <8 x i16> @var_ctlz_v8i16(<8 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v8i16': -; SSE2: Found an estimated cost of 24 for instruction: %ctlz +; SSE2: Found an estimated cost of 20 for instruction: %ctlz ; SSE42: Found an estimated cost of 14 for instruction: %ctlz ; AVX: Found an estimated cost of 14 for instruction: %ctlz %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 0) @@ -294,7 +294,7 @@ define <8 x i16> @var_ctlz_v8i16u(<8 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v8i16u': -; SSE2: Found an estimated cost of 24 for instruction: %ctlz +; SSE2: Found an estimated cost of 20 for instruction: %ctlz ; SSE42: Found an estimated cost of 14 for instruction: %ctlz ; AVX: Found an estimated cost of 14 for instruction: %ctlz %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 1) @@ -303,7 +303,7 @@ define <16 x i16> @var_ctlz_v16i16(<16 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v16i16': -; SSE2: Found an estimated cost of 48 for instruction: %ctlz +; SSE2: Found an estimated cost of 40 for instruction: %ctlz ; SSE42: Found an estimated cost of 28 for instruction: %ctlz ; AVX1: Found an estimated cost of 28 for instruction: %ctlz ; AVX2: Found an estimated cost of 14 for instruction: %ctlz @@ -313,7 +313,7 @@ define <16 x i16> @var_ctlz_v16i16u(<16 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v16i16u': -; SSE2: Found an estimated cost of 48 for instruction: %ctlz +; SSE2: Found an estimated cost of 40 for instruction: %ctlz ; SSE42: Found an estimated cost of 28 for instruction: %ctlz ; AVX1: Found an estimated cost of 28 for instruction: %ctlz ; AVX2: Found an estimated cost of 14 for instruction: %ctlz @@ -323,7 +323,7 @@ define <16 x i8> @var_ctlz_v16i8(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v16i8': -; SSE2: Found an estimated cost of 48 for instruction: %ctlz +; SSE2: Found an estimated cost of 17 for instruction: %ctlz ; SSE42: Found an estimated cost of 9 for instruction: %ctlz ; AVX: Found an estimated cost of 9 for instruction: %ctlz %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 0) @@ -332,7 +332,7 @@ define <16 x i8> @var_ctlz_v16i8u(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v16i8u': -; SSE2: Found an estimated cost of 48 for instruction: %ctlz +; SSE2: Found an estimated cost of 17 for instruction: %ctlz ; SSE42: Found an estimated cost of 9 for instruction: %ctlz ; AVX: Found an estimated cost of 9 for instruction: %ctlz %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 1) @@ -341,7 +341,7 @@ define <32 x i8> @var_ctlz_v32i8(<32 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v32i8': -; SSE2: Found an estimated cost of 96 for instruction: %ctlz +; SSE2: Found an estimated cost of 34 for instruction: %ctlz ; SSE42: Found an estimated cost of 18 for instruction: %ctlz ; AVX1: Found an estimated cost of 18 for instruction: %ctlz ; AVX2: Found an estimated cost of 9 for instruction: %ctlz @@ -351,7 +351,7 @@ define <32 x i8> @var_ctlz_v32i8u(<32 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v32i8u': -; SSE2: Found an estimated cost of 96 for instruction: %ctlz +; SSE2: Found an estimated cost of 34 for instruction: %ctlz ; SSE42: Found an estimated cost of 18 for instruction: %ctlz ; AVX1: Found an estimated cost of 18 for instruction: %ctlz ; AVX2: Found an estimated cost of 9 for instruction: %ctlz Index: test/CodeGen/X86/vec_ctbits.ll =================================================================== --- test/CodeGen/X86/vec_ctbits.ll +++ test/CodeGen/X86/vec_ctbits.ll @@ -36,17 +36,42 @@ define <2 x i64> @foolz(<2 x i64> %a) nounwind { ; CHECK-LABEL: foolz: ; CHECK: # BB#0: -; CHECK-NEXT: movd %xmm0, %rax -; CHECK-NEXT: bsrq %rax, %rax -; CHECK-NEXT: xorq $63, %rax -; CHECK-NEXT: movd %rax, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; CHECK-NEXT: movd %xmm0, %rax -; CHECK-NEXT: bsrq %rax, %rax -; CHECK-NEXT: xorq $63, %rax -; CHECK-NEXT: movd %rax, %xmm0 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrlq $1, %xmm1 +; CHECK-NEXT: por %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: psrlq $2, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrlq $4, %xmm1 +; CHECK-NEXT: por %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: psrlq $8, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrlq $16, %xmm1 +; CHECK-NEXT: por %xmm0, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: psrlq $32, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 +; CHECK-NEXT: pxor %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: psrlq $1, %xmm0 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 +; CHECK-NEXT: psubq %xmm0, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: pand %xmm0, %xmm2 +; CHECK-NEXT: psrlq $2, %xmm1 +; CHECK-NEXT: pand %xmm0, %xmm1 +; CHECK-NEXT: paddq %xmm2, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: psrlq $4, %xmm2 +; CHECK-NEXT: paddq %xmm1, %xmm2 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm2 +; CHECK-NEXT: pxor %xmm0, %xmm0 +; CHECK-NEXT: psadbw %xmm2, %xmm0 ; CHECK-NEXT: retq %c = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 true) ret <2 x i64> %c @@ -115,21 +140,43 @@ ; CHECK-LABEL: promlz: ; CHECK: # BB#0: ; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-NEXT: movd %xmm0, %rax -; CHECK-NEXT: bsrq %rax, %rax -; CHECK-NEXT: movl $127, %ecx -; CHECK-NEXT: cmoveq %rcx, %rax -; CHECK-NEXT: xorq $63, %rax -; CHECK-NEXT: movd %rax, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; CHECK-NEXT: movd %xmm0, %rax -; CHECK-NEXT: bsrq %rax, %rax -; CHECK-NEXT: cmoveq %rcx, %rax -; CHECK-NEXT: xorq $63, %rax -; CHECK-NEXT: movd %rax, %xmm0 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; CHECK-NEXT: psubq {{.*}}(%rip), %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: psrlq $1, %xmm2 +; CHECK-NEXT: por %xmm0, %xmm2 +; CHECK-NEXT: movdqa %xmm2, %xmm0 +; CHECK-NEXT: psrlq $2, %xmm0 +; CHECK-NEXT: por %xmm2, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: psrlq $4, %xmm2 +; CHECK-NEXT: por %xmm0, %xmm2 +; CHECK-NEXT: movdqa %xmm2, %xmm0 +; CHECK-NEXT: psrlq $8, %xmm0 +; CHECK-NEXT: por %xmm2, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: psrlq $16, %xmm2 +; CHECK-NEXT: por %xmm0, %xmm2 +; CHECK-NEXT: movdqa %xmm2, %xmm0 +; CHECK-NEXT: psrlq $32, %xmm0 +; CHECK-NEXT: por %xmm2, %xmm0 +; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 +; CHECK-NEXT: pxor %xmm0, %xmm2 +; CHECK-NEXT: movdqa %xmm2, %xmm0 +; CHECK-NEXT: psrlq $1, %xmm0 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 +; CHECK-NEXT: psubq %xmm0, %xmm2 +; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] +; CHECK-NEXT: movdqa %xmm2, %xmm3 +; CHECK-NEXT: pand %xmm0, %xmm3 +; CHECK-NEXT: psrlq $2, %xmm2 +; CHECK-NEXT: pand %xmm0, %xmm2 +; CHECK-NEXT: paddq %xmm3, %xmm2 +; CHECK-NEXT: movdqa %xmm2, %xmm0 +; CHECK-NEXT: psrlq $4, %xmm0 +; CHECK-NEXT: paddq %xmm2, %xmm0 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 +; CHECK-NEXT: psadbw %xmm1, %xmm0 +; CHECK-NEXT: psubq {{.*}}(%rip), %xmm0 ; CHECK-NEXT: retq %c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) ret <2 x i32> %c Index: test/CodeGen/X86/vector-lzcnt-128.ll =================================================================== --- test/CodeGen/X86/vector-lzcnt-128.ll +++ test/CodeGen/X86/vector-lzcnt-128.ll @@ -14,38 +14,82 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE2-LABEL: testv2i64: ; SSE2: # BB#0: -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: bsrq %rax, %rax -; SSE2-NEXT: movl $127, %ecx -; SSE2-NEXT: cmoveq %rcx, %rax -; SSE2-NEXT: xorq $63, %rax -; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: bsrq %rax, %rax -; SSE2-NEXT: cmoveq %rcx, %rax -; SSE2-NEXT: xorq $63, %rax -; SSE2-NEXT: movd %rax, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlq $1, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlq $2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlq $4, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlq $8, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlq $16, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlq $32, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlq $1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: psubq %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: psrlq $2, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: paddq %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrlq $4, %xmm2 +; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psadbw %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv2i64: ; SSE3: # BB#0: -; SSE3-NEXT: movd %xmm0, %rax -; SSE3-NEXT: bsrq %rax, %rax -; SSE3-NEXT: movl $127, %ecx -; SSE3-NEXT: cmoveq %rcx, %rax -; SSE3-NEXT: xorq $63, %rax -; SSE3-NEXT: movd %rax, %xmm1 -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE3-NEXT: movd %xmm0, %rax -; SSE3-NEXT: bsrq %rax, %rax -; SSE3-NEXT: cmoveq %rcx, %rax -; SSE3-NEXT: xorq $63, %rax -; SSE3-NEXT: movd %rax, %xmm0 -; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlq $1, %xmm1 +; SSE3-NEXT: por %xmm0, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: psrlq $2, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlq $4, %xmm1 +; SSE3-NEXT: por %xmm0, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: psrlq $8, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlq $16, %xmm1 +; SSE3-NEXT: por %xmm0, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: psrlq $32, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE3-NEXT: pxor %xmm0, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: psrlq $1, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: psubq %xmm0, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] +; SSE3-NEXT: movdqa %xmm1, %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 +; SSE3-NEXT: psrlq $2, %xmm1 +; SSE3-NEXT: pand %xmm0, %xmm1 +; SSE3-NEXT: paddq %xmm2, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm2 +; SSE3-NEXT: psrlq $4, %xmm2 +; SSE3-NEXT: paddq %xmm1, %xmm2 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE3-NEXT: pxor %xmm0, %xmm0 +; SSE3-NEXT: psadbw %xmm2, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv2i64: @@ -205,32 +249,82 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; SSE2-LABEL: testv2i64u: ; SSE2: # BB#0: -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: bsrq %rax, %rax -; SSE2-NEXT: xorq $63, %rax -; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: bsrq %rax, %rax -; SSE2-NEXT: xorq $63, %rax -; SSE2-NEXT: movd %rax, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlq $1, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlq $2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlq $4, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlq $8, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlq $16, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlq $32, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlq $1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: psubq %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: psrlq $2, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: paddq %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrlq $4, %xmm2 +; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psadbw %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv2i64u: ; SSE3: # BB#0: -; SSE3-NEXT: movd %xmm0, %rax -; SSE3-NEXT: bsrq %rax, %rax -; SSE3-NEXT: xorq $63, %rax -; SSE3-NEXT: movd %rax, %xmm1 -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE3-NEXT: movd %xmm0, %rax -; SSE3-NEXT: bsrq %rax, %rax -; SSE3-NEXT: xorq $63, %rax -; SSE3-NEXT: movd %rax, %xmm0 -; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlq $1, %xmm1 +; SSE3-NEXT: por %xmm0, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: psrlq $2, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlq $4, %xmm1 +; SSE3-NEXT: por %xmm0, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: psrlq $8, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlq $16, %xmm1 +; SSE3-NEXT: por %xmm0, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: psrlq $32, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE3-NEXT: pxor %xmm0, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: psrlq $1, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: psubq %xmm0, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] +; SSE3-NEXT: movdqa %xmm1, %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 +; SSE3-NEXT: psrlq $2, %xmm1 +; SSE3-NEXT: pand %xmm0, %xmm1 +; SSE3-NEXT: paddq %xmm2, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm2 +; SSE3-NEXT: psrlq $4, %xmm2 +; SSE3-NEXT: paddq %xmm1, %xmm2 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE3-NEXT: pxor %xmm0, %xmm0 +; SSE3-NEXT: psadbw %xmm2, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv2i64u: @@ -390,66 +484,86 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; SSE2-LABEL: testv4i32: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: bsrl %eax, %eax -; SSE2-NEXT: movl $63, %ecx -; SSE2-NEXT: cmovel %ecx, %eax -; SSE2-NEXT: xorl $31, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: bsrl %eax, %eax -; SSE2-NEXT: cmovel %ecx, %eax -; SSE2-NEXT: xorl $31, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: bsrl %eax, %eax -; SSE2-NEXT: cmovel %ecx, %eax -; SSE2-NEXT: xorl $31, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: bsrl %eax, %eax -; SSE2-NEXT: cmovel %ecx, %eax -; SSE2-NEXT: xorl $31, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $4, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $8, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: psubd %xmm0, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: psrld $2, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrld $4, %xmm0 +; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv4i32: ; SSE3: # BB#0: -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; SSE3-NEXT: movd %xmm1, %eax -; SSE3-NEXT: bsrl %eax, %eax -; SSE3-NEXT: movl $63, %ecx -; SSE3-NEXT: cmovel %ecx, %eax -; SSE3-NEXT: xorl $31, %eax -; SSE3-NEXT: movd %eax, %xmm1 -; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; SSE3-NEXT: movd %xmm2, %eax -; SSE3-NEXT: bsrl %eax, %eax -; SSE3-NEXT: cmovel %ecx, %eax -; SSE3-NEXT: xorl $31, %eax -; SSE3-NEXT: movd %eax, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE3-NEXT: movd %xmm0, %eax -; SSE3-NEXT: bsrl %eax, %eax -; SSE3-NEXT: cmovel %ecx, %eax -; SSE3-NEXT: xorl $31, %eax -; SSE3-NEXT: movd %eax, %xmm1 -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE3-NEXT: movd %xmm0, %eax -; SSE3-NEXT: bsrl %eax, %eax -; SSE3-NEXT: cmovel %ecx, %eax -; SSE3-NEXT: xorl $31, %eax -; SSE3-NEXT: movd %eax, %xmm0 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrld $1, %xmm1 +; SSE3-NEXT: por %xmm0, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: psrld $2, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrld $4, %xmm1 +; SSE3-NEXT: por %xmm0, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: psrld $8, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrld $16, %xmm1 +; SSE3-NEXT: por %xmm0, %xmm1 +; SSE3-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE3-NEXT: pxor %xmm1, %xmm2 +; SSE3-NEXT: movdqa %xmm2, %xmm0 +; SSE3-NEXT: psrld $1, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: psubd %xmm0, %xmm2 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459] +; SSE3-NEXT: movdqa %xmm2, %xmm1 +; SSE3-NEXT: pand %xmm0, %xmm1 +; SSE3-NEXT: psrld $2, %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 +; SSE3-NEXT: paddd %xmm1, %xmm2 +; SSE3-NEXT: movdqa %xmm2, %xmm0 +; SSE3-NEXT: psrld $4, %xmm0 +; SSE3-NEXT: paddd %xmm2, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv4i32: @@ -586,56 +700,86 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; SSE2-LABEL: testv4i32u: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: bsrl %eax, %eax -; SSE2-NEXT: xorl $31, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: bsrl %eax, %eax -; SSE2-NEXT: xorl $31, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: bsrl %eax, %eax -; SSE2-NEXT: xorl $31, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: bsrl %eax, %eax -; SSE2-NEXT: xorl $31, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $4, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $8, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: psubd %xmm0, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: psrld $2, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrld $4, %xmm0 +; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv4i32u: ; SSE3: # BB#0: -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; SSE3-NEXT: movd %xmm1, %eax -; SSE3-NEXT: bsrl %eax, %eax -; SSE3-NEXT: xorl $31, %eax -; SSE3-NEXT: movd %eax, %xmm1 -; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; SSE3-NEXT: movd %xmm2, %eax -; SSE3-NEXT: bsrl %eax, %eax -; SSE3-NEXT: xorl $31, %eax -; SSE3-NEXT: movd %eax, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE3-NEXT: movd %xmm0, %eax -; SSE3-NEXT: bsrl %eax, %eax -; SSE3-NEXT: xorl $31, %eax -; SSE3-NEXT: movd %eax, %xmm1 -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE3-NEXT: movd %xmm0, %eax -; SSE3-NEXT: bsrl %eax, %eax -; SSE3-NEXT: xorl $31, %eax -; SSE3-NEXT: movd %eax, %xmm0 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrld $1, %xmm1 +; SSE3-NEXT: por %xmm0, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: psrld $2, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrld $4, %xmm1 +; SSE3-NEXT: por %xmm0, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: psrld $8, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrld $16, %xmm1 +; SSE3-NEXT: por %xmm0, %xmm1 +; SSE3-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE3-NEXT: pxor %xmm1, %xmm2 +; SSE3-NEXT: movdqa %xmm2, %xmm0 +; SSE3-NEXT: psrld $1, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: psubd %xmm0, %xmm2 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459] +; SSE3-NEXT: movdqa %xmm2, %xmm1 +; SSE3-NEXT: pand %xmm0, %xmm1 +; SSE3-NEXT: psrld $2, %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 +; SSE3-NEXT: paddd %xmm1, %xmm2 +; SSE3-NEXT: movdqa %xmm2, %xmm0 +; SSE3-NEXT: psrld $4, %xmm0 +; SSE3-NEXT: paddd %xmm2, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv4i32u: @@ -772,106 +916,74 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; SSE2-LABEL: testv8i16: ; SSE2: # BB#0: -; SSE2-NEXT: pextrw $7, %xmm0, %eax -; SSE2-NEXT: bsrw %ax, %cx -; SSE2-NEXT: movw $31, %ax -; SSE2-NEXT: cmovew %ax, %cx -; SSE2-NEXT: xorl $15, %ecx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: pextrw $3, %xmm0, %ecx -; SSE2-NEXT: bsrw %cx, %cx -; SSE2-NEXT: cmovew %ax, %cx -; SSE2-NEXT: xorl $15, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: pextrw $5, %xmm0, %ecx -; SSE2-NEXT: bsrw %cx, %cx -; SSE2-NEXT: cmovew %ax, %cx -; SSE2-NEXT: xorl $15, %ecx -; SSE2-NEXT: movd %ecx, %xmm3 -; SSE2-NEXT: pextrw $1, %xmm0, %ecx -; SSE2-NEXT: bsrw %cx, %cx -; SSE2-NEXT: cmovew %ax, %cx -; SSE2-NEXT: xorl $15, %ecx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: pextrw $6, %xmm0, %ecx -; SSE2-NEXT: bsrw %cx, %cx -; SSE2-NEXT: cmovew %ax, %cx -; SSE2-NEXT: xorl $15, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: pextrw $2, %xmm0, %ecx -; SSE2-NEXT: bsrw %cx, %cx -; SSE2-NEXT: cmovew %ax, %cx -; SSE2-NEXT: xorl $15, %ecx -; SSE2-NEXT: movd %ecx, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: pextrw $4, %xmm0, %ecx -; SSE2-NEXT: bsrw %cx, %cx -; SSE2-NEXT: cmovew %ax, %cx -; SSE2-NEXT: xorl $15, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: movd %xmm0, %ecx -; SSE2-NEXT: bsrw %cx, %cx -; SSE2-NEXT: cmovew %ax, %cx -; SSE2-NEXT: xorl $15, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: psubw %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: paddw %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrlw $4, %xmm2 +; SSE2-NEXT: paddw %xmm1, %xmm2 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv8i16: ; SSE3: # BB#0: -; SSE3-NEXT: pextrw $7, %xmm0, %eax -; SSE3-NEXT: bsrw %ax, %cx -; SSE3-NEXT: movw $31, %ax -; SSE3-NEXT: cmovew %ax, %cx -; SSE3-NEXT: xorl $15, %ecx -; SSE3-NEXT: movd %ecx, %xmm1 -; SSE3-NEXT: pextrw $3, %xmm0, %ecx -; SSE3-NEXT: bsrw %cx, %cx -; SSE3-NEXT: cmovew %ax, %cx -; SSE3-NEXT: xorl $15, %ecx -; SSE3-NEXT: movd %ecx, %xmm2 -; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE3-NEXT: pextrw $5, %xmm0, %ecx -; SSE3-NEXT: bsrw %cx, %cx -; SSE3-NEXT: cmovew %ax, %cx -; SSE3-NEXT: xorl $15, %ecx -; SSE3-NEXT: movd %ecx, %xmm3 -; SSE3-NEXT: pextrw $1, %xmm0, %ecx -; SSE3-NEXT: bsrw %cx, %cx -; SSE3-NEXT: cmovew %ax, %cx -; SSE3-NEXT: xorl $15, %ecx -; SSE3-NEXT: movd %ecx, %xmm1 -; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE3-NEXT: pextrw $6, %xmm0, %ecx -; SSE3-NEXT: bsrw %cx, %cx -; SSE3-NEXT: cmovew %ax, %cx -; SSE3-NEXT: xorl $15, %ecx -; SSE3-NEXT: movd %ecx, %xmm2 -; SSE3-NEXT: pextrw $2, %xmm0, %ecx -; SSE3-NEXT: bsrw %cx, %cx -; SSE3-NEXT: cmovew %ax, %cx -; SSE3-NEXT: xorl $15, %ecx -; SSE3-NEXT: movd %ecx, %xmm3 -; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE3-NEXT: pextrw $4, %xmm0, %ecx -; SSE3-NEXT: bsrw %cx, %cx -; SSE3-NEXT: cmovew %ax, %cx -; SSE3-NEXT: xorl $15, %ecx -; SSE3-NEXT: movd %ecx, %xmm2 -; SSE3-NEXT: movd %xmm0, %ecx -; SSE3-NEXT: bsrw %cx, %cx -; SSE3-NEXT: cmovew %ax, %cx -; SSE3-NEXT: xorl $15, %ecx -; SSE3-NEXT: movd %ecx, %xmm0 -; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $1, %xmm1 +; SSE3-NEXT: por %xmm0, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: por %xmm0, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: psrlw $8, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE3-NEXT: pxor %xmm0, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: psrlw $1, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: psubw %xmm0, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107] +; SSE3-NEXT: movdqa %xmm1, %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 +; SSE3-NEXT: psrlw $2, %xmm1 +; SSE3-NEXT: pand %xmm0, %xmm1 +; SSE3-NEXT: paddw %xmm2, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm2 +; SSE3-NEXT: psrlw $4, %xmm2 +; SSE3-NEXT: paddw %xmm1, %xmm2 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE3-NEXT: movdqa %xmm2, %xmm0 +; SSE3-NEXT: psllw $8, %xmm0 +; SSE3-NEXT: paddb %xmm2, %xmm0 +; SSE3-NEXT: psrlw $8, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv8i16: @@ -988,88 +1100,74 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; SSE2-LABEL: testv8i16u: ; SSE2: # BB#0: -; SSE2-NEXT: pextrw $7, %xmm0, %eax -; SSE2-NEXT: bsrw %ax, %ax -; SSE2-NEXT: xorl $15, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: pextrw $3, %xmm0, %eax -; SSE2-NEXT: bsrw %ax, %ax -; SSE2-NEXT: xorl $15, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: bsrw %ax, %ax -; SSE2-NEXT: xorl $15, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: pextrw $1, %xmm0, %eax -; SSE2-NEXT: bsrw %ax, %ax -; SSE2-NEXT: xorl $15, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: bsrw %ax, %ax -; SSE2-NEXT: xorl $15, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: bsrw %ax, %ax -; SSE2-NEXT: xorl $15, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: bsrw %ax, %ax -; SSE2-NEXT: xorl $15, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: bsrw %ax, %ax -; SSE2-NEXT: xorl $15, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: psubw %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: paddw %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrlw $4, %xmm2 +; SSE2-NEXT: paddw %xmm1, %xmm2 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv8i16u: ; SSE3: # BB#0: -; SSE3-NEXT: pextrw $7, %xmm0, %eax -; SSE3-NEXT: bsrw %ax, %ax -; SSE3-NEXT: xorl $15, %eax -; SSE3-NEXT: movd %eax, %xmm1 -; SSE3-NEXT: pextrw $3, %xmm0, %eax -; SSE3-NEXT: bsrw %ax, %ax -; SSE3-NEXT: xorl $15, %eax -; SSE3-NEXT: movd %eax, %xmm2 -; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE3-NEXT: pextrw $5, %xmm0, %eax -; SSE3-NEXT: bsrw %ax, %ax -; SSE3-NEXT: xorl $15, %eax -; SSE3-NEXT: movd %eax, %xmm3 -; SSE3-NEXT: pextrw $1, %xmm0, %eax -; SSE3-NEXT: bsrw %ax, %ax -; SSE3-NEXT: xorl $15, %eax -; SSE3-NEXT: movd %eax, %xmm1 -; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE3-NEXT: pextrw $6, %xmm0, %eax -; SSE3-NEXT: bsrw %ax, %ax -; SSE3-NEXT: xorl $15, %eax -; SSE3-NEXT: movd %eax, %xmm2 -; SSE3-NEXT: pextrw $2, %xmm0, %eax -; SSE3-NEXT: bsrw %ax, %ax -; SSE3-NEXT: xorl $15, %eax -; SSE3-NEXT: movd %eax, %xmm3 -; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE3-NEXT: pextrw $4, %xmm0, %eax -; SSE3-NEXT: bsrw %ax, %ax -; SSE3-NEXT: xorl $15, %eax -; SSE3-NEXT: movd %eax, %xmm2 -; SSE3-NEXT: movd %xmm0, %eax -; SSE3-NEXT: bsrw %ax, %ax -; SSE3-NEXT: xorl $15, %eax -; SSE3-NEXT: movd %eax, %xmm0 -; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $1, %xmm1 +; SSE3-NEXT: por %xmm0, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: por %xmm0, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: psrlw $8, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE3-NEXT: pxor %xmm0, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: psrlw $1, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: psubw %xmm0, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107] +; SSE3-NEXT: movdqa %xmm1, %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 +; SSE3-NEXT: psrlw $2, %xmm1 +; SSE3-NEXT: pand %xmm0, %xmm1 +; SSE3-NEXT: paddw %xmm2, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm2 +; SSE3-NEXT: psrlw $4, %xmm2 +; SSE3-NEXT: paddw %xmm1, %xmm2 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE3-NEXT: movdqa %xmm2, %xmm0 +; SSE3-NEXT: psllw $8, %xmm0 +; SSE3-NEXT: paddb %xmm2, %xmm0 +; SSE3-NEXT: psrlw $8, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv8i16u: @@ -1186,212 +1284,68 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; SSE2-LABEL: testv16i8: ; SSE2: # BB#0: -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: bsrl %eax, %ecx -; SSE2-NEXT: movl $15, %eax -; SSE2-NEXT: cmovel %eax, %ecx -; SSE2-NEXT: xorl $7, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: bsrl %ecx, %ecx -; SSE2-NEXT: cmovel %eax, %ecx -; SSE2-NEXT: xorl $7, %ecx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: bsrl %edx, %ecx -; SSE2-NEXT: cmovel %eax, %ecx -; SSE2-NEXT: xorl $7, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: bsrl %ebp, %ebp -; SSE2-NEXT: cmovel %eax, %ebp -; SSE2-NEXT: xorl $7, %ebp -; SSE2-NEXT: movd %ebp, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: bsrl %edi, %edi -; SSE2-NEXT: cmovel %eax, %edi -; SSE2-NEXT: xorl $7, %edi -; SSE2-NEXT: movd %edi, %xmm1 -; SSE2-NEXT: bsrl %ecx, %ecx -; SSE2-NEXT: cmovel %eax, %ecx -; SSE2-NEXT: xorl $7, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: bsrl %esi, %ecx -; SSE2-NEXT: cmovel %eax, %ecx -; SSE2-NEXT: xorl $7, %ecx -; SSE2-NEXT: movd %ecx, %xmm3 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: bsrl %ecx, %ecx -; SSE2-NEXT: cmovel %eax, %ecx -; SSE2-NEXT: xorl $7, %ecx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: bsrl %ebx, %ecx -; SSE2-NEXT: cmovel %eax, %ecx -; SSE2-NEXT: xorl $7, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: bsrl %edx, %ecx -; SSE2-NEXT: cmovel %eax, %ecx -; SSE2-NEXT: xorl $7, %ecx -; SSE2-NEXT: movd %ecx, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: bsrl %r11d, %ecx -; SSE2-NEXT: cmovel %eax, %ecx -; SSE2-NEXT: xorl $7, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: bsrl %esi, %ecx -; SSE2-NEXT: cmovel %eax, %ecx -; SSE2-NEXT: xorl $7, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE2-NEXT: bsrl %r9d, %ecx -; SSE2-NEXT: cmovel %eax, %ecx -; SSE2-NEXT: xorl $7, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: bsrl %r10d, %ecx -; SSE2-NEXT: cmovel %eax, %ecx -; SSE2-NEXT: xorl $7, %ecx -; SSE2-NEXT: movd %ecx, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: bsrl %r8d, %ecx -; SSE2-NEXT: cmovel %eax, %ecx -; SSE2-NEXT: xorl $7, %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: bsrl %ecx, %ecx -; SSE2-NEXT: cmovel %eax, %ecx -; SSE2-NEXT: xorl $7, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %rbp +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: psubb %xmm0, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: psrlw $2, %xmm3 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: paddb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: paddb %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv16i8: ; SSE3: # BB#0: -; SSE3-NEXT: pushq %rbp -; SSE3-NEXT: pushq %rbx -; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: bsrl %eax, %ecx -; SSE3-NEXT: movl $15, %eax -; SSE3-NEXT: cmovel %eax, %ecx -; SSE3-NEXT: xorl $7, %ecx -; SSE3-NEXT: movd %ecx, %xmm0 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE3-NEXT: bsrl %ecx, %ecx -; SSE3-NEXT: cmovel %eax, %ecx -; SSE3-NEXT: xorl $7, %ecx -; SSE3-NEXT: movd %ecx, %xmm1 -; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE3-NEXT: bsrl %edx, %ecx -; SSE3-NEXT: cmovel %eax, %ecx -; SSE3-NEXT: xorl $7, %ecx -; SSE3-NEXT: movd %ecx, %xmm2 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE3-NEXT: bsrl %ebp, %ebp -; SSE3-NEXT: cmovel %eax, %ebp -; SSE3-NEXT: xorl $7, %ebp -; SSE3-NEXT: movd %ebp, %xmm0 -; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE3-NEXT: bsrl %edi, %edi -; SSE3-NEXT: cmovel %eax, %edi -; SSE3-NEXT: xorl $7, %edi -; SSE3-NEXT: movd %edi, %xmm1 -; SSE3-NEXT: bsrl %ecx, %ecx -; SSE3-NEXT: cmovel %eax, %ecx -; SSE3-NEXT: xorl $7, %ecx -; SSE3-NEXT: movd %ecx, %xmm2 -; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE3-NEXT: bsrl %esi, %ecx -; SSE3-NEXT: cmovel %eax, %ecx -; SSE3-NEXT: xorl $7, %ecx -; SSE3-NEXT: movd %ecx, %xmm3 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE3-NEXT: bsrl %ecx, %ecx -; SSE3-NEXT: cmovel %eax, %ecx -; SSE3-NEXT: xorl $7, %ecx -; SSE3-NEXT: movd %ecx, %xmm1 -; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE3-NEXT: bsrl %ebx, %ecx -; SSE3-NEXT: cmovel %eax, %ecx -; SSE3-NEXT: xorl $7, %ecx -; SSE3-NEXT: movd %ecx, %xmm0 -; SSE3-NEXT: bsrl %edx, %ecx -; SSE3-NEXT: cmovel %eax, %ecx -; SSE3-NEXT: xorl $7, %ecx -; SSE3-NEXT: movd %ecx, %xmm3 -; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE3-NEXT: bsrl %r11d, %ecx -; SSE3-NEXT: cmovel %eax, %ecx -; SSE3-NEXT: xorl $7, %ecx -; SSE3-NEXT: movd %ecx, %xmm0 -; SSE3-NEXT: bsrl %esi, %ecx -; SSE3-NEXT: cmovel %eax, %ecx -; SSE3-NEXT: xorl $7, %ecx -; SSE3-NEXT: movd %ecx, %xmm2 -; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE3-NEXT: bsrl %r9d, %ecx -; SSE3-NEXT: cmovel %eax, %ecx -; SSE3-NEXT: xorl $7, %ecx -; SSE3-NEXT: movd %ecx, %xmm0 -; SSE3-NEXT: bsrl %r10d, %ecx -; SSE3-NEXT: cmovel %eax, %ecx -; SSE3-NEXT: xorl $7, %ecx -; SSE3-NEXT: movd %ecx, %xmm3 -; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE3-NEXT: bsrl %r8d, %ecx -; SSE3-NEXT: cmovel %eax, %ecx -; SSE3-NEXT: xorl $7, %ecx -; SSE3-NEXT: movd %ecx, %xmm4 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE3-NEXT: bsrl %ecx, %ecx -; SSE3-NEXT: cmovel %eax, %ecx -; SSE3-NEXT: xorl $7, %ecx -; SSE3-NEXT: movd %ecx, %xmm0 -; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE3-NEXT: popq %rbx -; SSE3-NEXT: popq %rbp +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $1, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: por %xmm0, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE3-NEXT: pand %xmm2, %xmm1 +; SSE3-NEXT: por %xmm0, %xmm1 +; SSE3-NEXT: pcmpeqd %xmm3, %xmm3 +; SSE3-NEXT: pxor %xmm1, %xmm3 +; SSE3-NEXT: movdqa %xmm3, %xmm0 +; SSE3-NEXT: psrlw $1, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: psubb %xmm0, %xmm3 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: movdqa %xmm3, %xmm1 +; SSE3-NEXT: pand %xmm0, %xmm1 +; SSE3-NEXT: psrlw $2, %xmm3 +; SSE3-NEXT: pand %xmm0, %xmm3 +; SSE3-NEXT: paddb %xmm1, %xmm3 +; SSE3-NEXT: movdqa %xmm3, %xmm0 +; SSE3-NEXT: psrlw $4, %xmm0 +; SSE3-NEXT: paddb %xmm3, %xmm0 +; SSE3-NEXT: pand %xmm2, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv16i8: @@ -1477,174 +1431,68 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; SSE2-LABEL: testv16i8u: ; SSE2: # BB#0: -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: bsrl %eax, %eax -; SSE2-NEXT: xorl $7, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: bsrl %esi, %esi -; SSE2-NEXT: xorl $7, %esi -; SSE2-NEXT: movd %esi, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: bsrl %eax, %eax -; SSE2-NEXT: xorl $7, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx -; SSE2-NEXT: bsrl %ebx, %ebx -; SSE2-NEXT: xorl $7, %ebx -; SSE2-NEXT: movd %ebx, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: bsrl %edx, %edx -; SSE2-NEXT: xorl $7, %edx -; SSE2-NEXT: movd %edx, %xmm0 -; SSE2-NEXT: bsrl %esi, %edx -; SSE2-NEXT: xorl $7, %edx -; SSE2-NEXT: movd %edx, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: bsrl %ecx, %ecx -; SSE2-NEXT: xorl $7, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: bsrl %edx, %edx -; SSE2-NEXT: xorl $7, %edx -; SSE2-NEXT: movd %edx, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: bsrl %edi, %edx -; SSE2-NEXT: xorl $7, %edx -; SSE2-NEXT: movd %edx, %xmm0 -; SSE2-NEXT: bsrl %eax, %eax -; SSE2-NEXT: xorl $7, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: bsrl %r10d, %eax -; SSE2-NEXT: xorl $7, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: bsrl %ecx, %eax -; SSE2-NEXT: xorl $7, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: bsrl %r9d, %eax -; SSE2-NEXT: xorl $7, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: bsrl %r11d, %eax -; SSE2-NEXT: xorl $7, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: bsrl %r8d, %eax -; SSE2-NEXT: xorl $7, %eax -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: bsrl %eax, %eax -; SSE2-NEXT: xorl $7, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: popq %rbx +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: psubb %xmm0, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: psrlw $2, %xmm3 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: paddb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: paddb %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv16i8u: ; SSE3: # BB#0: -; SSE3-NEXT: pushq %rbx -; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: bsrl %eax, %eax -; SSE3-NEXT: xorl $7, %eax -; SSE3-NEXT: movd %eax, %xmm0 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE3-NEXT: bsrl %esi, %esi -; SSE3-NEXT: xorl $7, %esi -; SSE3-NEXT: movd %esi, %xmm1 -; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE3-NEXT: bsrl %eax, %eax -; SSE3-NEXT: xorl $7, %eax -; SSE3-NEXT: movd %eax, %xmm0 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx -; SSE3-NEXT: bsrl %ebx, %ebx -; SSE3-NEXT: xorl $7, %ebx -; SSE3-NEXT: movd %ebx, %xmm2 -; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE3-NEXT: bsrl %edx, %edx -; SSE3-NEXT: xorl $7, %edx -; SSE3-NEXT: movd %edx, %xmm0 -; SSE3-NEXT: bsrl %esi, %edx -; SSE3-NEXT: xorl $7, %edx -; SSE3-NEXT: movd %edx, %xmm3 -; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE3-NEXT: bsrl %ecx, %ecx -; SSE3-NEXT: xorl $7, %ecx -; SSE3-NEXT: movd %ecx, %xmm0 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE3-NEXT: bsrl %edx, %edx -; SSE3-NEXT: xorl $7, %edx -; SSE3-NEXT: movd %edx, %xmm1 -; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE3-NEXT: bsrl %edi, %edx -; SSE3-NEXT: xorl $7, %edx -; SSE3-NEXT: movd %edx, %xmm0 -; SSE3-NEXT: bsrl %eax, %eax -; SSE3-NEXT: xorl $7, %eax -; SSE3-NEXT: movd %eax, %xmm2 -; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE3-NEXT: bsrl %r10d, %eax -; SSE3-NEXT: xorl $7, %eax -; SSE3-NEXT: movd %eax, %xmm0 -; SSE3-NEXT: bsrl %ecx, %eax -; SSE3-NEXT: xorl $7, %eax -; SSE3-NEXT: movd %eax, %xmm3 -; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE3-NEXT: bsrl %r9d, %eax -; SSE3-NEXT: xorl $7, %eax -; SSE3-NEXT: movd %eax, %xmm0 -; SSE3-NEXT: bsrl %r11d, %eax -; SSE3-NEXT: xorl $7, %eax -; SSE3-NEXT: movd %eax, %xmm2 -; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE3-NEXT: bsrl %r8d, %eax -; SSE3-NEXT: xorl $7, %eax -; SSE3-NEXT: movd %eax, %xmm4 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: bsrl %eax, %eax -; SSE3-NEXT: xorl $7, %eax -; SSE3-NEXT: movd %eax, %xmm0 -; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE3-NEXT: popq %rbx +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $1, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: por %xmm0, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE3-NEXT: pand %xmm2, %xmm1 +; SSE3-NEXT: por %xmm0, %xmm1 +; SSE3-NEXT: pcmpeqd %xmm3, %xmm3 +; SSE3-NEXT: pxor %xmm1, %xmm3 +; SSE3-NEXT: movdqa %xmm3, %xmm0 +; SSE3-NEXT: psrlw $1, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: psubb %xmm0, %xmm3 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: movdqa %xmm3, %xmm1 +; SSE3-NEXT: pand %xmm0, %xmm1 +; SSE3-NEXT: psrlw $2, %xmm3 +; SSE3-NEXT: pand %xmm0, %xmm3 +; SSE3-NEXT: paddb %xmm1, %xmm3 +; SSE3-NEXT: movdqa %xmm3, %xmm0 +; SSE3-NEXT: psrlw $4, %xmm0 +; SSE3-NEXT: paddb %xmm3, %xmm0 +; SSE3-NEXT: pand %xmm2, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv16i8u: Index: test/Transforms/SLPVectorizer/X86/ctlz.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/ctlz.ll +++ test/Transforms/SLPVectorizer/X86/ctlz.ll @@ -194,44 +194,11 @@ } define void @ctlz_8i16() #0 { -; SSE2-LABEL: @ctlz_8i16( -; SSE2-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2 -; SSE2-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2 -; SSE2-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2 -; SSE2-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2 -; SSE2-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2 -; SSE2-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2 -; SSE2-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2 -; SSE2-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2 -; SSE2-NEXT: [[CTLZ0:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD0]], i1 false) -; SSE2-NEXT: [[CTLZ1:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD1]], i1 false) -; SSE2-NEXT: [[CTLZ2:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD2]], i1 false) -; SSE2-NEXT: [[CTLZ3:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD3]], i1 false) -; SSE2-NEXT: [[CTLZ4:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD4]], i1 false) -; SSE2-NEXT: [[CTLZ5:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD5]], i1 false) -; SSE2-NEXT: [[CTLZ6:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD6]], i1 false) -; SSE2-NEXT: [[CTLZ7:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD7]], i1 false) -; SSE2-NEXT: store i16 [[CTLZ0]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2 -; SSE2-NEXT: store i16 [[CTLZ1]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2 -; SSE2-NEXT: store i16 [[CTLZ2]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2 -; SSE2-NEXT: store i16 [[CTLZ3]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2 -; SSE2-NEXT: store i16 [[CTLZ4]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2 -; SSE2-NEXT: store i16 [[CTLZ5]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2 -; SSE2-NEXT: store i16 [[CTLZ6]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2 -; SSE2-NEXT: store i16 [[CTLZ7]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2 -; SSE2-NEXT: ret void -; -; SSE42-LABEL: @ctlz_8i16( -; SSE42-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 -; SSE42-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 false) -; SSE42-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 -; SSE42-NEXT: ret void -; -; AVX-LABEL: @ctlz_8i16( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 false) -; AVX-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 -; AVX-NEXT: ret void +; CHECK-LABEL: @ctlz_8i16( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 false) +; CHECK-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; CHECK-NEXT: ret void ; %ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2 %ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2 @@ -261,65 +228,14 @@ } define void @ctlz_16i16() #0 { -; SSE2-LABEL: @ctlz_16i16( -; SSE2-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2 -; SSE2-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2 -; SSE2-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2 -; SSE2-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2 -; SSE2-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2 -; SSE2-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2 -; SSE2-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2 -; SSE2-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2 -; SSE2-NEXT: [[LD8:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8), align 2 -; SSE2-NEXT: [[LD9:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 9), align 2 -; SSE2-NEXT: [[LD10:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 10), align 2 -; SSE2-NEXT: [[LD11:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 11), align 2 -; SSE2-NEXT: [[LD12:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 12), align 2 -; SSE2-NEXT: [[LD13:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 13), align 2 -; SSE2-NEXT: [[LD14:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 14), align 2 -; SSE2-NEXT: [[LD15:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 15), align 2 -; SSE2-NEXT: [[CTLZ0:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD0]], i1 false) -; SSE2-NEXT: [[CTLZ1:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD1]], i1 false) -; SSE2-NEXT: [[CTLZ2:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD2]], i1 false) -; SSE2-NEXT: [[CTLZ3:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD3]], i1 false) -; SSE2-NEXT: [[CTLZ4:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD4]], i1 false) -; SSE2-NEXT: [[CTLZ5:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD5]], i1 false) -; SSE2-NEXT: [[CTLZ6:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD6]], i1 false) -; SSE2-NEXT: [[CTLZ7:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD7]], i1 false) -; SSE2-NEXT: [[CTLZ8:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD8]], i1 false) -; SSE2-NEXT: [[CTLZ9:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD9]], i1 false) -; SSE2-NEXT: [[CTLZ10:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD10]], i1 false) -; SSE2-NEXT: [[CTLZ11:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD11]], i1 false) -; SSE2-NEXT: [[CTLZ12:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD12]], i1 false) -; SSE2-NEXT: [[CTLZ13:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD13]], i1 false) -; SSE2-NEXT: [[CTLZ14:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD14]], i1 false) -; SSE2-NEXT: [[CTLZ15:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD15]], i1 false) -; SSE2-NEXT: store i16 [[CTLZ0]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2 -; SSE2-NEXT: store i16 [[CTLZ1]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2 -; SSE2-NEXT: store i16 [[CTLZ2]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2 -; SSE2-NEXT: store i16 [[CTLZ3]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2 -; SSE2-NEXT: store i16 [[CTLZ4]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2 -; SSE2-NEXT: store i16 [[CTLZ5]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2 -; SSE2-NEXT: store i16 [[CTLZ6]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2 -; SSE2-NEXT: store i16 [[CTLZ7]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2 -; SSE2-NEXT: store i16 [[CTLZ8]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8), align 2 -; SSE2-NEXT: store i16 [[CTLZ9]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 9), align 2 -; SSE2-NEXT: store i16 [[CTLZ10]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 10), align 2 -; SSE2-NEXT: store i16 [[CTLZ11]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 11), align 2 -; SSE2-NEXT: store i16 [[CTLZ12]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 12), align 2 -; SSE2-NEXT: store i16 [[CTLZ13]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 13), align 2 -; SSE2-NEXT: store i16 [[CTLZ14]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 14), align 2 -; SSE2-NEXT: store i16 [[CTLZ15]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 15), align 2 -; SSE2-NEXT: ret void -; -; SSE42-LABEL: @ctlz_16i16( -; SSE42-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 -; SSE42-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 -; SSE42-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 false) -; SSE42-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP2]], i1 false) -; SSE42-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 -; SSE42-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2 -; SSE42-NEXT: ret void +; SSE-LABEL: @ctlz_16i16( +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 false) +; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP2]], i1 false) +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: ret void ; ; AVX-LABEL: @ctlz_16i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2 @@ -379,68 +295,11 @@ } define void @ctlz_16i8() #0 { -; SSE2-LABEL: @ctlz_16i8( -; SSE2-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1 -; SSE2-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1 -; SSE2-NEXT: [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1 -; SSE2-NEXT: [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1 -; SSE2-NEXT: [[LD4:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1 -; SSE2-NEXT: [[LD5:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1 -; SSE2-NEXT: [[LD6:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 6), align 1 -; SSE2-NEXT: [[LD7:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 7), align 1 -; SSE2-NEXT: [[LD8:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 8), align 1 -; SSE2-NEXT: [[LD9:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 9), align 1 -; SSE2-NEXT: [[LD10:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 10), align 1 -; SSE2-NEXT: [[LD11:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 11), align 1 -; SSE2-NEXT: [[LD12:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 12), align 1 -; SSE2-NEXT: [[LD13:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 13), align 1 -; SSE2-NEXT: [[LD14:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 14), align 1 -; SSE2-NEXT: [[LD15:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 15), align 1 -; SSE2-NEXT: [[CTLZ0:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD0]], i1 false) -; SSE2-NEXT: [[CTLZ1:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD1]], i1 false) -; SSE2-NEXT: [[CTLZ2:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD2]], i1 false) -; SSE2-NEXT: [[CTLZ3:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD3]], i1 false) -; SSE2-NEXT: [[CTLZ4:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD4]], i1 false) -; SSE2-NEXT: [[CTLZ5:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD5]], i1 false) -; SSE2-NEXT: [[CTLZ6:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD6]], i1 false) -; SSE2-NEXT: [[CTLZ7:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD7]], i1 false) -; SSE2-NEXT: [[CTLZ8:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD8]], i1 false) -; SSE2-NEXT: [[CTLZ9:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD9]], i1 false) -; SSE2-NEXT: [[CTLZ10:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD10]], i1 false) -; SSE2-NEXT: [[CTLZ11:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD11]], i1 false) -; SSE2-NEXT: [[CTLZ12:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD12]], i1 false) -; SSE2-NEXT: [[CTLZ13:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD13]], i1 false) -; SSE2-NEXT: [[CTLZ14:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD14]], i1 false) -; SSE2-NEXT: [[CTLZ15:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD15]], i1 false) -; SSE2-NEXT: store i8 [[CTLZ0]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 0), align 1 -; SSE2-NEXT: store i8 [[CTLZ1]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 1), align 1 -; SSE2-NEXT: store i8 [[CTLZ2]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 2), align 1 -; SSE2-NEXT: store i8 [[CTLZ3]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 3), align 1 -; SSE2-NEXT: store i8 [[CTLZ4]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 4), align 1 -; SSE2-NEXT: store i8 [[CTLZ5]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 5), align 1 -; SSE2-NEXT: store i8 [[CTLZ6]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 6), align 1 -; SSE2-NEXT: store i8 [[CTLZ7]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 7), align 1 -; SSE2-NEXT: store i8 [[CTLZ8]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 8), align 1 -; SSE2-NEXT: store i8 [[CTLZ9]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 9), align 1 -; SSE2-NEXT: store i8 [[CTLZ10]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 10), align 1 -; SSE2-NEXT: store i8 [[CTLZ11]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 11), align 1 -; SSE2-NEXT: store i8 [[CTLZ12]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 12), align 1 -; SSE2-NEXT: store i8 [[CTLZ13]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1 -; SSE2-NEXT: store i8 [[CTLZ14]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1 -; SSE2-NEXT: store i8 [[CTLZ15]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1 -; SSE2-NEXT: ret void -; -; SSE42-LABEL: @ctlz_16i8( -; SSE42-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 -; SSE42-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 false) -; SSE42-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 -; SSE42-NEXT: ret void -; -; AVX-LABEL: @ctlz_16i8( -; AVX-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 false) -; AVX-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 -; AVX-NEXT: ret void +; CHECK-LABEL: @ctlz_16i8( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 false) +; CHECK-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; CHECK-NEXT: ret void ; %ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1 %ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1 @@ -494,122 +353,14 @@ } define void @ctlz_32i8() #0 { -; SSE2-LABEL: @ctlz_32i8( -; SSE2-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1 -; SSE2-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1 -; SSE2-NEXT: [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1 -; SSE2-NEXT: [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1 -; SSE2-NEXT: [[LD4:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1 -; SSE2-NEXT: [[LD5:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1 -; SSE2-NEXT: [[LD6:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 6), align 1 -; SSE2-NEXT: [[LD7:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 7), align 1 -; SSE2-NEXT: [[LD8:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 8), align 1 -; SSE2-NEXT: [[LD9:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 9), align 1 -; SSE2-NEXT: [[LD10:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 10), align 1 -; SSE2-NEXT: [[LD11:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 11), align 1 -; SSE2-NEXT: [[LD12:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 12), align 1 -; SSE2-NEXT: [[LD13:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 13), align 1 -; SSE2-NEXT: [[LD14:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 14), align 1 -; SSE2-NEXT: [[LD15:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 15), align 1 -; SSE2-NEXT: [[LD16:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16), align 1 -; SSE2-NEXT: [[LD17:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 17), align 1 -; SSE2-NEXT: [[LD18:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 18), align 1 -; SSE2-NEXT: [[LD19:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 19), align 1 -; SSE2-NEXT: [[LD20:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 20), align 1 -; SSE2-NEXT: [[LD21:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 21), align 1 -; SSE2-NEXT: [[LD22:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 22), align 1 -; SSE2-NEXT: [[LD23:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 23), align 1 -; SSE2-NEXT: [[LD24:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 24), align 1 -; SSE2-NEXT: [[LD25:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 25), align 1 -; SSE2-NEXT: [[LD26:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 26), align 1 -; SSE2-NEXT: [[LD27:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 27), align 1 -; SSE2-NEXT: [[LD28:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 28), align 1 -; SSE2-NEXT: [[LD29:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 29), align 1 -; SSE2-NEXT: [[LD30:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 30), align 1 -; SSE2-NEXT: [[LD31:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 31), align 1 -; SSE2-NEXT: [[CTLZ0:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD0]], i1 false) -; SSE2-NEXT: [[CTLZ1:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD1]], i1 false) -; SSE2-NEXT: [[CTLZ2:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD2]], i1 false) -; SSE2-NEXT: [[CTLZ3:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD3]], i1 false) -; SSE2-NEXT: [[CTLZ4:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD4]], i1 false) -; SSE2-NEXT: [[CTLZ5:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD5]], i1 false) -; SSE2-NEXT: [[CTLZ6:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD6]], i1 false) -; SSE2-NEXT: [[CTLZ7:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD7]], i1 false) -; SSE2-NEXT: [[CTLZ8:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD8]], i1 false) -; SSE2-NEXT: [[CTLZ9:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD9]], i1 false) -; SSE2-NEXT: [[CTLZ10:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD10]], i1 false) -; SSE2-NEXT: [[CTLZ11:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD11]], i1 false) -; SSE2-NEXT: [[CTLZ12:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD12]], i1 false) -; SSE2-NEXT: [[CTLZ13:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD13]], i1 false) -; SSE2-NEXT: [[CTLZ14:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD14]], i1 false) -; SSE2-NEXT: [[CTLZ15:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD15]], i1 false) -; SSE2-NEXT: [[CTLZ16:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD16]], i1 false) -; SSE2-NEXT: [[CTLZ17:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD17]], i1 false) -; SSE2-NEXT: [[CTLZ18:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD18]], i1 false) -; SSE2-NEXT: [[CTLZ19:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD19]], i1 false) -; SSE2-NEXT: [[CTLZ20:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD20]], i1 false) -; SSE2-NEXT: [[CTLZ21:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD21]], i1 false) -; SSE2-NEXT: [[CTLZ22:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD22]], i1 false) -; SSE2-NEXT: [[CTLZ23:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD23]], i1 false) -; SSE2-NEXT: [[CTLZ24:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD24]], i1 false) -; SSE2-NEXT: [[CTLZ25:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD25]], i1 false) -; SSE2-NEXT: [[CTLZ26:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD26]], i1 false) -; SSE2-NEXT: [[CTLZ27:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD27]], i1 false) -; SSE2-NEXT: [[CTLZ28:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD28]], i1 false) -; SSE2-NEXT: [[CTLZ29:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD29]], i1 false) -; SSE2-NEXT: [[CTLZ30:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD30]], i1 false) -; SSE2-NEXT: [[CTLZ31:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD31]], i1 false) -; SSE2-NEXT: store i8 [[CTLZ0]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 0), align 1 -; SSE2-NEXT: store i8 [[CTLZ1]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 1), align 1 -; SSE2-NEXT: store i8 [[CTLZ2]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 2), align 1 -; SSE2-NEXT: store i8 [[CTLZ3]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 3), align 1 -; SSE2-NEXT: store i8 [[CTLZ4]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 4), align 1 -; SSE2-NEXT: store i8 [[CTLZ5]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 5), align 1 -; SSE2-NEXT: store i8 [[CTLZ6]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 6), align 1 -; SSE2-NEXT: store i8 [[CTLZ7]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 7), align 1 -; SSE2-NEXT: store i8 [[CTLZ8]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 8), align 1 -; SSE2-NEXT: store i8 [[CTLZ9]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 9), align 1 -; SSE2-NEXT: store i8 [[CTLZ10]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 10), align 1 -; SSE2-NEXT: store i8 [[CTLZ11]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 11), align 1 -; SSE2-NEXT: store i8 [[CTLZ12]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 12), align 1 -; SSE2-NEXT: store i8 [[CTLZ13]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1 -; SSE2-NEXT: store i8 [[CTLZ14]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1 -; SSE2-NEXT: store i8 [[CTLZ15]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1 -; SSE2-NEXT: store i8 [[CTLZ16]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16), align 1 -; SSE2-NEXT: store i8 [[CTLZ17]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 17), align 1 -; SSE2-NEXT: store i8 [[CTLZ18]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 18), align 1 -; SSE2-NEXT: store i8 [[CTLZ19]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 19), align 1 -; SSE2-NEXT: store i8 [[CTLZ20]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 20), align 1 -; SSE2-NEXT: store i8 [[CTLZ21]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 21), align 1 -; SSE2-NEXT: store i8 [[CTLZ22]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 22), align 1 -; SSE2-NEXT: store i8 [[CTLZ23]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 23), align 1 -; SSE2-NEXT: store i8 [[CTLZ24]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 24), align 1 -; SSE2-NEXT: store i8 [[CTLZ25]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 25), align 1 -; SSE2-NEXT: store i8 [[CTLZ26]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 26), align 1 -; SSE2-NEXT: store i8 [[CTLZ27]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 27), align 1 -; SSE2-NEXT: store i8 [[CTLZ28]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 28), align 1 -; SSE2-NEXT: store i8 [[CTLZ29]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 29), align 1 -; SSE2-NEXT: store i8 [[CTLZ30]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 30), align 1 -; SSE2-NEXT: store i8 [[CTLZ31]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 31), align 1 -; SSE2-NEXT: ret void -; -; SSE42-LABEL: @ctlz_32i8( -; SSE42-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 -; SSE42-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 -; SSE42-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 false) -; SSE42-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP2]], i1 false) -; SSE42-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 -; SSE42-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1 -; SSE42-NEXT: ret void -; -; AVX-LABEL: @ctlz_32i8( -; AVX-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 false) -; AVX-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP2]], i1 false) -; AVX-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 -; AVX-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1 -; AVX-NEXT: ret void +; CHECK-LABEL: @ctlz_32i8( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 false) +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP2]], i1 false) +; CHECK-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1 +; CHECK-NEXT: ret void ; %ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1 %ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1 @@ -884,44 +635,11 @@ } define void @ctlz_undef_8i16() #0 { -; SSE2-LABEL: @ctlz_undef_8i16( -; SSE2-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2 -; SSE2-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2 -; SSE2-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2 -; SSE2-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2 -; SSE2-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2 -; SSE2-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2 -; SSE2-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2 -; SSE2-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2 -; SSE2-NEXT: [[CTLZ0:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD0]], i1 true) -; SSE2-NEXT: [[CTLZ1:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD1]], i1 true) -; SSE2-NEXT: [[CTLZ2:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD2]], i1 true) -; SSE2-NEXT: [[CTLZ3:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD3]], i1 true) -; SSE2-NEXT: [[CTLZ4:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD4]], i1 true) -; SSE2-NEXT: [[CTLZ5:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD5]], i1 true) -; SSE2-NEXT: [[CTLZ6:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD6]], i1 true) -; SSE2-NEXT: [[CTLZ7:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD7]], i1 true) -; SSE2-NEXT: store i16 [[CTLZ0]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2 -; SSE2-NEXT: store i16 [[CTLZ1]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2 -; SSE2-NEXT: store i16 [[CTLZ2]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2 -; SSE2-NEXT: store i16 [[CTLZ3]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2 -; SSE2-NEXT: store i16 [[CTLZ4]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2 -; SSE2-NEXT: store i16 [[CTLZ5]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2 -; SSE2-NEXT: store i16 [[CTLZ6]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2 -; SSE2-NEXT: store i16 [[CTLZ7]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2 -; SSE2-NEXT: ret void -; -; SSE42-LABEL: @ctlz_undef_8i16( -; SSE42-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 -; SSE42-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 true) -; SSE42-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 -; SSE42-NEXT: ret void -; -; AVX-LABEL: @ctlz_undef_8i16( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 true) -; AVX-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 -; AVX-NEXT: ret void +; CHECK-LABEL: @ctlz_undef_8i16( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 true) +; CHECK-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; CHECK-NEXT: ret void ; %ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2 %ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2 @@ -951,65 +669,14 @@ } define void @ctlz_undef_16i16() #0 { -; SSE2-LABEL: @ctlz_undef_16i16( -; SSE2-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2 -; SSE2-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2 -; SSE2-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2 -; SSE2-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2 -; SSE2-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2 -; SSE2-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2 -; SSE2-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2 -; SSE2-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2 -; SSE2-NEXT: [[LD8:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8), align 2 -; SSE2-NEXT: [[LD9:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 9), align 2 -; SSE2-NEXT: [[LD10:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 10), align 2 -; SSE2-NEXT: [[LD11:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 11), align 2 -; SSE2-NEXT: [[LD12:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 12), align 2 -; SSE2-NEXT: [[LD13:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 13), align 2 -; SSE2-NEXT: [[LD14:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 14), align 2 -; SSE2-NEXT: [[LD15:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 15), align 2 -; SSE2-NEXT: [[CTLZ0:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD0]], i1 true) -; SSE2-NEXT: [[CTLZ1:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD1]], i1 true) -; SSE2-NEXT: [[CTLZ2:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD2]], i1 true) -; SSE2-NEXT: [[CTLZ3:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD3]], i1 true) -; SSE2-NEXT: [[CTLZ4:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD4]], i1 true) -; SSE2-NEXT: [[CTLZ5:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD5]], i1 true) -; SSE2-NEXT: [[CTLZ6:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD6]], i1 true) -; SSE2-NEXT: [[CTLZ7:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD7]], i1 true) -; SSE2-NEXT: [[CTLZ8:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD8]], i1 true) -; SSE2-NEXT: [[CTLZ9:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD9]], i1 true) -; SSE2-NEXT: [[CTLZ10:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD10]], i1 true) -; SSE2-NEXT: [[CTLZ11:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD11]], i1 true) -; SSE2-NEXT: [[CTLZ12:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD12]], i1 true) -; SSE2-NEXT: [[CTLZ13:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD13]], i1 true) -; SSE2-NEXT: [[CTLZ14:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD14]], i1 true) -; SSE2-NEXT: [[CTLZ15:%.*]] = call i16 @llvm.ctlz.i16(i16 [[LD15]], i1 true) -; SSE2-NEXT: store i16 [[CTLZ0]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2 -; SSE2-NEXT: store i16 [[CTLZ1]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2 -; SSE2-NEXT: store i16 [[CTLZ2]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2 -; SSE2-NEXT: store i16 [[CTLZ3]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2 -; SSE2-NEXT: store i16 [[CTLZ4]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2 -; SSE2-NEXT: store i16 [[CTLZ5]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2 -; SSE2-NEXT: store i16 [[CTLZ6]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2 -; SSE2-NEXT: store i16 [[CTLZ7]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2 -; SSE2-NEXT: store i16 [[CTLZ8]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8), align 2 -; SSE2-NEXT: store i16 [[CTLZ9]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 9), align 2 -; SSE2-NEXT: store i16 [[CTLZ10]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 10), align 2 -; SSE2-NEXT: store i16 [[CTLZ11]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 11), align 2 -; SSE2-NEXT: store i16 [[CTLZ12]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 12), align 2 -; SSE2-NEXT: store i16 [[CTLZ13]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 13), align 2 -; SSE2-NEXT: store i16 [[CTLZ14]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 14), align 2 -; SSE2-NEXT: store i16 [[CTLZ15]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 15), align 2 -; SSE2-NEXT: ret void -; -; SSE42-LABEL: @ctlz_undef_16i16( -; SSE42-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 -; SSE42-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 -; SSE42-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 true) -; SSE42-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP2]], i1 true) -; SSE42-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 -; SSE42-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2 -; SSE42-NEXT: ret void +; SSE-LABEL: @ctlz_undef_16i16( +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 true) +; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP2]], i1 true) +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: ret void ; ; AVX-LABEL: @ctlz_undef_16i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2 @@ -1069,68 +736,11 @@ } define void @ctlz_undef_16i8() #0 { -; SSE2-LABEL: @ctlz_undef_16i8( -; SSE2-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1 -; SSE2-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1 -; SSE2-NEXT: [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1 -; SSE2-NEXT: [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1 -; SSE2-NEXT: [[LD4:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1 -; SSE2-NEXT: [[LD5:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1 -; SSE2-NEXT: [[LD6:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 6), align 1 -; SSE2-NEXT: [[LD7:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 7), align 1 -; SSE2-NEXT: [[LD8:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 8), align 1 -; SSE2-NEXT: [[LD9:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 9), align 1 -; SSE2-NEXT: [[LD10:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 10), align 1 -; SSE2-NEXT: [[LD11:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 11), align 1 -; SSE2-NEXT: [[LD12:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 12), align 1 -; SSE2-NEXT: [[LD13:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 13), align 1 -; SSE2-NEXT: [[LD14:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 14), align 1 -; SSE2-NEXT: [[LD15:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 15), align 1 -; SSE2-NEXT: [[CTLZ0:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD0]], i1 true) -; SSE2-NEXT: [[CTLZ1:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD1]], i1 true) -; SSE2-NEXT: [[CTLZ2:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD2]], i1 true) -; SSE2-NEXT: [[CTLZ3:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD3]], i1 true) -; SSE2-NEXT: [[CTLZ4:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD4]], i1 true) -; SSE2-NEXT: [[CTLZ5:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD5]], i1 true) -; SSE2-NEXT: [[CTLZ6:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD6]], i1 true) -; SSE2-NEXT: [[CTLZ7:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD7]], i1 true) -; SSE2-NEXT: [[CTLZ8:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD8]], i1 true) -; SSE2-NEXT: [[CTLZ9:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD9]], i1 true) -; SSE2-NEXT: [[CTLZ10:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD10]], i1 true) -; SSE2-NEXT: [[CTLZ11:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD11]], i1 true) -; SSE2-NEXT: [[CTLZ12:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD12]], i1 true) -; SSE2-NEXT: [[CTLZ13:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD13]], i1 true) -; SSE2-NEXT: [[CTLZ14:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD14]], i1 true) -; SSE2-NEXT: [[CTLZ15:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD15]], i1 true) -; SSE2-NEXT: store i8 [[CTLZ0]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 0), align 1 -; SSE2-NEXT: store i8 [[CTLZ1]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 1), align 1 -; SSE2-NEXT: store i8 [[CTLZ2]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 2), align 1 -; SSE2-NEXT: store i8 [[CTLZ3]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 3), align 1 -; SSE2-NEXT: store i8 [[CTLZ4]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 4), align 1 -; SSE2-NEXT: store i8 [[CTLZ5]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 5), align 1 -; SSE2-NEXT: store i8 [[CTLZ6]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 6), align 1 -; SSE2-NEXT: store i8 [[CTLZ7]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 7), align 1 -; SSE2-NEXT: store i8 [[CTLZ8]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 8), align 1 -; SSE2-NEXT: store i8 [[CTLZ9]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 9), align 1 -; SSE2-NEXT: store i8 [[CTLZ10]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 10), align 1 -; SSE2-NEXT: store i8 [[CTLZ11]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 11), align 1 -; SSE2-NEXT: store i8 [[CTLZ12]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 12), align 1 -; SSE2-NEXT: store i8 [[CTLZ13]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1 -; SSE2-NEXT: store i8 [[CTLZ14]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1 -; SSE2-NEXT: store i8 [[CTLZ15]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1 -; SSE2-NEXT: ret void -; -; SSE42-LABEL: @ctlz_undef_16i8( -; SSE42-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 -; SSE42-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 true) -; SSE42-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 -; SSE42-NEXT: ret void -; -; AVX-LABEL: @ctlz_undef_16i8( -; AVX-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 true) -; AVX-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 -; AVX-NEXT: ret void +; CHECK-LABEL: @ctlz_undef_16i8( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 true) +; CHECK-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; CHECK-NEXT: ret void ; %ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1 %ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1 @@ -1184,122 +794,14 @@ } define void @ctlz_undef_32i8() #0 { -; SSE2-LABEL: @ctlz_undef_32i8( -; SSE2-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1 -; SSE2-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1 -; SSE2-NEXT: [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1 -; SSE2-NEXT: [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1 -; SSE2-NEXT: [[LD4:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1 -; SSE2-NEXT: [[LD5:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1 -; SSE2-NEXT: [[LD6:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 6), align 1 -; SSE2-NEXT: [[LD7:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 7), align 1 -; SSE2-NEXT: [[LD8:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 8), align 1 -; SSE2-NEXT: [[LD9:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 9), align 1 -; SSE2-NEXT: [[LD10:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 10), align 1 -; SSE2-NEXT: [[LD11:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 11), align 1 -; SSE2-NEXT: [[LD12:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 12), align 1 -; SSE2-NEXT: [[LD13:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 13), align 1 -; SSE2-NEXT: [[LD14:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 14), align 1 -; SSE2-NEXT: [[LD15:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 15), align 1 -; SSE2-NEXT: [[LD16:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16), align 1 -; SSE2-NEXT: [[LD17:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 17), align 1 -; SSE2-NEXT: [[LD18:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 18), align 1 -; SSE2-NEXT: [[LD19:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 19), align 1 -; SSE2-NEXT: [[LD20:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 20), align 1 -; SSE2-NEXT: [[LD21:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 21), align 1 -; SSE2-NEXT: [[LD22:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 22), align 1 -; SSE2-NEXT: [[LD23:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 23), align 1 -; SSE2-NEXT: [[LD24:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 24), align 1 -; SSE2-NEXT: [[LD25:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 25), align 1 -; SSE2-NEXT: [[LD26:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 26), align 1 -; SSE2-NEXT: [[LD27:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 27), align 1 -; SSE2-NEXT: [[LD28:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 28), align 1 -; SSE2-NEXT: [[LD29:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 29), align 1 -; SSE2-NEXT: [[LD30:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 30), align 1 -; SSE2-NEXT: [[LD31:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 31), align 1 -; SSE2-NEXT: [[CTLZ0:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD0]], i1 true) -; SSE2-NEXT: [[CTLZ1:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD1]], i1 true) -; SSE2-NEXT: [[CTLZ2:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD2]], i1 true) -; SSE2-NEXT: [[CTLZ3:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD3]], i1 true) -; SSE2-NEXT: [[CTLZ4:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD4]], i1 true) -; SSE2-NEXT: [[CTLZ5:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD5]], i1 true) -; SSE2-NEXT: [[CTLZ6:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD6]], i1 true) -; SSE2-NEXT: [[CTLZ7:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD7]], i1 true) -; SSE2-NEXT: [[CTLZ8:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD8]], i1 true) -; SSE2-NEXT: [[CTLZ9:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD9]], i1 true) -; SSE2-NEXT: [[CTLZ10:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD10]], i1 true) -; SSE2-NEXT: [[CTLZ11:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD11]], i1 true) -; SSE2-NEXT: [[CTLZ12:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD12]], i1 true) -; SSE2-NEXT: [[CTLZ13:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD13]], i1 true) -; SSE2-NEXT: [[CTLZ14:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD14]], i1 true) -; SSE2-NEXT: [[CTLZ15:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD15]], i1 true) -; SSE2-NEXT: [[CTLZ16:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD16]], i1 true) -; SSE2-NEXT: [[CTLZ17:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD17]], i1 true) -; SSE2-NEXT: [[CTLZ18:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD18]], i1 true) -; SSE2-NEXT: [[CTLZ19:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD19]], i1 true) -; SSE2-NEXT: [[CTLZ20:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD20]], i1 true) -; SSE2-NEXT: [[CTLZ21:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD21]], i1 true) -; SSE2-NEXT: [[CTLZ22:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD22]], i1 true) -; SSE2-NEXT: [[CTLZ23:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD23]], i1 true) -; SSE2-NEXT: [[CTLZ24:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD24]], i1 true) -; SSE2-NEXT: [[CTLZ25:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD25]], i1 true) -; SSE2-NEXT: [[CTLZ26:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD26]], i1 true) -; SSE2-NEXT: [[CTLZ27:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD27]], i1 true) -; SSE2-NEXT: [[CTLZ28:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD28]], i1 true) -; SSE2-NEXT: [[CTLZ29:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD29]], i1 true) -; SSE2-NEXT: [[CTLZ30:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD30]], i1 true) -; SSE2-NEXT: [[CTLZ31:%.*]] = call i8 @llvm.ctlz.i8(i8 [[LD31]], i1 true) -; SSE2-NEXT: store i8 [[CTLZ0]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 0), align 1 -; SSE2-NEXT: store i8 [[CTLZ1]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 1), align 1 -; SSE2-NEXT: store i8 [[CTLZ2]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 2), align 1 -; SSE2-NEXT: store i8 [[CTLZ3]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 3), align 1 -; SSE2-NEXT: store i8 [[CTLZ4]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 4), align 1 -; SSE2-NEXT: store i8 [[CTLZ5]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 5), align 1 -; SSE2-NEXT: store i8 [[CTLZ6]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 6), align 1 -; SSE2-NEXT: store i8 [[CTLZ7]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 7), align 1 -; SSE2-NEXT: store i8 [[CTLZ8]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 8), align 1 -; SSE2-NEXT: store i8 [[CTLZ9]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 9), align 1 -; SSE2-NEXT: store i8 [[CTLZ10]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 10), align 1 -; SSE2-NEXT: store i8 [[CTLZ11]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 11), align 1 -; SSE2-NEXT: store i8 [[CTLZ12]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 12), align 1 -; SSE2-NEXT: store i8 [[CTLZ13]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1 -; SSE2-NEXT: store i8 [[CTLZ14]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1 -; SSE2-NEXT: store i8 [[CTLZ15]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1 -; SSE2-NEXT: store i8 [[CTLZ16]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16), align 1 -; SSE2-NEXT: store i8 [[CTLZ17]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 17), align 1 -; SSE2-NEXT: store i8 [[CTLZ18]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 18), align 1 -; SSE2-NEXT: store i8 [[CTLZ19]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 19), align 1 -; SSE2-NEXT: store i8 [[CTLZ20]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 20), align 1 -; SSE2-NEXT: store i8 [[CTLZ21]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 21), align 1 -; SSE2-NEXT: store i8 [[CTLZ22]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 22), align 1 -; SSE2-NEXT: store i8 [[CTLZ23]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 23), align 1 -; SSE2-NEXT: store i8 [[CTLZ24]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 24), align 1 -; SSE2-NEXT: store i8 [[CTLZ25]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 25), align 1 -; SSE2-NEXT: store i8 [[CTLZ26]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 26), align 1 -; SSE2-NEXT: store i8 [[CTLZ27]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 27), align 1 -; SSE2-NEXT: store i8 [[CTLZ28]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 28), align 1 -; SSE2-NEXT: store i8 [[CTLZ29]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 29), align 1 -; SSE2-NEXT: store i8 [[CTLZ30]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 30), align 1 -; SSE2-NEXT: store i8 [[CTLZ31]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 31), align 1 -; SSE2-NEXT: ret void -; -; SSE42-LABEL: @ctlz_undef_32i8( -; SSE42-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 -; SSE42-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 -; SSE42-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 true) -; SSE42-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP2]], i1 true) -; SSE42-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 -; SSE42-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1 -; SSE42-NEXT: ret void -; -; AVX-LABEL: @ctlz_undef_32i8( -; AVX-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 true) -; AVX-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP2]], i1 true) -; AVX-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 -; AVX-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1 -; AVX-NEXT: ret void +; CHECK-LABEL: @ctlz_undef_32i8( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 true) +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP2]], i1 true) +; CHECK-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1 +; CHECK-NEXT: ret void ; %ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1 %ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1