Index: llvm/trunk/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h
@@ -226,10 +226,6 @@
       SCALEF, SCALEFS,
 
-      // Integer add/sub with unsigned saturation.
-      ADDUS,
-      SUBUS,
-
       // Integer add/sub with signed saturation.
       ADDS,
       SUBS,
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -829,6 +829,17 @@
       setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
     }
 
+    setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
+    setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
+    setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
+    setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
+    // Use widening instead of promotion.
+    for (auto VT : { MVT::v8i8, MVT::v4i8, MVT::v2i8,
+                     MVT::v4i16, MVT::v2i16 }) {
+      setOperationAction(ISD::UADDSAT, VT, Custom);
+      setOperationAction(ISD::USUBSAT, VT, Custom);
+    }
+
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
@@ -1200,6 +1211,11 @@
     setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
     setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
 
+    setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
+
     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
       setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
       setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
@@ -1317,6 +1333,8 @@
       setOperationAction(ISD::SETCC, VT, Custom);
       setOperationAction(ISD::SELECT, VT, Custom);
       setOperationAction(ISD::TRUNCATE, VT, Custom);
+      setOperationAction(ISD::UADDSAT, VT, Custom);
+      setOperationAction(ISD::USUBSAT, VT, Custom);
 
       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
@@ -1577,6 +1595,8 @@
       setOperationAction(ISD::SUB, VT, Custom);
       setOperationAction(ISD::MUL, VT, Custom);
       setOperationAction(ISD::VSELECT, VT, Expand);
+      setOperationAction(ISD::UADDSAT, VT, Custom);
+      setOperationAction(ISD::USUBSAT, VT, Custom);
 
       setOperationAction(ISD::TRUNCATE, VT, Custom);
       setOperationAction(ISD::SETCC, VT, Custom);
@@ -1657,6 +1677,8 @@
       setOperationAction(ISD::SMIN, VT, Legal);
       setOperationAction(ISD::UMIN, VT, Legal);
       setOperationAction(ISD::SETCC, VT, Custom);
+      setOperationAction(ISD::UADDSAT, VT, Legal);
+      setOperationAction(ISD::USUBSAT, VT, Legal);
 
       // The condition codes aren't legal in SSE/AVX and under AVX512 we use
       // setcc all the way to isel and prefer SETGT in some isel patterns.
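(Editorial note, not part of the patch.) ISD::UADDSAT and ISD::USUBSAT are the target-independent saturating add/sub nodes that replace X86ISD::ADDUS/SUBUS above: per element they clamp to the type bounds instead of wrapping, which is exactly what PADDUS/PSUBUS compute. A minimal scalar sketch of the semantics:

#include <cassert>
#include <cstdint>

// Scalar model of ISD::UADDSAT: clamp to the type maximum on overflow.
static uint8_t uaddsat(uint8_t a, uint8_t b) {
  uint8_t r = (uint8_t)(a + b);
  return r < a ? UINT8_MAX : r; // the truncated sum shrank iff it wrapped
}

// Scalar model of ISD::USUBSAT: clamp to zero on underflow.
static uint8_t usubsat(uint8_t a, uint8_t b) {
  return a > b ? (uint8_t)(a - b) : 0;
}

int main() {
  assert(uaddsat(250, 10) == 255); // a plain add would wrap to 4
  assert(usubsat(10, 250) == 0);   // a plain sub would wrap to 16
}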
@@ -19147,7 +19169,7 @@
       break;
     }
 
-    SDValue Result = DAG.getNode(X86ISD::SUBUS, dl, VT, Op0, Op1);
+    SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
     return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
                        DAG.getConstant(0, dl, VT));
   }
@@ -23366,6 +23388,26 @@
   return split256IntArith(Op, DAG);
 }
 
+static SDValue LowerUADDSAT_USUBSAT(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  if (VT.getScalarType() == MVT::i1) {
+    SDLoc dl(Op);
+    switch (Op.getOpcode()) {
+    default: llvm_unreachable("Expected saturated arithmetic opcode");
+    case ISD::UADDSAT:
+      return DAG.getNode(ISD::OR, dl, VT, Op.getOperand(0), Op.getOperand(1));
+    case ISD::USUBSAT:
+      return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
+                         DAG.getNOT(dl, Op.getOperand(1), VT));
+    }
+  }
+
+  assert(Op.getSimpleValueType().is256BitVector() &&
+         Op.getSimpleValueType().isInteger() &&
+         "Only handle AVX 256-bit vector integer operation");
+  return split256IntArith(Op, DAG);
+}
+
 static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
   if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
@@ -26147,6 +26189,8 @@
   case ISD::SUBCARRY:           return LowerADDSUBCARRY(Op, DAG);
   case ISD::ADD:
   case ISD::SUB:                return LowerADD_SUB(Op, DAG);
+  case ISD::UADDSAT:
+  case ISD::USUBSAT:            return LowerUADDSAT_USUBSAT(Op, DAG);
   case ISD::SMAX:
   case ISD::SMIN:
   case ISD::UMAX:
@@ -26228,11 +26272,12 @@
     }
     return;
   }
+  case ISD::UADDSAT:
+  case ISD::USUBSAT:
   case X86ISD::VPMADDWD:
-  case X86ISD::ADDUS:
-  case X86ISD::SUBUS:
   case X86ISD::AVG: {
-    // Legalize types for X86ISD::AVG/ADDUS/SUBUS/VPMADDWD by widening.
+    // Legalize types for ISD::UADDSAT/USUBSAT and X86ISD::AVG/VPMADDWD
+    // by widening.
     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
 
     EVT VT = N->getValueType(0);
@@ -26966,8 +27011,6 @@
   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
   case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
-  case X86ISD::ADDUS:              return "X86ISD::ADDUS";
-  case X86ISD::SUBUS:              return "X86ISD::SUBUS";
   case X86ISD::HADD:               return "X86ISD::HADD";
   case X86ISD::HSUB:               return "X86ISD::HSUB";
   case X86ISD::FHADD:              return "X86ISD::FHADD";
@@ -34043,9 +34086,9 @@
   SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
   SDValue CondRHS = Cond->getOperand(1);
 
-  auto SUBUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
-                         ArrayRef<SDValue> Ops) {
-    return DAG.getNode(X86ISD::SUBUS, DL, Ops[0].getValueType(), Ops);
+  auto USUBSATBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+                           ArrayRef<SDValue> Ops) {
+    return DAG.getNode(ISD::USUBSAT, DL, Ops[0].getValueType(), Ops);
   };
 
   // Look for a general sub with unsigned saturation first.
@@ -34054,22 +34097,22 @@
   if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
       Other->getOpcode() == ISD::SUB && OpRHS == CondRHS)
     return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
-                            SUBUSBuilder);
+                            USUBSATBuilder);
 
   if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS)) {
     if (isa<ConstantSDNode>(CondRHS)) {
       // If the RHS is a constant we have to reverse the const
       // canonicalization.
       // x > C-1 ? x+-C : 0 --> subus x, C
-      auto MatchSUBUS = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
+      auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
         return Cond->getAPIntValue() == (-Op->getAPIntValue() - 1);
       };
       if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
-          ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchSUBUS)) {
+          ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT)) {
         OpRHS = DAG.getNode(ISD::SUB, DL, VT,
                             DAG.getConstant(0, DL, VT), OpRHS);
         return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
-                                SUBUSBuilder);
+                                USUBSATBuilder);
       }
 
       // Another special case: If C was a sign bit, the sub has been
@@ -34085,7 +34128,7 @@
         // Note that we have to rebuild the RHS constant here to ensure we
         // don't rely on particular values of undef lanes.
         return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
-                                SUBUSBuilder);
+                                USUBSATBuilder);
       }
     }
   }
@@ -34118,9 +34161,9 @@
   if (Other.getNode() && Other.getOpcode() == ISD::ADD) {
     SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);
 
-    auto ADDUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
-                           ArrayRef<SDValue> Ops) {
-      return DAG.getNode(X86ISD::ADDUS, DL, Ops[0].getValueType(), Ops);
+    auto UADDSATBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+                             ArrayRef<SDValue> Ops) {
+      return DAG.getNode(ISD::UADDSAT, DL, Ops[0].getValueType(), Ops);
     };
 
     // Canonicalize condition operands.
@@ -34135,20 +34178,20 @@
     if (CC == ISD::SETULE && Other == CondRHS &&
         (OpLHS == CondLHS || OpRHS == CondLHS))
       return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
-                              ADDUSBuilder);
+                              UADDSATBuilder);
 
     if (isa<ConstantSDNode>(OpRHS) && isa<ConstantSDNode>(CondRHS) &&
         CondLHS == OpLHS) {
       // If the RHS is a constant we have to reverse the const
       // canonicalization.
      // x > ~C ? x+C : ~0 --> addus x, C
-      auto MatchADDUS = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
+      auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
        return Cond->getAPIntValue() == ~Op->getAPIntValue();
       };
       if (CC == ISD::SETULE &&
-          ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchADDUS))
+          ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT))
         return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
-                                ADDUSBuilder);
+                                UADDSATBuilder);
     }
   }
 }
@@ -40764,16 +40807,16 @@
   } else
     return SDValue();
 
-  auto SUBUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
-                         ArrayRef<SDValue> Ops) {
-    return DAG.getNode(X86ISD::SUBUS, DL, Ops[0].getValueType(), Ops);
+  auto USUBSATBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+                           ArrayRef<SDValue> Ops) {
+    return DAG.getNode(ISD::USUBSAT, DL, Ops[0].getValueType(), Ops);
   };
 
   // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
   // special preprocessing in some cases.
   if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
     return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
-                            { SubusLHS, SubusRHS }, SUBUSBuilder);
+                            { SubusLHS, SubusRHS }, USUBSATBuilder);
 
   // Special preprocessing case can be only applied
   // if the value was zero extended from 16 bit,
@@ -40805,7 +40848,7 @@
   SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
   SDValue Psubus = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), ShrinkedType,
-                                    { NewSubusLHS, NewSubusRHS }, SUBUSBuilder);
+                                    { NewSubusLHS, NewSubusRHS }, USUBSATBuilder);
 
   // Zero extend the result, it may be used somewhere as 32 bit,
   // if not zext and following trunc will shrink.
   return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
Index: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td
@@ -4834,9 +4834,9 @@
                                     SchedWriteVecALU, HasBWI, 1>;
 defm VPSUBS : avx512_binop_rm_vl_bw<0xE8, 0xE9, "vpsubs", X86subs,
                                     SchedWriteVecALU, HasBWI, 0>;
-defm VPADDUS : avx512_binop_rm_vl_bw<0xDC, 0xDD, "vpaddus", X86addus,
+defm VPADDUS : avx512_binop_rm_vl_bw<0xDC, 0xDD, "vpaddus", uaddsat,
                                      SchedWriteVecALU, HasBWI, 1>;
-defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", X86subus,
+defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", usubsat,
                                      SchedWriteVecALU, HasBWI, 0>;
 defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul,
                                     SchedWritePMULLD, HasAVX512, 1>, T8PD;
Index: llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -227,8 +227,6 @@
                                       SDTCisVec<1>,
                                       SDTCisSameAs<2, 1>]>;
 
-def X86addus   : SDNode<"X86ISD::ADDUS", SDTIntBinOp, [SDNPCommutative]>;
-def X86subus   : SDNode<"X86ISD::SUBUS", SDTIntBinOp>;
 def X86adds    : SDNode<"X86ISD::ADDS", SDTIntBinOp, [SDNPCommutative]>;
 def X86subs    : SDNode<"X86ISD::SUBS", SDTIntBinOp>;
 def X86mulhrs  : SDNode<"X86ISD::MULHRS", SDTIntBinOp, [SDNPCommutative]>;
Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td
@@ -3627,9 +3627,9 @@
                              SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
 defm PADDSW  : PDI_binop_all<0xED, "paddsw", X86adds, v8i16, v16i16,
                              SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
-defm PADDUSB : PDI_binop_all<0xDC, "paddusb", X86addus, v16i8, v32i8,
+defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8,
                              SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
-defm PADDUSW : PDI_binop_all<0xDD, "paddusw", X86addus, v8i16, v16i16,
+defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16,
                              SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
 defm PMULLW  : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
                              SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
@@ -3649,9 +3649,9 @@
                              SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
 defm PSUBSW  : PDI_binop_all<0xE9, "psubsw", X86subs, v8i16, v16i16,
                              SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
-defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8,
+defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8,
                              SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
-defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16,
+defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16,
                              SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
 defm PMINUB  : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
                              SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
Index: llvm/trunk/test/CodeGen/X86/uadd_sat_vec.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/uadd_sat_vec.ll
+++ llvm/trunk/test/CodeGen/X86/uadd_sat_vec.ll
@@ -34,9440 +34,198 @@
 
 ; Legal types, depending on architecture.
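(Editorial aside, not part of the patch.) The select combine and the i1 lowering above rest on a handful of scalar identities; this standalone C++ sketch checks them exhaustively for i8 and i1. The vector tests below exercise the same semantics lane by lane.

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned x = 0; x < 256; ++x)
    for (unsigned y = 0; y < 256; ++y) {
      uint8_t a = (uint8_t)x, b = (uint8_t)y;
      uint8_t sat_add = (uint8_t)(a + b) < a ? 0xFF : (uint8_t)(a + b);
      uint8_t sat_sub = a > b ? (uint8_t)(a - b) : 0;
      // Select form matched by the combine: x >= y ? x-y : 0 --> usubsat x, y
      assert((a >= b ? (uint8_t)(a - b) : (uint8_t)0) == sat_sub);
      // Constant-RHS form (overflow test): x <= ~C ? x+C : ~0 --> uaddsat x, C
      assert((a <= (uint8_t)~b ? (uint8_t)(a + b) : (uint8_t)0xFF) == sat_add);
    }
  // The i1 special case in LowerUADDSAT_USUBSAT: uaddsat on i1 is OR,
  // usubsat is AND with the inverted operand.
  for (int a = 0; a <= 1; ++a)
    for (int b = 0; b <= 1; ++b) {
      assert(((a + b) > 1 ? 1 : (a + b)) == (a | b));
      assert(((a - b) < 0 ? 0 : (a - b)) == (a & ~b));
    }
}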
define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { -; SSE2-LABEL: v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %r15 -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r13 -; SSE2-NEXT: pushq %r12 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %dil -; SSE2-NEXT: jb .LBB0_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: .LBB0_2: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb $-1, %al -; SSE2-NEXT: jb .LBB0_4 -; SSE2-NEXT: # %bb.3: -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: .LBB0_4: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB0_6 -; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB0_6: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r10b -; SSE2-NEXT: jb .LBB0_8 -; SSE2-NEXT: # %bb.7: -; SSE2-NEXT: movl %edx, %r10d -; SSE2-NEXT: .LBB0_8: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r11b -; SSE2-NEXT: jb .LBB0_10 -; SSE2-NEXT: # %bb.9: -; SSE2-NEXT: movl %edx, %r11d -; SSE2-NEXT: .LBB0_10: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r12b -; SSE2-NEXT: jb .LBB0_12 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: movl %edx, %r12d -; SSE2-NEXT: .LBB0_12: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r13b -; SSE2-NEXT: jb .LBB0_14 -; SSE2-NEXT: # %bb.13: -; SSE2-NEXT: movl %edx, %r13d -; SSE2-NEXT: .LBB0_14: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r8b -; SSE2-NEXT: jb .LBB0_16 -; SSE2-NEXT: # %bb.15: -; SSE2-NEXT: movl %edx, %r8d -; SSE2-NEXT: .LBB0_16: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r14b -; SSE2-NEXT: jb .LBB0_18 -; SSE2-NEXT: # %bb.17: -; SSE2-NEXT: movl %edx, %r14d -; SSE2-NEXT: .LBB0_18: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r15b -; SSE2-NEXT: jb .LBB0_20 -; SSE2-NEXT: # %bb.19: -; SSE2-NEXT: movl %edx, %r15d -; SSE2-NEXT: .LBB0_20: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r9b -; SSE2-NEXT: jb .LBB0_22 -; SSE2-NEXT: # %bb.21: -; SSE2-NEXT: movl %edx, %r9d -; SSE2-NEXT: .LBB0_22: -; SSE2-NEXT: movzbl %dil, %edi -; SSE2-NEXT: movzbl %al, %esi -; SSE2-NEXT: movzbl %cl, %ebp -; SSE2-NEXT: movzbl %r10b, %edx -; SSE2-NEXT: movzbl %r11b, %ebx -; SSE2-NEXT: movzbl %r12b, %r10d -; SSE2-NEXT: movzbl %r13b, %r11d -; SSE2-NEXT: movzbl %r8b, %r8d -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB0_24 -; SSE2-NEXT: # %bb.23: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB0_24: -; SSE2-NEXT: movd %edi, %xmm2 -; SSE2-NEXT: movd %esi, %xmm3 -; SSE2-NEXT: movd %ebp, %xmm5 -; SSE2-NEXT: movd %edx, %xmm0 -; SSE2-NEXT: movd %ebx, %xmm6 -; SSE2-NEXT: movd %r10d, %xmm4 -; SSE2-NEXT: movd %r11d, %xmm7 -; SSE2-NEXT: movd %r8d, %xmm1 -; SSE2-NEXT: movzbl %r14b, %esi -; SSE2-NEXT: movzbl %r15b, %edx -; SSE2-NEXT: movzbl %r9b, %eax 
-; SSE2-NEXT: movzbl %cl, %edi -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb $-1, %bl -; SSE2-NEXT: jb .LBB0_26 -; SSE2-NEXT: # %bb.25: -; SSE2-NEXT: movl %ecx, %ebx -; SSE2-NEXT: .LBB0_26: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE2-NEXT: movd %esi, %xmm6 -; SSE2-NEXT: movd %edx, %xmm5 -; SSE2-NEXT: movd %eax, %xmm7 -; SSE2-NEXT: movd %edi, %xmm2 -; SSE2-NEXT: movzbl %bl, %eax -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB0_28 -; SSE2-NEXT: # %bb.27: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB0_28: -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB0_30 -; SSE2-NEXT: # %bb.29: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB0_30: -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: movzbl %cl, %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB0_32 -; SSE2-NEXT: # %bb.31: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB0_32: -; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v16i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: 
pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %dil -; SSSE3-NEXT: jb .LBB0_2 -; SSSE3-NEXT: # %bb.1: -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB0_2: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb $-1, %al -; SSSE3-NEXT: jb .LBB0_4 -; SSSE3-NEXT: # %bb.3: -; SSSE3-NEXT: movl %ecx, %eax -; SSSE3-NEXT: .LBB0_4: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB0_6 -; SSSE3-NEXT: # %bb.5: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB0_6: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r10b -; SSSE3-NEXT: jb .LBB0_8 -; SSSE3-NEXT: # %bb.7: -; SSSE3-NEXT: movl %edx, %r10d -; SSSE3-NEXT: .LBB0_8: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r11b -; SSSE3-NEXT: jb .LBB0_10 -; SSSE3-NEXT: # %bb.9: -; SSSE3-NEXT: movl %edx, %r11d -; SSSE3-NEXT: .LBB0_10: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r12b -; SSSE3-NEXT: jb .LBB0_12 -; SSSE3-NEXT: # %bb.11: -; SSSE3-NEXT: movl %edx, %r12d -; SSSE3-NEXT: .LBB0_12: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r13b -; SSSE3-NEXT: jb .LBB0_14 -; SSSE3-NEXT: # %bb.13: -; SSSE3-NEXT: movl %edx, %r13d -; SSSE3-NEXT: .LBB0_14: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r8b -; SSSE3-NEXT: jb .LBB0_16 -; SSSE3-NEXT: # %bb.15: -; SSSE3-NEXT: movl %edx, %r8d -; SSSE3-NEXT: .LBB0_16: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r14b -; SSSE3-NEXT: jb .LBB0_18 -; SSSE3-NEXT: # %bb.17: -; SSSE3-NEXT: movl %edx, %r14d -; SSSE3-NEXT: .LBB0_18: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r15b -; SSSE3-NEXT: jb .LBB0_20 -; SSSE3-NEXT: # %bb.19: -; SSSE3-NEXT: movl %edx, %r15d -; SSSE3-NEXT: .LBB0_20: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r9b -; SSSE3-NEXT: jb .LBB0_22 -; SSSE3-NEXT: # %bb.21: -; SSSE3-NEXT: movl %edx, %r9d -; SSSE3-NEXT: .LBB0_22: -; SSSE3-NEXT: movzbl %dil, %edi -; SSSE3-NEXT: movzbl %al, %esi -; SSSE3-NEXT: movzbl %cl, %ebp -; SSSE3-NEXT: movzbl %r10b, %edx -; SSSE3-NEXT: movzbl %r11b, %ebx -; SSSE3-NEXT: movzbl %r12b, %r10d -; SSSE3-NEXT: movzbl %r13b, %r11d -; SSSE3-NEXT: movzbl %r8b, %r8d -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB0_24 -; SSSE3-NEXT: # %bb.23: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB0_24: -; SSSE3-NEXT: movd %edi, %xmm2 -; SSSE3-NEXT: movd %esi, %xmm3 -; SSSE3-NEXT: movd %ebp, %xmm5 -; SSSE3-NEXT: movd %edx, %xmm0 -; SSSE3-NEXT: movd %ebx, %xmm6 -; SSSE3-NEXT: movd %r10d, %xmm4 -; SSSE3-NEXT: movd %r11d, %xmm7 -; SSSE3-NEXT: movd %r8d, %xmm1 -; SSSE3-NEXT: movzbl %r14b, %esi -; SSSE3-NEXT: movzbl %r15b, %edx -; SSSE3-NEXT: movzbl %r9b, %eax -; SSSE3-NEXT: movzbl %cl, %edi -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: addb 
-{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb $-1, %bl -; SSSE3-NEXT: jb .LBB0_26 -; SSSE3-NEXT: # %bb.25: -; SSSE3-NEXT: movl %ecx, %ebx -; SSSE3-NEXT: .LBB0_26: -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSSE3-NEXT: movd %esi, %xmm6 -; SSSE3-NEXT: movd %edx, %xmm5 -; SSSE3-NEXT: movd %eax, %xmm7 -; SSSE3-NEXT: movd %edi, %xmm2 -; SSSE3-NEXT: movzbl %bl, %eax -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB0_28 -; SSSE3-NEXT: # %bb.27: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB0_28: -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: movzbl %cl, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB0_30 -; SSSE3-NEXT: # %bb.29: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB0_30: -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSSE3-NEXT: movzbl %cl, %ecx -; SSSE3-NEXT: movd %ecx, %xmm4 -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB0_32 -; SSSE3-NEXT: # %bb.31: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB0_32: -; SSSE3-NEXT: movzbl %cl, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pextrb $1, %xmm1, %eax -; SSE41-NEXT: pextrb $1, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %sil -; SSE41-NEXT: 
movb $-1, %dl -; SSE41-NEXT: jb .LBB0_2 -; SSE41-NEXT: # %bb.1: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB0_2: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: pextrb $0, %xmm0, %edx -; SSE41-NEXT: addb %al, %dl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB0_4 -; SSE41-NEXT: # %bb.3: -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: .LBB0_4: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: movd %eax, %xmm2 -; SSE41-NEXT: pinsrb $1, %ecx, %xmm2 -; SSE41-NEXT: pextrb $2, %xmm1, %eax -; SSE41-NEXT: pextrb $2, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB0_6 -; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB0_6: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $2, %eax, %xmm2 -; SSE41-NEXT: pextrb $3, %xmm1, %eax -; SSE41-NEXT: pextrb $3, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB0_8 -; SSE41-NEXT: # %bb.7: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB0_8: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $3, %eax, %xmm2 -; SSE41-NEXT: pextrb $4, %xmm1, %eax -; SSE41-NEXT: pextrb $4, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB0_10 -; SSE41-NEXT: # %bb.9: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB0_10: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $4, %eax, %xmm2 -; SSE41-NEXT: pextrb $5, %xmm1, %eax -; SSE41-NEXT: pextrb $5, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB0_12 -; SSE41-NEXT: # %bb.11: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB0_12: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $5, %eax, %xmm2 -; SSE41-NEXT: pextrb $6, %xmm1, %eax -; SSE41-NEXT: pextrb $6, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB0_14 -; SSE41-NEXT: # %bb.13: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB0_14: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $6, %eax, %xmm2 -; SSE41-NEXT: pextrb $7, %xmm1, %eax -; SSE41-NEXT: pextrb $7, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB0_16 -; SSE41-NEXT: # %bb.15: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB0_16: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $7, %eax, %xmm2 -; SSE41-NEXT: pextrb $8, %xmm1, %eax -; SSE41-NEXT: pextrb $8, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB0_18 -; SSE41-NEXT: # %bb.17: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB0_18: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $8, %eax, %xmm2 -; SSE41-NEXT: pextrb $9, %xmm1, %eax -; SSE41-NEXT: pextrb $9, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB0_20 -; SSE41-NEXT: # %bb.19: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB0_20: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $9, %eax, %xmm2 -; SSE41-NEXT: pextrb $10, %xmm1, %eax -; SSE41-NEXT: pextrb $10, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB0_22 -; SSE41-NEXT: # %bb.21: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB0_22: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $10, %eax, %xmm2 -; SSE41-NEXT: pextrb $11, %xmm1, %eax -; SSE41-NEXT: pextrb $11, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB0_24 -; SSE41-NEXT: # %bb.23: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB0_24: -; SSE41-NEXT: 
movzbl %al, %eax -; SSE41-NEXT: pinsrb $11, %eax, %xmm2 -; SSE41-NEXT: pextrb $12, %xmm1, %eax -; SSE41-NEXT: pextrb $12, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB0_26 -; SSE41-NEXT: # %bb.25: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB0_26: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $12, %eax, %xmm2 -; SSE41-NEXT: pextrb $13, %xmm1, %eax -; SSE41-NEXT: pextrb $13, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB0_28 -; SSE41-NEXT: # %bb.27: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB0_28: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $13, %eax, %xmm2 -; SSE41-NEXT: pextrb $14, %xmm1, %eax -; SSE41-NEXT: pextrb $14, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB0_30 -; SSE41-NEXT: # %bb.29: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB0_30: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $14, %eax, %xmm2 -; SSE41-NEXT: pextrb $15, %xmm1, %eax -; SSE41-NEXT: pextrb $15, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: jb .LBB0_32 -; SSE41-NEXT: # %bb.31: -; SSE41-NEXT: movl %ecx, %esi -; SSE41-NEXT: .LBB0_32: -; SSE41-NEXT: movzbl %sil, %eax -; SSE41-NEXT: pinsrb $15, %eax, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: v16i8: +; SSE: # %bb.0: +; SSE-NEXT: paddusb %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: v16i8: ; AVX: # %bb.0: -; AVX-NEXT: vpextrb $1, %xmm1, %eax -; AVX-NEXT: vpextrb $1, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %sil -; AVX-NEXT: movb $-1, %dl -; AVX-NEXT: jb .LBB0_2 -; AVX-NEXT: # %bb.1: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB0_2: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpextrb $0, %xmm1, %eax -; AVX-NEXT: vpextrb $0, %xmm0, %edx -; AVX-NEXT: addb %al, %dl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB0_4 -; AVX-NEXT: # %bb.3: -; AVX-NEXT: movl %edx, %eax -; AVX-NEXT: .LBB0_4: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vmovd %eax, %xmm2 -; AVX-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $2, %xmm1, %eax -; AVX-NEXT: vpextrb $2, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB0_6 -; AVX-NEXT: # %bb.5: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB0_6: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $3, %xmm1, %eax -; AVX-NEXT: vpextrb $3, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB0_8 -; AVX-NEXT: # %bb.7: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB0_8: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $4, %xmm1, %eax -; AVX-NEXT: vpextrb $4, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB0_10 -; AVX-NEXT: # %bb.9: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB0_10: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $5, %xmm1, %eax -; AVX-NEXT: vpextrb $5, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB0_12 -; AVX-NEXT: # %bb.11: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB0_12: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $6, %xmm1, %eax -; AVX-NEXT: vpextrb $6, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB0_14 -; AVX-NEXT: # %bb.13: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB0_14: -; AVX-NEXT: movzbl %al, %eax -; 
AVX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $7, %xmm1, %eax -; AVX-NEXT: vpextrb $7, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB0_16 -; AVX-NEXT: # %bb.15: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB0_16: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $8, %xmm1, %eax -; AVX-NEXT: vpextrb $8, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB0_18 -; AVX-NEXT: # %bb.17: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB0_18: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $9, %xmm1, %eax -; AVX-NEXT: vpextrb $9, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB0_20 -; AVX-NEXT: # %bb.19: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB0_20: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $10, %xmm1, %eax -; AVX-NEXT: vpextrb $10, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB0_22 -; AVX-NEXT: # %bb.21: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB0_22: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $11, %xmm1, %eax -; AVX-NEXT: vpextrb $11, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB0_24 -; AVX-NEXT: # %bb.23: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB0_24: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $12, %xmm1, %eax -; AVX-NEXT: vpextrb $12, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB0_26 -; AVX-NEXT: # %bb.25: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB0_26: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $13, %xmm1, %eax -; AVX-NEXT: vpextrb $13, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB0_28 -; AVX-NEXT: # %bb.27: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB0_28: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $14, %xmm1, %eax -; AVX-NEXT: vpextrb $14, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB0_30 -; AVX-NEXT: # %bb.29: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB0_30: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $15, %xmm1, %eax -; AVX-NEXT: vpextrb $15, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: jb .LBB0_32 -; AVX-NEXT: # %bb.31: -; AVX-NEXT: movl %ecx, %esi -; AVX-NEXT: .LBB0_32: -; AVX-NEXT: movzbl %sil, %eax -; AVX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %z = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %x, <16 x i8> %y) ret <16 x i8> %z } define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { -; SSE2-LABEL: v32i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %r15 -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r13 -; SSE2-NEXT: pushq %r12 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %sil -; SSE2-NEXT: jb .LBB1_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: movl %eax, %esi -; SSE2-NEXT: .LBB1_2: -; 
SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %dil -; SSE2-NEXT: jb .LBB1_4 -; SSE2-NEXT: # %bb.3: -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: .LBB1_4: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %r8b -; SSE2-NEXT: jb .LBB1_6 -; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: movl %eax, %r8d -; SSE2-NEXT: .LBB1_6: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %bl -; SSE2-NEXT: jb .LBB1_8 -; SSE2-NEXT: # %bb.7: -; SSE2-NEXT: movl %eax, %ebx -; SSE2-NEXT: .LBB1_8: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %r11b -; SSE2-NEXT: jb .LBB1_10 -; SSE2-NEXT: # %bb.9: -; SSE2-NEXT: movl %eax, %r11d -; SSE2-NEXT: .LBB1_10: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %bpl -; SSE2-NEXT: jb .LBB1_12 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: movl %eax, %ebp -; SSE2-NEXT: .LBB1_12: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %r14b -; SSE2-NEXT: jb .LBB1_14 -; SSE2-NEXT: # %bb.13: -; SSE2-NEXT: movl %eax, %r14d -; SSE2-NEXT: .LBB1_14: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %r15b -; SSE2-NEXT: jb .LBB1_16 -; SSE2-NEXT: # %bb.15: -; SSE2-NEXT: movl %eax, %r15d -; SSE2-NEXT: .LBB1_16: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %r9b -; SSE2-NEXT: jb .LBB1_18 -; SSE2-NEXT: # %bb.17: -; SSE2-NEXT: movl %eax, %r9d -; SSE2-NEXT: .LBB1_18: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %r12b -; SSE2-NEXT: jb .LBB1_20 -; SSE2-NEXT: # %bb.19: -; SSE2-NEXT: movl %eax, %r12d -; SSE2-NEXT: .LBB1_20: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %r13b -; SSE2-NEXT: jb .LBB1_22 -; SSE2-NEXT: # %bb.21: -; SSE2-NEXT: movl %eax, %r13d -; SSE2-NEXT: .LBB1_22: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %dl -; SSE2-NEXT: jb .LBB1_24 -; SSE2-NEXT: # %bb.23: -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB1_24: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB1_26 -; SSE2-NEXT: # %bb.25: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB1_26: -; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB1_28 -; SSE2-NEXT: # %bb.27: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB1_28: -; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb $-1, %r10b -; SSE2-NEXT: jb .LBB1_30 -; SSE2-NEXT: # %bb.29: -; SSE2-NEXT: movl %ecx, %r10d -; SSE2-NEXT: .LBB1_30: -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %r9b -; SSE2-NEXT: jb .LBB1_32 -; SSE2-NEXT: # %bb.31: -; SSE2-NEXT: movl %eax, %r9d -; SSE2-NEXT: .LBB1_32: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: jb .LBB1_34 -; SSE2-NEXT: # %bb.33: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB1_34: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %dl -; SSE2-NEXT: jb .LBB1_36 -; SSE2-NEXT: # %bb.35: -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB1_36: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB1_38 -; SSE2-NEXT: # %bb.37: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB1_38: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB1_40 -; SSE2-NEXT: # %bb.39: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB1_40: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB1_42 -; SSE2-NEXT: # %bb.41: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB1_42: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB1_44 -; SSE2-NEXT: # %bb.43: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB1_44: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB1_46 -; SSE2-NEXT: # %bb.45: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB1_46: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB1_48 -; SSE2-NEXT: # %bb.47: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB1_48: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB1_50 -; SSE2-NEXT: # %bb.49: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB1_50: -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %dl -; SSE2-NEXT: jb .LBB1_52 -; SSE2-NEXT: # %bb.51: -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB1_52: -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movzbl %sil, %edx -; SSE2-NEXT: movzbl %dil, %eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %r8b, %esi -; SSE2-NEXT: movzbl %bl, %edi -; SSE2-NEXT: movzbl %r11b, %ebx -; SSE2-NEXT: movzbl %bpl, %eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %r14b, %ebp -; SSE2-NEXT: movzbl %r15b, %r11d -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SSE2-NEXT: movzbl %r12b, %r14d -; SSE2-NEXT: movzbl %r13b, %r15d -; SSE2-NEXT: movzbl %cl, %r12d -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %cl -; 
SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movzbl %r10b, %r10d -; SSE2-NEXT: movzbl %r9b, %r9d -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB1_54 -; SSE2-NEXT: # %bb.53: -; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB1_54: -; SSE2-NEXT: movd %edx, %xmm2 -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload -; SSE2-NEXT: # xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: movd %esi, %xmm3 -; SSE2-NEXT: movd %edi, %xmm11 -; SSE2-NEXT: movd %ebx, %xmm5 -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload -; SSE2-NEXT: # xmm9 = mem[0],zero,zero,zero -; SSE2-NEXT: movd %ebp, %xmm7 -; SSE2-NEXT: movd %r11d, %xmm1 -; SSE2-NEXT: movd %r8d, %xmm12 -; SSE2-NEXT: movd %r14d, %xmm10 -; SSE2-NEXT: movd %r15d, %xmm13 -; SSE2-NEXT: movd %r12d, %xmm4 -; SSE2-NEXT: movd %r13d, %xmm14 -; SSE2-NEXT: movd %eax, %xmm6 -; SSE2-NEXT: movd %r10d, %xmm15 -; SSE2-NEXT: movd %r9d, %xmm0 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %bl -; SSE2-NEXT: jb .LBB1_56 -; SSE2-NEXT: # %bb.55: -; SSE2-NEXT: movl %eax, %ebx -; SSE2-NEXT: .LBB1_56: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3],xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3],xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSE2-NEXT: movd %edx, %xmm7 -; SSE2-NEXT: movd %esi, %xmm12 -; SSE2-NEXT: movd %edi, %xmm13 -; SSE2-NEXT: movd %r8d, %xmm5 -; SSE2-NEXT: movd %ebp, %xmm14 -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: movd %r11d, %xmm15 -; SSE2-NEXT: movd %r9d, %xmm3 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 
1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movzbl %bl, %edi -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movb $-1, %al -; SSE2-NEXT: jb .LBB1_58 -; SSE2-NEXT: # %bb.57: -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: .LBB1_58: -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; SSE2-NEXT: movd %ecx, %xmm8 -; SSE2-NEXT: movd %edx, %xmm7 -; SSE2-NEXT: movd %esi, %xmm9 -; SSE2-NEXT: movd %edi, %xmm6 -; SSE2-NEXT: movzbl %al, %eax -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB1_60 -; SSE2-NEXT: # %bb.59: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB1_60: -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB1_62 -; SSE2-NEXT: # %bb.61: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB1_62: -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movzbl %cl, %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB1_64 -; SSE2-NEXT: # %bb.63: -; 
SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB1_64: -; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v32i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %sil -; SSSE3-NEXT: jb .LBB1_2 -; SSSE3-NEXT: # %bb.1: -; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: .LBB1_2: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %dil -; SSSE3-NEXT: jb .LBB1_4 -; SSSE3-NEXT: # %bb.3: -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB1_4: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %r8b -; SSSE3-NEXT: jb .LBB1_6 -; SSSE3-NEXT: # %bb.5: -; SSSE3-NEXT: movl %eax, %r8d -; SSSE3-NEXT: .LBB1_6: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %bl -; SSSE3-NEXT: jb .LBB1_8 -; SSSE3-NEXT: # %bb.7: -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB1_8: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %r11b -; SSSE3-NEXT: jb .LBB1_10 -; SSSE3-NEXT: # %bb.9: -; SSSE3-NEXT: movl %eax, %r11d -; SSSE3-NEXT: .LBB1_10: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %bpl -; SSSE3-NEXT: jb .LBB1_12 -; SSSE3-NEXT: # %bb.11: -; SSSE3-NEXT: movl %eax, %ebp -; SSSE3-NEXT: .LBB1_12: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %r14b -; SSSE3-NEXT: jb .LBB1_14 -; SSSE3-NEXT: # %bb.13: -; SSSE3-NEXT: movl %eax, %r14d -; SSSE3-NEXT: .LBB1_14: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %r15b -; SSSE3-NEXT: jb .LBB1_16 -; SSSE3-NEXT: # %bb.15: -; SSSE3-NEXT: movl %eax, %r15d -; SSSE3-NEXT: .LBB1_16: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %r9b -; SSSE3-NEXT: jb .LBB1_18 -; SSSE3-NEXT: # %bb.17: -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB1_18: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %r12b -; SSSE3-NEXT: jb .LBB1_20 -; SSSE3-NEXT: # %bb.19: -; SSSE3-NEXT: movl %eax, %r12d -; SSSE3-NEXT: .LBB1_20: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %r13b -; SSSE3-NEXT: jb .LBB1_22 -; SSSE3-NEXT: # %bb.21: -; SSSE3-NEXT: movl %eax, %r13d -; SSSE3-NEXT: .LBB1_22: -; SSSE3-NEXT: movb 
-{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %dl -; SSSE3-NEXT: jb .LBB1_24 -; SSSE3-NEXT: # %bb.23: -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB1_24: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB1_26 -; SSSE3-NEXT: # %bb.25: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB1_26: -; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB1_28 -; SSSE3-NEXT: # %bb.27: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB1_28: -; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb $-1, %r10b -; SSSE3-NEXT: jb .LBB1_30 -; SSSE3-NEXT: # %bb.29: -; SSSE3-NEXT: movl %ecx, %r10d -; SSSE3-NEXT: .LBB1_30: -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %r9b -; SSSE3-NEXT: jb .LBB1_32 -; SSSE3-NEXT: # %bb.31: -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB1_32: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: jb .LBB1_34 -; SSSE3-NEXT: # %bb.33: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB1_34: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %dl -; SSSE3-NEXT: jb .LBB1_36 -; SSSE3-NEXT: # %bb.35: -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB1_36: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB1_38 -; SSSE3-NEXT: # %bb.37: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB1_38: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB1_40 -; SSSE3-NEXT: # %bb.39: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB1_40: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB1_42 -; SSSE3-NEXT: # %bb.41: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB1_42: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB1_44 -; SSSE3-NEXT: # %bb.43: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB1_44: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB1_46 -; SSSE3-NEXT: # %bb.45: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB1_46: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB1_48 -; SSSE3-NEXT: # 
%bb.47: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB1_48: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB1_50 -; SSSE3-NEXT: # %bb.49: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB1_50: -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %dl -; SSSE3-NEXT: jb .LBB1_52 -; SSSE3-NEXT: # %bb.51: -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB1_52: -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movzbl %sil, %edx -; SSSE3-NEXT: movzbl %dil, %eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %r8b, %esi -; SSSE3-NEXT: movzbl %bl, %edi -; SSSE3-NEXT: movzbl %r11b, %ebx -; SSSE3-NEXT: movzbl %bpl, %eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %r14b, %ebp -; SSSE3-NEXT: movzbl %r15b, %r11d -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl %r12b, %r14d -; SSSE3-NEXT: movzbl %r13b, %r15d -; SSSE3-NEXT: movzbl %cl, %r12d -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movzbl %r10b, %r10d -; SSSE3-NEXT: movzbl %r9b, %r9d -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB1_54 -; SSSE3-NEXT: # %bb.53: -; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB1_54: -; SSSE3-NEXT: movd %edx, %xmm2 -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm8 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd %esi, %xmm3 -; SSSE3-NEXT: movd %edi, %xmm11 -; SSSE3-NEXT: movd %ebx, %xmm5 -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm9 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd %ebp, %xmm7 -; SSSE3-NEXT: movd %r11d, %xmm1 -; SSSE3-NEXT: movd %r8d, %xmm12 -; SSSE3-NEXT: movd %r14d, %xmm10 -; SSSE3-NEXT: movd %r15d, %xmm13 -; SSSE3-NEXT: movd %r12d, %xmm4 -; SSSE3-NEXT: movd %r13d, %xmm14 -; SSSE3-NEXT: movd %eax, %xmm6 -; SSSE3-NEXT: movd %r10d, %xmm15 -; SSSE3-NEXT: movd %r9d, %xmm0 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %bl -; SSSE3-NEXT: jb .LBB1_56 -; SSSE3-NEXT: # %bb.55: -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB1_56: -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] 
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3],xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3],xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSSE3-NEXT: movd %edx, %xmm7 -; SSSE3-NEXT: movd %esi, %xmm12 -; SSSE3-NEXT: movd %edi, %xmm13 -; SSSE3-NEXT: movd %r8d, %xmm5 -; SSSE3-NEXT: movd %ebp, %xmm14 -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: movd %r11d, %xmm15 -; SSSE3-NEXT: movd %r9d, %xmm3 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSSE3-NEXT: movzbl %bl, %edi -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movb $-1, %al -; SSSE3-NEXT: jb .LBB1_58 -; SSSE3-NEXT: # %bb.57: -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: .LBB1_58: -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; SSSE3-NEXT: movd %ecx, %xmm8 -; SSSE3-NEXT: movd %edx, %xmm7 -; SSSE3-NEXT: movd %esi, %xmm9 -; SSSE3-NEXT: movd %edi, %xmm6 -; SSSE3-NEXT: movzbl %al, %eax -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB1_60 -; SSSE3-NEXT: # %bb.59: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB1_60: -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm11[0],xmm1[1],xmm11[1] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] -; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: movzbl %cl, %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB1_62 -; SSSE3-NEXT: # %bb.61: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB1_62: -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSSE3-NEXT: movzbl %cl, %ecx -; SSSE3-NEXT: movd %ecx, %xmm4 -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB1_64 -; SSSE3-NEXT: # %bb.63: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB1_64: -; SSSE3-NEXT: movzbl %cl, %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v32i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pushq %rbp -; SSE41-NEXT: pushq %r15 -; SSE41-NEXT: pushq %r14 -; SSE41-NEXT: pushq %r13 -; SSE41-NEXT: pushq %r12 -; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: pextrb $1, %xmm2, %ecx -; SSE41-NEXT: pextrb $1, %xmm0, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: movb $-1, %bl -; SSE41-NEXT: jb .LBB1_2 -; SSE41-NEXT: # %bb.1: -; SSE41-NEXT: movl %eax, %ebx -; SSE41-NEXT: .LBB1_2: -; SSE41-NEXT: pextrb $0, %xmm2, %ecx -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: movb $-1, %cl -; SSE41-NEXT: jb .LBB1_4 -; SSE41-NEXT: # %bb.3: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB1_4: -; SSE41-NEXT: pextrb $2, %xmm2, %edx -; SSE41-NEXT: pextrb $2, %xmm0, %eax -; SSE41-NEXT: addb %dl, %al -; SSE41-NEXT: movb $-1, %sil -; SSE41-NEXT: jb .LBB1_6 -; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: movl %eax, %esi -; SSE41-NEXT: .LBB1_6: -; SSE41-NEXT: pextrb $3, %xmm2, %eax -; SSE41-NEXT: pextrb $3, %xmm0, %edx -; SSE41-NEXT: addb %al, %dl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB1_8 -; SSE41-NEXT: # %bb.7: -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: .LBB1_8: -; SSE41-NEXT: pextrb $4, %xmm2, %edx -; SSE41-NEXT: pextrb $4, %xmm0, %edi -; 
SSE41-NEXT: addb %dl, %dil -; SSE41-NEXT: movb $-1, %r10b -; SSE41-NEXT: jb .LBB1_10 -; SSE41-NEXT: # %bb.9: -; SSE41-NEXT: movl %edi, %r10d -; SSE41-NEXT: .LBB1_10: -; SSE41-NEXT: pextrb $5, %xmm2, %edi -; SSE41-NEXT: pextrb $5, %xmm0, %ebp -; SSE41-NEXT: addb %dil, %bpl -; SSE41-NEXT: movb $-1, %r9b -; SSE41-NEXT: jb .LBB1_12 -; SSE41-NEXT: # %bb.11: -; SSE41-NEXT: movl %ebp, %r9d -; SSE41-NEXT: .LBB1_12: -; SSE41-NEXT: pextrb $6, %xmm2, %edi -; SSE41-NEXT: pextrb $6, %xmm0, %ebp -; SSE41-NEXT: addb %dil, %bpl -; SSE41-NEXT: movb $-1, %r13b -; SSE41-NEXT: jb .LBB1_14 -; SSE41-NEXT: # %bb.13: -; SSE41-NEXT: movl %ebp, %r13d -; SSE41-NEXT: .LBB1_14: -; SSE41-NEXT: pextrb $7, %xmm2, %edi -; SSE41-NEXT: pextrb $7, %xmm0, %ebp -; SSE41-NEXT: addb %dil, %bpl -; SSE41-NEXT: movb $-1, %r12b -; SSE41-NEXT: jb .LBB1_16 -; SSE41-NEXT: # %bb.15: -; SSE41-NEXT: movl %ebp, %r12d -; SSE41-NEXT: .LBB1_16: -; SSE41-NEXT: pextrb $8, %xmm2, %edi -; SSE41-NEXT: pextrb $8, %xmm0, %ebp -; SSE41-NEXT: addb %dil, %bpl -; SSE41-NEXT: movb $-1, %r15b -; SSE41-NEXT: jb .LBB1_18 -; SSE41-NEXT: # %bb.17: -; SSE41-NEXT: movl %ebp, %r15d -; SSE41-NEXT: .LBB1_18: -; SSE41-NEXT: pextrb $9, %xmm2, %edi -; SSE41-NEXT: pextrb $9, %xmm0, %ebp -; SSE41-NEXT: addb %dil, %bpl -; SSE41-NEXT: movb $-1, %r14b -; SSE41-NEXT: jb .LBB1_20 -; SSE41-NEXT: # %bb.19: -; SSE41-NEXT: movl %ebp, %r14d -; SSE41-NEXT: .LBB1_20: -; SSE41-NEXT: pextrb $10, %xmm2, %edi -; SSE41-NEXT: pextrb $10, %xmm0, %ebp -; SSE41-NEXT: addb %dil, %bpl -; SSE41-NEXT: movb $-1, %r8b -; SSE41-NEXT: jb .LBB1_22 -; SSE41-NEXT: # %bb.21: -; SSE41-NEXT: movl %ebp, %r8d -; SSE41-NEXT: .LBB1_22: -; SSE41-NEXT: pextrb $11, %xmm2, %edi -; SSE41-NEXT: pextrb $11, %xmm0, %ebp -; SSE41-NEXT: addb %dil, %bpl -; SSE41-NEXT: movb $-1, %r11b -; SSE41-NEXT: jb .LBB1_24 -; SSE41-NEXT: # %bb.23: -; SSE41-NEXT: movl %ebp, %r11d -; SSE41-NEXT: .LBB1_24: -; SSE41-NEXT: pextrb $12, %xmm2, %edi -; SSE41-NEXT: pextrb $12, %xmm0, %ebp -; SSE41-NEXT: addb %dil, %bpl -; SSE41-NEXT: movb $-1, %dl -; SSE41-NEXT: jb .LBB1_26 -; SSE41-NEXT: # %bb.25: -; SSE41-NEXT: movl %ebp, %edx -; SSE41-NEXT: .LBB1_26: -; SSE41-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE41-NEXT: pextrb $13, %xmm2, %edi -; SSE41-NEXT: pextrb $13, %xmm0, %ebp -; SSE41-NEXT: addb %dil, %bpl -; SSE41-NEXT: movb $-1, %dl -; SSE41-NEXT: jb .LBB1_28 -; SSE41-NEXT: # %bb.27: -; SSE41-NEXT: movl %ebp, %edx -; SSE41-NEXT: .LBB1_28: -; SSE41-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE41-NEXT: pextrb $14, %xmm2, %edi -; SSE41-NEXT: pextrb $14, %xmm0, %ebp -; SSE41-NEXT: addb %dil, %bpl -; SSE41-NEXT: movb $-1, %dl -; SSE41-NEXT: jb .LBB1_30 -; SSE41-NEXT: # %bb.29: -; SSE41-NEXT: movl %ebp, %edx -; SSE41-NEXT: .LBB1_30: -; SSE41-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pextrb $15, %xmm2, %edi -; SSE41-NEXT: pextrb $15, %xmm0, %ebp -; SSE41-NEXT: addb %dil, %bpl -; SSE41-NEXT: movb $-1, %dil -; SSE41-NEXT: jb .LBB1_32 -; SSE41-NEXT: # %bb.31: -; SSE41-NEXT: movl %ebp, %edi -; SSE41-NEXT: .LBB1_32: -; SSE41-NEXT: movzbl %bl, %ebx -; SSE41-NEXT: movd %ecx, %xmm0 -; SSE41-NEXT: pextrb $1, %xmm3, %ecx -; SSE41-NEXT: pextrb $1, %xmm1, %ebp -; SSE41-NEXT: addb %cl, %bpl -; SSE41-NEXT: movb $-1, %cl -; SSE41-NEXT: jb .LBB1_34 -; SSE41-NEXT: # %bb.33: -; SSE41-NEXT: movl %ebp, %ecx -; SSE41-NEXT: .LBB1_34: -; SSE41-NEXT: pinsrb $1, %ebx, %xmm0 -; SSE41-NEXT: movzbl %sil, %ebx -; SSE41-NEXT: movzbl %cl, %esi -; SSE41-NEXT: pextrb $0, %xmm3, %ecx 
-; SSE41-NEXT: pextrb $0, %xmm1, %ebp -; SSE41-NEXT: addb %cl, %bpl -; SSE41-NEXT: movb $-1, %cl -; SSE41-NEXT: jb .LBB1_36 -; SSE41-NEXT: # %bb.35: -; SSE41-NEXT: movl %ebp, %ecx -; SSE41-NEXT: .LBB1_36: -; SSE41-NEXT: pinsrb $2, %ebx, %xmm0 -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: movd %ecx, %xmm2 -; SSE41-NEXT: pinsrb $1, %esi, %xmm2 -; SSE41-NEXT: pextrb $2, %xmm3, %ecx -; SSE41-NEXT: pextrb $2, %xmm1, %esi -; SSE41-NEXT: addb %cl, %sil -; SSE41-NEXT: movb $-1, %cl -; SSE41-NEXT: jb .LBB1_38 -; SSE41-NEXT: # %bb.37: -; SSE41-NEXT: movl %esi, %ecx -; SSE41-NEXT: .LBB1_38: -; SSE41-NEXT: pinsrb $3, %eax, %xmm0 -; SSE41-NEXT: movzbl %r10b, %eax -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $2, %ecx, %xmm2 -; SSE41-NEXT: pextrb $3, %xmm3, %ecx -; SSE41-NEXT: pextrb $3, %xmm1, %edx -; SSE41-NEXT: addb %cl, %dl -; SSE41-NEXT: movb $-1, %cl -; SSE41-NEXT: jb .LBB1_40 -; SSE41-NEXT: # %bb.39: -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB1_40: -; SSE41-NEXT: pinsrb $4, %eax, %xmm0 -; SSE41-NEXT: movzbl %r9b, %eax -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $3, %ecx, %xmm2 -; SSE41-NEXT: pextrb $4, %xmm3, %ecx -; SSE41-NEXT: pextrb $4, %xmm1, %edx -; SSE41-NEXT: addb %cl, %dl -; SSE41-NEXT: movb $-1, %cl -; SSE41-NEXT: jb .LBB1_42 -; SSE41-NEXT: # %bb.41: -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB1_42: -; SSE41-NEXT: pinsrb $5, %eax, %xmm0 -; SSE41-NEXT: movzbl %r13b, %eax -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $4, %ecx, %xmm2 -; SSE41-NEXT: pextrb $5, %xmm3, %ecx -; SSE41-NEXT: pextrb $5, %xmm1, %edx -; SSE41-NEXT: addb %cl, %dl -; SSE41-NEXT: movb $-1, %cl -; SSE41-NEXT: jb .LBB1_44 -; SSE41-NEXT: # %bb.43: -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB1_44: -; SSE41-NEXT: pinsrb $6, %eax, %xmm0 -; SSE41-NEXT: movzbl %r12b, %eax -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $5, %ecx, %xmm2 -; SSE41-NEXT: pextrb $6, %xmm3, %ecx -; SSE41-NEXT: pextrb $6, %xmm1, %edx -; SSE41-NEXT: addb %cl, %dl -; SSE41-NEXT: movb $-1, %cl -; SSE41-NEXT: jb .LBB1_46 -; SSE41-NEXT: # %bb.45: -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB1_46: -; SSE41-NEXT: pinsrb $7, %eax, %xmm0 -; SSE41-NEXT: movzbl %r15b, %eax -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $6, %ecx, %xmm2 -; SSE41-NEXT: pextrb $7, %xmm3, %ecx -; SSE41-NEXT: pextrb $7, %xmm1, %edx -; SSE41-NEXT: addb %cl, %dl -; SSE41-NEXT: movb $-1, %cl -; SSE41-NEXT: jb .LBB1_48 -; SSE41-NEXT: # %bb.47: -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB1_48: -; SSE41-NEXT: pinsrb $8, %eax, %xmm0 -; SSE41-NEXT: movzbl %r14b, %eax -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $7, %ecx, %xmm2 -; SSE41-NEXT: pextrb $8, %xmm3, %ecx -; SSE41-NEXT: pextrb $8, %xmm1, %edx -; SSE41-NEXT: addb %cl, %dl -; SSE41-NEXT: movb $-1, %cl -; SSE41-NEXT: jb .LBB1_50 -; SSE41-NEXT: # %bb.49: -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB1_50: -; SSE41-NEXT: pinsrb $9, %eax, %xmm0 -; SSE41-NEXT: movzbl %r8b, %eax -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $8, %ecx, %xmm2 -; SSE41-NEXT: pextrb $9, %xmm3, %ecx -; SSE41-NEXT: pextrb $9, %xmm1, %edx -; SSE41-NEXT: addb %cl, %dl -; SSE41-NEXT: movb $-1, %cl -; SSE41-NEXT: jb .LBB1_52 -; SSE41-NEXT: # %bb.51: -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB1_52: -; SSE41-NEXT: pinsrb $10, %eax, %xmm0 -; SSE41-NEXT: movzbl %r11b, %eax -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $9, %ecx, %xmm2 -; SSE41-NEXT: pextrb $10, %xmm3, %ecx -; SSE41-NEXT: pextrb $10, %xmm1, %edx -; SSE41-NEXT: addb %cl, 
%dl
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB1_54
-; SSE41-NEXT: # %bb.53:
-; SSE41-NEXT: movl %edx, %ecx
-; SSE41-NEXT: .LBB1_54:
-; SSE41-NEXT: pinsrb $11, %eax, %xmm0
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: movzbl %cl, %ecx
-; SSE41-NEXT: pinsrb $10, %ecx, %xmm2
-; SSE41-NEXT: pextrb $11, %xmm3, %ecx
-; SSE41-NEXT: pextrb $11, %xmm1, %edx
-; SSE41-NEXT: addb %cl, %dl
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB1_56
-; SSE41-NEXT: # %bb.55:
-; SSE41-NEXT: movl %edx, %ecx
-; SSE41-NEXT: .LBB1_56:
-; SSE41-NEXT: pinsrb $12, %eax, %xmm0
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: movzbl %cl, %ecx
-; SSE41-NEXT: pinsrb $11, %ecx, %xmm2
-; SSE41-NEXT: pextrb $12, %xmm3, %ecx
-; SSE41-NEXT: pextrb $12, %xmm1, %edx
-; SSE41-NEXT: addb %cl, %dl
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB1_58
-; SSE41-NEXT: # %bb.57:
-; SSE41-NEXT: movl %edx, %ecx
-; SSE41-NEXT: .LBB1_58:
-; SSE41-NEXT: pinsrb $13, %eax, %xmm0
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: movzbl %cl, %ecx
-; SSE41-NEXT: pinsrb $12, %ecx, %xmm2
-; SSE41-NEXT: pextrb $13, %xmm3, %ecx
-; SSE41-NEXT: pextrb $13, %xmm1, %edx
-; SSE41-NEXT: addb %cl, %dl
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB1_60
-; SSE41-NEXT: # %bb.59:
-; SSE41-NEXT: movl %edx, %ecx
-; SSE41-NEXT: .LBB1_60:
-; SSE41-NEXT: pinsrb $14, %eax, %xmm0
-; SSE41-NEXT: movzbl %dil, %eax
-; SSE41-NEXT: movzbl %cl, %ecx
-; SSE41-NEXT: pinsrb $13, %ecx, %xmm2
-; SSE41-NEXT: pextrb $14, %xmm3, %edx
-; SSE41-NEXT: pextrb $14, %xmm1, %ecx
-; SSE41-NEXT: addb %dl, %cl
-; SSE41-NEXT: movb $-1, %dl
-; SSE41-NEXT: jb .LBB1_62
-; SSE41-NEXT: # %bb.61:
-; SSE41-NEXT: movl %ecx, %edx
-; SSE41-NEXT: .LBB1_62:
-; SSE41-NEXT: pinsrb $15, %eax, %xmm0
-; SSE41-NEXT: movzbl %dl, %eax
-; SSE41-NEXT: pinsrb $14, %eax, %xmm2
-; SSE41-NEXT: pextrb $15, %xmm3, %ecx
-; SSE41-NEXT: pextrb $15, %xmm1, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB1_64
-; SSE41-NEXT: # %bb.63:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB1_64:
-; SSE41-NEXT: movzbl %cl, %eax
-; SSE41-NEXT: pinsrb $15, %eax, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: popq %rbx
-; SSE41-NEXT: popq %r12
-; SSE41-NEXT: popq %r13
-; SSE41-NEXT: popq %r14
-; SSE41-NEXT: popq %r15
-; SSE41-NEXT: popq %rbp
-; SSE41-NEXT: retq
+; SSE-LABEL: v32i8:
+; SSE: # %bb.0:
+; SSE-NEXT: paddusb %xmm2, %xmm0
+; SSE-NEXT: paddusb %xmm3, %xmm1
+; SSE-NEXT: retq
;
; AVX1-LABEL: v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: pushq %rbx
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpextrb $1, %xmm2, %ecx
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpextrb $1, %xmm3, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %bl
-; AVX1-NEXT: jb .LBB1_2
-; AVX1-NEXT: # %bb.1:
-; AVX1-NEXT: movl %eax, %ebx
-; AVX1-NEXT: .LBB1_2:
-; AVX1-NEXT: vpextrb $0, %xmm2, %ecx
-; AVX1-NEXT: vpextrb $0, %xmm3, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB1_4
-; AVX1-NEXT: # %bb.3:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB1_4:
-; AVX1-NEXT: vpextrb $2, %xmm2, %edx
-; AVX1-NEXT: vpextrb $2, %xmm3, %eax
-; AVX1-NEXT: addb %dl, %al
-; AVX1-NEXT: movb $-1, %sil
-; AVX1-NEXT: jb .LBB1_6
-; AVX1-NEXT: # %bb.5:
-; AVX1-NEXT: movl %eax, %esi
-; AVX1-NEXT: .LBB1_6:
-;
AVX1-NEXT: vpextrb $3, %xmm2, %eax -; AVX1-NEXT: vpextrb $3, %xmm3, %edx -; AVX1-NEXT: addb %al, %dl -; AVX1-NEXT: movb $-1, %al -; AVX1-NEXT: jb .LBB1_8 -; AVX1-NEXT: # %bb.7: -; AVX1-NEXT: movl %edx, %eax -; AVX1-NEXT: .LBB1_8: -; AVX1-NEXT: vpextrb $4, %xmm2, %edx -; AVX1-NEXT: vpextrb $4, %xmm3, %edi -; AVX1-NEXT: addb %dl, %dil -; AVX1-NEXT: movb $-1, %r9b -; AVX1-NEXT: jb .LBB1_10 -; AVX1-NEXT: # %bb.9: -; AVX1-NEXT: movl %edi, %r9d -; AVX1-NEXT: .LBB1_10: -; AVX1-NEXT: vpextrb $5, %xmm2, %edx -; AVX1-NEXT: vpextrb $5, %xmm3, %ebp -; AVX1-NEXT: addb %dl, %bpl -; AVX1-NEXT: movb $-1, %dil -; AVX1-NEXT: jb .LBB1_12 -; AVX1-NEXT: # %bb.11: -; AVX1-NEXT: movl %ebp, %edi -; AVX1-NEXT: .LBB1_12: -; AVX1-NEXT: vpextrb $6, %xmm2, %edx -; AVX1-NEXT: vpextrb $6, %xmm3, %ebp -; AVX1-NEXT: addb %dl, %bpl -; AVX1-NEXT: movb $-1, %r13b -; AVX1-NEXT: jb .LBB1_14 -; AVX1-NEXT: # %bb.13: -; AVX1-NEXT: movl %ebp, %r13d -; AVX1-NEXT: .LBB1_14: -; AVX1-NEXT: vpextrb $7, %xmm2, %edx -; AVX1-NEXT: vpextrb $7, %xmm3, %ebp -; AVX1-NEXT: addb %dl, %bpl -; AVX1-NEXT: movb $-1, %r12b -; AVX1-NEXT: jb .LBB1_16 -; AVX1-NEXT: # %bb.15: -; AVX1-NEXT: movl %ebp, %r12d -; AVX1-NEXT: .LBB1_16: -; AVX1-NEXT: vpextrb $8, %xmm2, %edx -; AVX1-NEXT: vpextrb $8, %xmm3, %ebp -; AVX1-NEXT: addb %dl, %bpl -; AVX1-NEXT: movb $-1, %r15b -; AVX1-NEXT: jb .LBB1_18 -; AVX1-NEXT: # %bb.17: -; AVX1-NEXT: movl %ebp, %r15d -; AVX1-NEXT: .LBB1_18: -; AVX1-NEXT: vpextrb $9, %xmm2, %edx -; AVX1-NEXT: vpextrb $9, %xmm3, %ebp -; AVX1-NEXT: addb %dl, %bpl -; AVX1-NEXT: movb $-1, %r14b -; AVX1-NEXT: jb .LBB1_20 -; AVX1-NEXT: # %bb.19: -; AVX1-NEXT: movl %ebp, %r14d -; AVX1-NEXT: .LBB1_20: -; AVX1-NEXT: vpextrb $10, %xmm2, %edx -; AVX1-NEXT: vpextrb $10, %xmm3, %ebp -; AVX1-NEXT: addb %dl, %bpl -; AVX1-NEXT: movb $-1, %r8b -; AVX1-NEXT: jb .LBB1_22 -; AVX1-NEXT: # %bb.21: -; AVX1-NEXT: movl %ebp, %r8d -; AVX1-NEXT: .LBB1_22: -; AVX1-NEXT: vpextrb $11, %xmm2, %edx -; AVX1-NEXT: vpextrb $11, %xmm3, %ebp -; AVX1-NEXT: addb %dl, %bpl -; AVX1-NEXT: movb $-1, %r11b -; AVX1-NEXT: jb .LBB1_24 -; AVX1-NEXT: # %bb.23: -; AVX1-NEXT: movl %ebp, %r11d -; AVX1-NEXT: .LBB1_24: -; AVX1-NEXT: vpextrb $12, %xmm2, %edx -; AVX1-NEXT: vpextrb $12, %xmm3, %ebp -; AVX1-NEXT: addb %dl, %bpl -; AVX1-NEXT: movb $-1, %r10b -; AVX1-NEXT: jb .LBB1_26 -; AVX1-NEXT: # %bb.25: -; AVX1-NEXT: movl %ebp, %r10d -; AVX1-NEXT: .LBB1_26: -; AVX1-NEXT: vpextrb $13, %xmm2, %edx -; AVX1-NEXT: vpextrb $13, %xmm3, %ebp -; AVX1-NEXT: addb %dl, %bpl -; AVX1-NEXT: movb $-1, %dl -; AVX1-NEXT: jb .LBB1_28 -; AVX1-NEXT: # %bb.27: -; AVX1-NEXT: movl %ebp, %edx -; AVX1-NEXT: .LBB1_28: -; AVX1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; AVX1-NEXT: vpextrb $14, %xmm2, %edx -; AVX1-NEXT: vpextrb $14, %xmm3, %ebp -; AVX1-NEXT: addb %dl, %bpl -; AVX1-NEXT: movb $-1, %dl -; AVX1-NEXT: jb .LBB1_30 -; AVX1-NEXT: # %bb.29: -; AVX1-NEXT: movl %ebp, %edx -; AVX1-NEXT: .LBB1_30: -; AVX1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; AVX1-NEXT: movzbl %cl, %ecx -; AVX1-NEXT: vpextrb $15, %xmm2, %edx -; AVX1-NEXT: vpextrb $15, %xmm3, %ebp -; AVX1-NEXT: addb %dl, %bpl -; AVX1-NEXT: movb $-1, %dl -; AVX1-NEXT: jb .LBB1_32 -; AVX1-NEXT: # %bb.31: -; AVX1-NEXT: movl %ebp, %edx -; AVX1-NEXT: .LBB1_32: -; AVX1-NEXT: movl %edx, %ebp -; AVX1-NEXT: movzbl %bl, %ebx -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: vpextrb $1, %xmm1, %edx -; AVX1-NEXT: vpextrb $1, %xmm0, %ecx -; AVX1-NEXT: addb %dl, %cl -; AVX1-NEXT: movb $-1, %dl -; AVX1-NEXT: jb .LBB1_34 -; AVX1-NEXT: # %bb.33: -; AVX1-NEXT: 
movl %ecx, %edx -; AVX1-NEXT: .LBB1_34: -; AVX1-NEXT: vpinsrb $1, %ebx, %xmm2, %xmm2 -; AVX1-NEXT: movzbl %sil, %ecx -; AVX1-NEXT: movzbl %dl, %esi -; AVX1-NEXT: vpextrb $0, %xmm1, %edx -; AVX1-NEXT: vpextrb $0, %xmm0, %ebx -; AVX1-NEXT: addb %dl, %bl -; AVX1-NEXT: movb $-1, %dl -; AVX1-NEXT: jb .LBB1_36 -; AVX1-NEXT: # %bb.35: -; AVX1-NEXT: movl %ebx, %edx -; AVX1-NEXT: .LBB1_36: -; AVX1-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: movzbl %al, %eax -; AVX1-NEXT: movzbl %dl, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm3 -; AVX1-NEXT: vpinsrb $1, %esi, %xmm3, %xmm3 -; AVX1-NEXT: vpextrb $2, %xmm1, %edx -; AVX1-NEXT: vpextrb $2, %xmm0, %ecx -; AVX1-NEXT: addb %dl, %cl -; AVX1-NEXT: movb $-1, %dl -; AVX1-NEXT: jb .LBB1_38 -; AVX1-NEXT: # %bb.37: -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: .LBB1_38: -; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl %r9b, %eax -; AVX1-NEXT: movzbl %dl, %ecx -; AVX1-NEXT: vpinsrb $2, %ecx, %xmm3, %xmm3 -; AVX1-NEXT: vpextrb $3, %xmm1, %edx -; AVX1-NEXT: vpextrb $3, %xmm0, %ecx -; AVX1-NEXT: addb %dl, %cl -; AVX1-NEXT: movb $-1, %dl -; AVX1-NEXT: jb .LBB1_40 -; AVX1-NEXT: # %bb.39: -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: .LBB1_40: -; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl %dil, %eax -; AVX1-NEXT: movzbl %dl, %ecx -; AVX1-NEXT: vpinsrb $3, %ecx, %xmm3, %xmm3 -; AVX1-NEXT: vpextrb $4, %xmm1, %edx -; AVX1-NEXT: vpextrb $4, %xmm0, %ecx -; AVX1-NEXT: addb %dl, %cl -; AVX1-NEXT: movb $-1, %dl -; AVX1-NEXT: jb .LBB1_42 -; AVX1-NEXT: # %bb.41: -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: .LBB1_42: -; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl %r13b, %eax -; AVX1-NEXT: movzbl %dl, %ecx -; AVX1-NEXT: vpinsrb $4, %ecx, %xmm3, %xmm3 -; AVX1-NEXT: vpextrb $5, %xmm1, %edx -; AVX1-NEXT: vpextrb $5, %xmm0, %ecx -; AVX1-NEXT: addb %dl, %cl -; AVX1-NEXT: movb $-1, %dl -; AVX1-NEXT: jb .LBB1_44 -; AVX1-NEXT: # %bb.43: -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: .LBB1_44: -; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl %r12b, %eax -; AVX1-NEXT: movzbl %dl, %ecx -; AVX1-NEXT: vpinsrb $5, %ecx, %xmm3, %xmm3 -; AVX1-NEXT: vpextrb $6, %xmm1, %edx -; AVX1-NEXT: vpextrb $6, %xmm0, %ecx -; AVX1-NEXT: addb %dl, %cl -; AVX1-NEXT: movb $-1, %dl -; AVX1-NEXT: jb .LBB1_46 -; AVX1-NEXT: # %bb.45: -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: .LBB1_46: -; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl %r15b, %eax -; AVX1-NEXT: movzbl %dl, %ecx -; AVX1-NEXT: vpinsrb $6, %ecx, %xmm3, %xmm3 -; AVX1-NEXT: vpextrb $7, %xmm1, %edx -; AVX1-NEXT: vpextrb $7, %xmm0, %ecx -; AVX1-NEXT: addb %dl, %cl -; AVX1-NEXT: movb $-1, %dl -; AVX1-NEXT: jb .LBB1_48 -; AVX1-NEXT: # %bb.47: -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: .LBB1_48: -; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl %r14b, %eax -; AVX1-NEXT: movzbl %dl, %ecx -; AVX1-NEXT: vpinsrb $7, %ecx, %xmm3, %xmm3 -; AVX1-NEXT: vpextrb $8, %xmm1, %edx -; AVX1-NEXT: vpextrb $8, %xmm0, %ecx -; AVX1-NEXT: addb %dl, %cl -; AVX1-NEXT: movb $-1, %dl -; AVX1-NEXT: jb .LBB1_50 -; AVX1-NEXT: # %bb.49: -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: .LBB1_50: -; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl %r8b, %eax -; AVX1-NEXT: movzbl %dl, %ecx -; AVX1-NEXT: vpinsrb $8, %ecx, %xmm3, %xmm3 -; AVX1-NEXT: vpextrb $9, %xmm1, %edx -; AVX1-NEXT: vpextrb $9, %xmm0, %ecx -; AVX1-NEXT: addb %dl, %cl -; AVX1-NEXT: movb $-1, %dl -; AVX1-NEXT: jb .LBB1_52 -; AVX1-NEXT: # %bb.51: -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: .LBB1_52: -; AVX1-NEXT: vpinsrb 
$10, %eax, %xmm2, %xmm2
-; AVX1-NEXT: movzbl %r11b, %eax
-; AVX1-NEXT: movzbl %dl, %ecx
-; AVX1-NEXT: vpinsrb $9, %ecx, %xmm3, %xmm3
-; AVX1-NEXT: vpextrb $10, %xmm1, %edx
-; AVX1-NEXT: vpextrb $10, %xmm0, %ecx
-; AVX1-NEXT: addb %dl, %cl
-; AVX1-NEXT: movb $-1, %dl
-; AVX1-NEXT: jb .LBB1_54
-; AVX1-NEXT: # %bb.53:
-; AVX1-NEXT: movl %ecx, %edx
-; AVX1-NEXT: .LBB1_54:
-; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX1-NEXT: movzbl %r10b, %eax
-; AVX1-NEXT: movzbl %dl, %ecx
-; AVX1-NEXT: vpinsrb $10, %ecx, %xmm3, %xmm3
-; AVX1-NEXT: vpextrb $11, %xmm1, %edx
-; AVX1-NEXT: vpextrb $11, %xmm0, %ecx
-; AVX1-NEXT: addb %dl, %cl
-; AVX1-NEXT: movb $-1, %dl
-; AVX1-NEXT: jb .LBB1_56
-; AVX1-NEXT: # %bb.55:
-; AVX1-NEXT: movl %ecx, %edx
-; AVX1-NEXT: .LBB1_56:
-; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: movzbl %dl, %ecx
-; AVX1-NEXT: vpinsrb $11, %ecx, %xmm3, %xmm3
-; AVX1-NEXT: vpextrb $12, %xmm1, %edx
-; AVX1-NEXT: vpextrb $12, %xmm0, %ecx
-; AVX1-NEXT: addb %dl, %cl
-; AVX1-NEXT: movb $-1, %dl
-; AVX1-NEXT: jb .LBB1_58
-; AVX1-NEXT: # %bb.57:
-; AVX1-NEXT: movl %ecx, %edx
-; AVX1-NEXT: .LBB1_58:
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: movzbl %dl, %ecx
-; AVX1-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3
-; AVX1-NEXT: vpextrb $13, %xmm1, %edx
-; AVX1-NEXT: vpextrb $13, %xmm0, %ecx
-; AVX1-NEXT: addb %dl, %cl
-; AVX1-NEXT: movb $-1, %dl
-; AVX1-NEXT: jb .LBB1_60
-; AVX1-NEXT: # %bb.59:
-; AVX1-NEXT: movl %ecx, %edx
-; AVX1-NEXT: .LBB1_60:
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX1-NEXT: movzbl %bpl, %eax
-; AVX1-NEXT: movzbl %dl, %ecx
-; AVX1-NEXT: vpinsrb $13, %ecx, %xmm3, %xmm3
-; AVX1-NEXT: vpextrb $14, %xmm1, %edx
-; AVX1-NEXT: vpextrb $14, %xmm0, %ecx
-; AVX1-NEXT: addb %dl, %cl
-; AVX1-NEXT: movb $-1, %dl
-; AVX1-NEXT: jb .LBB1_62
-; AVX1-NEXT: # %bb.61:
-; AVX1-NEXT: movl %ecx, %edx
-; AVX1-NEXT: .LBB1_62:
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX1-NEXT: movzbl %dl, %eax
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX1-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX1-NEXT: vpextrb $15, %xmm0, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB1_64
-; AVX1-NEXT: # %bb.63:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB1_64:
-; AVX1-NEXT: movzbl %cl, %eax
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0
+; AVX1-NEXT: vpaddusb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddusb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: popq %rbx
-; AVX1-NEXT: popq %r12
-; AVX1-NEXT: popq %r13
-; AVX1-NEXT: popq %r14
-; AVX1-NEXT: popq %r15
-; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
; AVX2-LABEL: v32i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpextrb $1, %xmm2, %ecx
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpextrb $1, %xmm3, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %bl
-; AVX2-NEXT: jb .LBB1_2
-; AVX2-NEXT: # %bb.1:
-; AVX2-NEXT: movl %eax, %ebx
-; AVX2-NEXT: .LBB1_2:
-; AVX2-NEXT: vpextrb $0, %xmm2, %ecx
-; AVX2-NEXT: vpextrb $0, %xmm3, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB1_4
-; AVX2-NEXT: # %bb.3:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB1_4:
-; AVX2-NEXT: vpextrb $2, %xmm2, %edx
-;
AVX2-NEXT: vpextrb $2, %xmm3, %eax -; AVX2-NEXT: addb %dl, %al -; AVX2-NEXT: movb $-1, %sil -; AVX2-NEXT: jb .LBB1_6 -; AVX2-NEXT: # %bb.5: -; AVX2-NEXT: movl %eax, %esi -; AVX2-NEXT: .LBB1_6: -; AVX2-NEXT: vpextrb $3, %xmm2, %eax -; AVX2-NEXT: vpextrb $3, %xmm3, %edx -; AVX2-NEXT: addb %al, %dl -; AVX2-NEXT: movb $-1, %al -; AVX2-NEXT: jb .LBB1_8 -; AVX2-NEXT: # %bb.7: -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: .LBB1_8: -; AVX2-NEXT: vpextrb $4, %xmm2, %edx -; AVX2-NEXT: vpextrb $4, %xmm3, %edi -; AVX2-NEXT: addb %dl, %dil -; AVX2-NEXT: movb $-1, %r9b -; AVX2-NEXT: jb .LBB1_10 -; AVX2-NEXT: # %bb.9: -; AVX2-NEXT: movl %edi, %r9d -; AVX2-NEXT: .LBB1_10: -; AVX2-NEXT: vpextrb $5, %xmm2, %edx -; AVX2-NEXT: vpextrb $5, %xmm3, %ebp -; AVX2-NEXT: addb %dl, %bpl -; AVX2-NEXT: movb $-1, %dil -; AVX2-NEXT: jb .LBB1_12 -; AVX2-NEXT: # %bb.11: -; AVX2-NEXT: movl %ebp, %edi -; AVX2-NEXT: .LBB1_12: -; AVX2-NEXT: vpextrb $6, %xmm2, %edx -; AVX2-NEXT: vpextrb $6, %xmm3, %ebp -; AVX2-NEXT: addb %dl, %bpl -; AVX2-NEXT: movb $-1, %r13b -; AVX2-NEXT: jb .LBB1_14 -; AVX2-NEXT: # %bb.13: -; AVX2-NEXT: movl %ebp, %r13d -; AVX2-NEXT: .LBB1_14: -; AVX2-NEXT: vpextrb $7, %xmm2, %edx -; AVX2-NEXT: vpextrb $7, %xmm3, %ebp -; AVX2-NEXT: addb %dl, %bpl -; AVX2-NEXT: movb $-1, %r12b -; AVX2-NEXT: jb .LBB1_16 -; AVX2-NEXT: # %bb.15: -; AVX2-NEXT: movl %ebp, %r12d -; AVX2-NEXT: .LBB1_16: -; AVX2-NEXT: vpextrb $8, %xmm2, %edx -; AVX2-NEXT: vpextrb $8, %xmm3, %ebp -; AVX2-NEXT: addb %dl, %bpl -; AVX2-NEXT: movb $-1, %r15b -; AVX2-NEXT: jb .LBB1_18 -; AVX2-NEXT: # %bb.17: -; AVX2-NEXT: movl %ebp, %r15d -; AVX2-NEXT: .LBB1_18: -; AVX2-NEXT: vpextrb $9, %xmm2, %edx -; AVX2-NEXT: vpextrb $9, %xmm3, %ebp -; AVX2-NEXT: addb %dl, %bpl -; AVX2-NEXT: movb $-1, %r14b -; AVX2-NEXT: jb .LBB1_20 -; AVX2-NEXT: # %bb.19: -; AVX2-NEXT: movl %ebp, %r14d -; AVX2-NEXT: .LBB1_20: -; AVX2-NEXT: vpextrb $10, %xmm2, %edx -; AVX2-NEXT: vpextrb $10, %xmm3, %ebp -; AVX2-NEXT: addb %dl, %bpl -; AVX2-NEXT: movb $-1, %r8b -; AVX2-NEXT: jb .LBB1_22 -; AVX2-NEXT: # %bb.21: -; AVX2-NEXT: movl %ebp, %r8d -; AVX2-NEXT: .LBB1_22: -; AVX2-NEXT: vpextrb $11, %xmm2, %edx -; AVX2-NEXT: vpextrb $11, %xmm3, %ebp -; AVX2-NEXT: addb %dl, %bpl -; AVX2-NEXT: movb $-1, %r11b -; AVX2-NEXT: jb .LBB1_24 -; AVX2-NEXT: # %bb.23: -; AVX2-NEXT: movl %ebp, %r11d -; AVX2-NEXT: .LBB1_24: -; AVX2-NEXT: vpextrb $12, %xmm2, %edx -; AVX2-NEXT: vpextrb $12, %xmm3, %ebp -; AVX2-NEXT: addb %dl, %bpl -; AVX2-NEXT: movb $-1, %r10b -; AVX2-NEXT: jb .LBB1_26 -; AVX2-NEXT: # %bb.25: -; AVX2-NEXT: movl %ebp, %r10d -; AVX2-NEXT: .LBB1_26: -; AVX2-NEXT: vpextrb $13, %xmm2, %edx -; AVX2-NEXT: vpextrb $13, %xmm3, %ebp -; AVX2-NEXT: addb %dl, %bpl -; AVX2-NEXT: movb $-1, %dl -; AVX2-NEXT: jb .LBB1_28 -; AVX2-NEXT: # %bb.27: -; AVX2-NEXT: movl %ebp, %edx -; AVX2-NEXT: .LBB1_28: -; AVX2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; AVX2-NEXT: vpextrb $14, %xmm2, %edx -; AVX2-NEXT: vpextrb $14, %xmm3, %ebp -; AVX2-NEXT: addb %dl, %bpl -; AVX2-NEXT: movb $-1, %dl -; AVX2-NEXT: jb .LBB1_30 -; AVX2-NEXT: # %bb.29: -; AVX2-NEXT: movl %ebp, %edx -; AVX2-NEXT: .LBB1_30: -; AVX2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; AVX2-NEXT: movzbl %cl, %ecx -; AVX2-NEXT: vpextrb $15, %xmm2, %edx -; AVX2-NEXT: vpextrb $15, %xmm3, %ebp -; AVX2-NEXT: addb %dl, %bpl -; AVX2-NEXT: movb $-1, %dl -; AVX2-NEXT: jb .LBB1_32 -; AVX2-NEXT: # %bb.31: -; AVX2-NEXT: movl %ebp, %edx -; AVX2-NEXT: .LBB1_32: -; AVX2-NEXT: movl %edx, %ebp -; AVX2-NEXT: movzbl %bl, %ebx -; AVX2-NEXT: vmovd %ecx, %xmm2 
-; AVX2-NEXT: vpextrb $1, %xmm1, %edx -; AVX2-NEXT: vpextrb $1, %xmm0, %ecx -; AVX2-NEXT: addb %dl, %cl -; AVX2-NEXT: movb $-1, %dl -; AVX2-NEXT: jb .LBB1_34 -; AVX2-NEXT: # %bb.33: -; AVX2-NEXT: movl %ecx, %edx -; AVX2-NEXT: .LBB1_34: -; AVX2-NEXT: vpinsrb $1, %ebx, %xmm2, %xmm2 -; AVX2-NEXT: movzbl %sil, %ecx -; AVX2-NEXT: movzbl %dl, %esi -; AVX2-NEXT: vpextrb $0, %xmm1, %edx -; AVX2-NEXT: vpextrb $0, %xmm0, %ebx -; AVX2-NEXT: addb %dl, %bl -; AVX2-NEXT: movb $-1, %dl -; AVX2-NEXT: jb .LBB1_36 -; AVX2-NEXT: # %bb.35: -; AVX2-NEXT: movl %ebx, %edx -; AVX2-NEXT: .LBB1_36: -; AVX2-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: movzbl %dl, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm3 -; AVX2-NEXT: vpinsrb $1, %esi, %xmm3, %xmm3 -; AVX2-NEXT: vpextrb $2, %xmm1, %edx -; AVX2-NEXT: vpextrb $2, %xmm0, %ecx -; AVX2-NEXT: addb %dl, %cl -; AVX2-NEXT: movb $-1, %dl -; AVX2-NEXT: jb .LBB1_38 -; AVX2-NEXT: # %bb.37: -; AVX2-NEXT: movl %ecx, %edx -; AVX2-NEXT: .LBB1_38: -; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX2-NEXT: movzbl %r9b, %eax -; AVX2-NEXT: movzbl %dl, %ecx -; AVX2-NEXT: vpinsrb $2, %ecx, %xmm3, %xmm3 -; AVX2-NEXT: vpextrb $3, %xmm1, %edx -; AVX2-NEXT: vpextrb $3, %xmm0, %ecx -; AVX2-NEXT: addb %dl, %cl -; AVX2-NEXT: movb $-1, %dl -; AVX2-NEXT: jb .LBB1_40 -; AVX2-NEXT: # %bb.39: -; AVX2-NEXT: movl %ecx, %edx -; AVX2-NEXT: .LBB1_40: -; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX2-NEXT: movzbl %dil, %eax -; AVX2-NEXT: movzbl %dl, %ecx -; AVX2-NEXT: vpinsrb $3, %ecx, %xmm3, %xmm3 -; AVX2-NEXT: vpextrb $4, %xmm1, %edx -; AVX2-NEXT: vpextrb $4, %xmm0, %ecx -; AVX2-NEXT: addb %dl, %cl -; AVX2-NEXT: movb $-1, %dl -; AVX2-NEXT: jb .LBB1_42 -; AVX2-NEXT: # %bb.41: -; AVX2-NEXT: movl %ecx, %edx -; AVX2-NEXT: .LBB1_42: -; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX2-NEXT: movzbl %r13b, %eax -; AVX2-NEXT: movzbl %dl, %ecx -; AVX2-NEXT: vpinsrb $4, %ecx, %xmm3, %xmm3 -; AVX2-NEXT: vpextrb $5, %xmm1, %edx -; AVX2-NEXT: vpextrb $5, %xmm0, %ecx -; AVX2-NEXT: addb %dl, %cl -; AVX2-NEXT: movb $-1, %dl -; AVX2-NEXT: jb .LBB1_44 -; AVX2-NEXT: # %bb.43: -; AVX2-NEXT: movl %ecx, %edx -; AVX2-NEXT: .LBB1_44: -; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX2-NEXT: movzbl %r12b, %eax -; AVX2-NEXT: movzbl %dl, %ecx -; AVX2-NEXT: vpinsrb $5, %ecx, %xmm3, %xmm3 -; AVX2-NEXT: vpextrb $6, %xmm1, %edx -; AVX2-NEXT: vpextrb $6, %xmm0, %ecx -; AVX2-NEXT: addb %dl, %cl -; AVX2-NEXT: movb $-1, %dl -; AVX2-NEXT: jb .LBB1_46 -; AVX2-NEXT: # %bb.45: -; AVX2-NEXT: movl %ecx, %edx -; AVX2-NEXT: .LBB1_46: -; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX2-NEXT: movzbl %r15b, %eax -; AVX2-NEXT: movzbl %dl, %ecx -; AVX2-NEXT: vpinsrb $6, %ecx, %xmm3, %xmm3 -; AVX2-NEXT: vpextrb $7, %xmm1, %edx -; AVX2-NEXT: vpextrb $7, %xmm0, %ecx -; AVX2-NEXT: addb %dl, %cl -; AVX2-NEXT: movb $-1, %dl -; AVX2-NEXT: jb .LBB1_48 -; AVX2-NEXT: # %bb.47: -; AVX2-NEXT: movl %ecx, %edx -; AVX2-NEXT: .LBB1_48: -; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX2-NEXT: movzbl %r14b, %eax -; AVX2-NEXT: movzbl %dl, %ecx -; AVX2-NEXT: vpinsrb $7, %ecx, %xmm3, %xmm3 -; AVX2-NEXT: vpextrb $8, %xmm1, %edx -; AVX2-NEXT: vpextrb $8, %xmm0, %ecx -; AVX2-NEXT: addb %dl, %cl -; AVX2-NEXT: movb $-1, %dl -; AVX2-NEXT: jb .LBB1_50 -; AVX2-NEXT: # %bb.49: -; AVX2-NEXT: movl %ecx, %edx -; AVX2-NEXT: .LBB1_50: -; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX2-NEXT: movzbl %r8b, %eax -; AVX2-NEXT: movzbl %dl, %ecx -; AVX2-NEXT: vpinsrb $8, %ecx, %xmm3, %xmm3 -; AVX2-NEXT: vpextrb $9, %xmm1, %edx -; AVX2-NEXT: vpextrb $9, 
%xmm0, %ecx
-; AVX2-NEXT: addb %dl, %cl
-; AVX2-NEXT: movb $-1, %dl
-; AVX2-NEXT: jb .LBB1_52
-; AVX2-NEXT: # %bb.51:
-; AVX2-NEXT: movl %ecx, %edx
-; AVX2-NEXT: .LBB1_52:
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX2-NEXT: movzbl %r11b, %eax
-; AVX2-NEXT: movzbl %dl, %ecx
-; AVX2-NEXT: vpinsrb $9, %ecx, %xmm3, %xmm3
-; AVX2-NEXT: vpextrb $10, %xmm1, %edx
-; AVX2-NEXT: vpextrb $10, %xmm0, %ecx
-; AVX2-NEXT: addb %dl, %cl
-; AVX2-NEXT: movb $-1, %dl
-; AVX2-NEXT: jb .LBB1_54
-; AVX2-NEXT: # %bb.53:
-; AVX2-NEXT: movl %ecx, %edx
-; AVX2-NEXT: .LBB1_54:
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX2-NEXT: movzbl %r10b, %eax
-; AVX2-NEXT: movzbl %dl, %ecx
-; AVX2-NEXT: vpinsrb $10, %ecx, %xmm3, %xmm3
-; AVX2-NEXT: vpextrb $11, %xmm1, %edx
-; AVX2-NEXT: vpextrb $11, %xmm0, %ecx
-; AVX2-NEXT: addb %dl, %cl
-; AVX2-NEXT: movb $-1, %dl
-; AVX2-NEXT: jb .LBB1_56
-; AVX2-NEXT: # %bb.55:
-; AVX2-NEXT: movl %ecx, %edx
-; AVX2-NEXT: .LBB1_56:
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT: movzbl %dl, %ecx
-; AVX2-NEXT: vpinsrb $11, %ecx, %xmm3, %xmm3
-; AVX2-NEXT: vpextrb $12, %xmm1, %edx
-; AVX2-NEXT: vpextrb $12, %xmm0, %ecx
-; AVX2-NEXT: addb %dl, %cl
-; AVX2-NEXT: movb $-1, %dl
-; AVX2-NEXT: jb .LBB1_58
-; AVX2-NEXT: # %bb.57:
-; AVX2-NEXT: movl %ecx, %edx
-; AVX2-NEXT: .LBB1_58:
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT: movzbl %dl, %ecx
-; AVX2-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3
-; AVX2-NEXT: vpextrb $13, %xmm1, %edx
-; AVX2-NEXT: vpextrb $13, %xmm0, %ecx
-; AVX2-NEXT: addb %dl, %cl
-; AVX2-NEXT: movb $-1, %dl
-; AVX2-NEXT: jb .LBB1_60
-; AVX2-NEXT: # %bb.59:
-; AVX2-NEXT: movl %ecx, %edx
-; AVX2-NEXT: .LBB1_60:
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX2-NEXT: movzbl %bpl, %eax
-; AVX2-NEXT: movzbl %dl, %ecx
-; AVX2-NEXT: vpinsrb $13, %ecx, %xmm3, %xmm3
-; AVX2-NEXT: vpextrb $14, %xmm1, %edx
-; AVX2-NEXT: vpextrb $14, %xmm0, %ecx
-; AVX2-NEXT: addb %dl, %cl
-; AVX2-NEXT: movb $-1, %dl
-; AVX2-NEXT: jb .LBB1_62
-; AVX2-NEXT: # %bb.61:
-; AVX2-NEXT: movl %ecx, %edx
-; AVX2-NEXT: .LBB1_62:
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX2-NEXT: movzbl %dl, %eax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX2-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX2-NEXT: vpextrb $15, %xmm0, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB1_64
-; AVX2-NEXT: # %bb.63:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB1_64:
-; AVX2-NEXT: movzbl %cl, %eax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vpaddusb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: v32i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT: vpextrb $1, %xmm2, %ecx
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512-NEXT: vpextrb $1, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %bl
-; AVX512-NEXT: jb .LBB1_2
-; AVX512-NEXT: # %bb.1:
-; AVX512-NEXT: movl %eax, %ebx
-; AVX512-NEXT: .LBB1_2:
-; AVX512-NEXT: vpextrb $0, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $0, %xmm3, %eax
-;
AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: movb $-1, %cl -; AVX512-NEXT: jb .LBB1_4 -; AVX512-NEXT: # %bb.3: -; AVX512-NEXT: movl %eax, %ecx -; AVX512-NEXT: .LBB1_4: -; AVX512-NEXT: vpextrb $2, %xmm2, %edx -; AVX512-NEXT: vpextrb $2, %xmm3, %eax -; AVX512-NEXT: addb %dl, %al -; AVX512-NEXT: movb $-1, %sil -; AVX512-NEXT: jb .LBB1_6 -; AVX512-NEXT: # %bb.5: -; AVX512-NEXT: movl %eax, %esi -; AVX512-NEXT: .LBB1_6: -; AVX512-NEXT: vpextrb $3, %xmm2, %eax -; AVX512-NEXT: vpextrb $3, %xmm3, %edx -; AVX512-NEXT: addb %al, %dl -; AVX512-NEXT: movb $-1, %al -; AVX512-NEXT: jb .LBB1_8 -; AVX512-NEXT: # %bb.7: -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: .LBB1_8: -; AVX512-NEXT: vpextrb $4, %xmm2, %edx -; AVX512-NEXT: vpextrb $4, %xmm3, %edi -; AVX512-NEXT: addb %dl, %dil -; AVX512-NEXT: movb $-1, %r9b -; AVX512-NEXT: jb .LBB1_10 -; AVX512-NEXT: # %bb.9: -; AVX512-NEXT: movl %edi, %r9d -; AVX512-NEXT: .LBB1_10: -; AVX512-NEXT: vpextrb $5, %xmm2, %edx -; AVX512-NEXT: vpextrb $5, %xmm3, %ebp -; AVX512-NEXT: addb %dl, %bpl -; AVX512-NEXT: movb $-1, %dil -; AVX512-NEXT: jb .LBB1_12 -; AVX512-NEXT: # %bb.11: -; AVX512-NEXT: movl %ebp, %edi -; AVX512-NEXT: .LBB1_12: -; AVX512-NEXT: vpextrb $6, %xmm2, %edx -; AVX512-NEXT: vpextrb $6, %xmm3, %ebp -; AVX512-NEXT: addb %dl, %bpl -; AVX512-NEXT: movb $-1, %r13b -; AVX512-NEXT: jb .LBB1_14 -; AVX512-NEXT: # %bb.13: -; AVX512-NEXT: movl %ebp, %r13d -; AVX512-NEXT: .LBB1_14: -; AVX512-NEXT: vpextrb $7, %xmm2, %edx -; AVX512-NEXT: vpextrb $7, %xmm3, %ebp -; AVX512-NEXT: addb %dl, %bpl -; AVX512-NEXT: movb $-1, %r12b -; AVX512-NEXT: jb .LBB1_16 -; AVX512-NEXT: # %bb.15: -; AVX512-NEXT: movl %ebp, %r12d -; AVX512-NEXT: .LBB1_16: -; AVX512-NEXT: vpextrb $8, %xmm2, %edx -; AVX512-NEXT: vpextrb $8, %xmm3, %ebp -; AVX512-NEXT: addb %dl, %bpl -; AVX512-NEXT: movb $-1, %r15b -; AVX512-NEXT: jb .LBB1_18 -; AVX512-NEXT: # %bb.17: -; AVX512-NEXT: movl %ebp, %r15d -; AVX512-NEXT: .LBB1_18: -; AVX512-NEXT: vpextrb $9, %xmm2, %edx -; AVX512-NEXT: vpextrb $9, %xmm3, %ebp -; AVX512-NEXT: addb %dl, %bpl -; AVX512-NEXT: movb $-1, %r14b -; AVX512-NEXT: jb .LBB1_20 -; AVX512-NEXT: # %bb.19: -; AVX512-NEXT: movl %ebp, %r14d -; AVX512-NEXT: .LBB1_20: -; AVX512-NEXT: vpextrb $10, %xmm2, %edx -; AVX512-NEXT: vpextrb $10, %xmm3, %ebp -; AVX512-NEXT: addb %dl, %bpl -; AVX512-NEXT: movb $-1, %r8b -; AVX512-NEXT: jb .LBB1_22 -; AVX512-NEXT: # %bb.21: -; AVX512-NEXT: movl %ebp, %r8d -; AVX512-NEXT: .LBB1_22: -; AVX512-NEXT: vpextrb $11, %xmm2, %edx -; AVX512-NEXT: vpextrb $11, %xmm3, %ebp -; AVX512-NEXT: addb %dl, %bpl -; AVX512-NEXT: movb $-1, %r11b -; AVX512-NEXT: jb .LBB1_24 -; AVX512-NEXT: # %bb.23: -; AVX512-NEXT: movl %ebp, %r11d -; AVX512-NEXT: .LBB1_24: -; AVX512-NEXT: vpextrb $12, %xmm2, %edx -; AVX512-NEXT: vpextrb $12, %xmm3, %ebp -; AVX512-NEXT: addb %dl, %bpl -; AVX512-NEXT: movb $-1, %r10b -; AVX512-NEXT: jb .LBB1_26 -; AVX512-NEXT: # %bb.25: -; AVX512-NEXT: movl %ebp, %r10d -; AVX512-NEXT: .LBB1_26: -; AVX512-NEXT: vpextrb $13, %xmm2, %edx -; AVX512-NEXT: vpextrb $13, %xmm3, %ebp -; AVX512-NEXT: addb %dl, %bpl -; AVX512-NEXT: movb $-1, %dl -; AVX512-NEXT: jb .LBB1_28 -; AVX512-NEXT: # %bb.27: -; AVX512-NEXT: movl %ebp, %edx -; AVX512-NEXT: .LBB1_28: -; AVX512-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; AVX512-NEXT: vpextrb $14, %xmm2, %edx -; AVX512-NEXT: vpextrb $14, %xmm3, %ebp -; AVX512-NEXT: addb %dl, %bpl -; AVX512-NEXT: movb $-1, %dl -; AVX512-NEXT: jb .LBB1_30 -; AVX512-NEXT: # %bb.29: -; AVX512-NEXT: movl %ebp, %edx -; AVX512-NEXT: .LBB1_30: -; 
AVX512-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; AVX512-NEXT: movzbl %cl, %ecx -; AVX512-NEXT: vpextrb $15, %xmm2, %edx -; AVX512-NEXT: vpextrb $15, %xmm3, %ebp -; AVX512-NEXT: addb %dl, %bpl -; AVX512-NEXT: movb $-1, %dl -; AVX512-NEXT: jb .LBB1_32 -; AVX512-NEXT: # %bb.31: -; AVX512-NEXT: movl %ebp, %edx -; AVX512-NEXT: .LBB1_32: -; AVX512-NEXT: movl %edx, %ebp -; AVX512-NEXT: movzbl %bl, %ebx -; AVX512-NEXT: vmovd %ecx, %xmm2 -; AVX512-NEXT: vpextrb $1, %xmm1, %edx -; AVX512-NEXT: vpextrb $1, %xmm0, %ecx -; AVX512-NEXT: addb %dl, %cl -; AVX512-NEXT: movb $-1, %dl -; AVX512-NEXT: jb .LBB1_34 -; AVX512-NEXT: # %bb.33: -; AVX512-NEXT: movl %ecx, %edx -; AVX512-NEXT: .LBB1_34: -; AVX512-NEXT: vpinsrb $1, %ebx, %xmm2, %xmm2 -; AVX512-NEXT: movzbl %sil, %ecx -; AVX512-NEXT: movzbl %dl, %esi -; AVX512-NEXT: vpextrb $0, %xmm1, %edx -; AVX512-NEXT: vpextrb $0, %xmm0, %ebx -; AVX512-NEXT: addb %dl, %bl -; AVX512-NEXT: movb $-1, %dl -; AVX512-NEXT: jb .LBB1_36 -; AVX512-NEXT: # %bb.35: -; AVX512-NEXT: movl %ebx, %edx -; AVX512-NEXT: .LBB1_36: -; AVX512-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 -; AVX512-NEXT: movzbl %al, %eax -; AVX512-NEXT: movzbl %dl, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm3 -; AVX512-NEXT: vpinsrb $1, %esi, %xmm3, %xmm3 -; AVX512-NEXT: vpextrb $2, %xmm1, %edx -; AVX512-NEXT: vpextrb $2, %xmm0, %ecx -; AVX512-NEXT: addb %dl, %cl -; AVX512-NEXT: movb $-1, %dl -; AVX512-NEXT: jb .LBB1_38 -; AVX512-NEXT: # %bb.37: -; AVX512-NEXT: movl %ecx, %edx -; AVX512-NEXT: .LBB1_38: -; AVX512-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl %r9b, %eax -; AVX512-NEXT: movzbl %dl, %ecx -; AVX512-NEXT: vpinsrb $2, %ecx, %xmm3, %xmm3 -; AVX512-NEXT: vpextrb $3, %xmm1, %edx -; AVX512-NEXT: vpextrb $3, %xmm0, %ecx -; AVX512-NEXT: addb %dl, %cl -; AVX512-NEXT: movb $-1, %dl -; AVX512-NEXT: jb .LBB1_40 -; AVX512-NEXT: # %bb.39: -; AVX512-NEXT: movl %ecx, %edx -; AVX512-NEXT: .LBB1_40: -; AVX512-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl %dil, %eax -; AVX512-NEXT: movzbl %dl, %ecx -; AVX512-NEXT: vpinsrb $3, %ecx, %xmm3, %xmm3 -; AVX512-NEXT: vpextrb $4, %xmm1, %edx -; AVX512-NEXT: vpextrb $4, %xmm0, %ecx -; AVX512-NEXT: addb %dl, %cl -; AVX512-NEXT: movb $-1, %dl -; AVX512-NEXT: jb .LBB1_42 -; AVX512-NEXT: # %bb.41: -; AVX512-NEXT: movl %ecx, %edx -; AVX512-NEXT: .LBB1_42: -; AVX512-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl %r13b, %eax -; AVX512-NEXT: movzbl %dl, %ecx -; AVX512-NEXT: vpinsrb $4, %ecx, %xmm3, %xmm3 -; AVX512-NEXT: vpextrb $5, %xmm1, %edx -; AVX512-NEXT: vpextrb $5, %xmm0, %ecx -; AVX512-NEXT: addb %dl, %cl -; AVX512-NEXT: movb $-1, %dl -; AVX512-NEXT: jb .LBB1_44 -; AVX512-NEXT: # %bb.43: -; AVX512-NEXT: movl %ecx, %edx -; AVX512-NEXT: .LBB1_44: -; AVX512-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl %r12b, %eax -; AVX512-NEXT: movzbl %dl, %ecx -; AVX512-NEXT: vpinsrb $5, %ecx, %xmm3, %xmm3 -; AVX512-NEXT: vpextrb $6, %xmm1, %edx -; AVX512-NEXT: vpextrb $6, %xmm0, %ecx -; AVX512-NEXT: addb %dl, %cl -; AVX512-NEXT: movb $-1, %dl -; AVX512-NEXT: jb .LBB1_46 -; AVX512-NEXT: # %bb.45: -; AVX512-NEXT: movl %ecx, %edx -; AVX512-NEXT: .LBB1_46: -; AVX512-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl %r15b, %eax -; AVX512-NEXT: movzbl %dl, %ecx -; AVX512-NEXT: vpinsrb $6, %ecx, %xmm3, %xmm3 -; AVX512-NEXT: vpextrb $7, %xmm1, %edx -; AVX512-NEXT: vpextrb $7, %xmm0, %ecx -; AVX512-NEXT: addb %dl, %cl -; AVX512-NEXT: movb $-1, %dl -; AVX512-NEXT: jb .LBB1_48 -; AVX512-NEXT: # %bb.47: -; AVX512-NEXT: movl %ecx, 
%edx -; AVX512-NEXT: .LBB1_48: -; AVX512-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl %r14b, %eax -; AVX512-NEXT: movzbl %dl, %ecx -; AVX512-NEXT: vpinsrb $7, %ecx, %xmm3, %xmm3 -; AVX512-NEXT: vpextrb $8, %xmm1, %edx -; AVX512-NEXT: vpextrb $8, %xmm0, %ecx -; AVX512-NEXT: addb %dl, %cl -; AVX512-NEXT: movb $-1, %dl -; AVX512-NEXT: jb .LBB1_50 -; AVX512-NEXT: # %bb.49: -; AVX512-NEXT: movl %ecx, %edx -; AVX512-NEXT: .LBB1_50: -; AVX512-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl %r8b, %eax -; AVX512-NEXT: movzbl %dl, %ecx -; AVX512-NEXT: vpinsrb $8, %ecx, %xmm3, %xmm3 -; AVX512-NEXT: vpextrb $9, %xmm1, %edx -; AVX512-NEXT: vpextrb $9, %xmm0, %ecx -; AVX512-NEXT: addb %dl, %cl -; AVX512-NEXT: movb $-1, %dl -; AVX512-NEXT: jb .LBB1_52 -; AVX512-NEXT: # %bb.51: -; AVX512-NEXT: movl %ecx, %edx -; AVX512-NEXT: .LBB1_52: -; AVX512-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl %r11b, %eax -; AVX512-NEXT: movzbl %dl, %ecx -; AVX512-NEXT: vpinsrb $9, %ecx, %xmm3, %xmm3 -; AVX512-NEXT: vpextrb $10, %xmm1, %edx -; AVX512-NEXT: vpextrb $10, %xmm0, %ecx -; AVX512-NEXT: addb %dl, %cl -; AVX512-NEXT: movb $-1, %dl -; AVX512-NEXT: jb .LBB1_54 -; AVX512-NEXT: # %bb.53: -; AVX512-NEXT: movl %ecx, %edx -; AVX512-NEXT: .LBB1_54: -; AVX512-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl %r10b, %eax -; AVX512-NEXT: movzbl %dl, %ecx -; AVX512-NEXT: vpinsrb $10, %ecx, %xmm3, %xmm3 -; AVX512-NEXT: vpextrb $11, %xmm1, %edx -; AVX512-NEXT: vpextrb $11, %xmm0, %ecx -; AVX512-NEXT: addb %dl, %cl -; AVX512-NEXT: movb $-1, %dl -; AVX512-NEXT: jb .LBB1_56 -; AVX512-NEXT: # %bb.55: -; AVX512-NEXT: movl %ecx, %edx -; AVX512-NEXT: .LBB1_56: -; AVX512-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: movzbl %dl, %ecx -; AVX512-NEXT: vpinsrb $11, %ecx, %xmm3, %xmm3 -; AVX512-NEXT: vpextrb $12, %xmm1, %edx -; AVX512-NEXT: vpextrb $12, %xmm0, %ecx -; AVX512-NEXT: addb %dl, %cl -; AVX512-NEXT: movb $-1, %dl -; AVX512-NEXT: jb .LBB1_58 -; AVX512-NEXT: # %bb.57: -; AVX512-NEXT: movl %ecx, %edx -; AVX512-NEXT: .LBB1_58: -; AVX512-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: movzbl %dl, %ecx -; AVX512-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3 -; AVX512-NEXT: vpextrb $13, %xmm1, %edx -; AVX512-NEXT: vpextrb $13, %xmm0, %ecx -; AVX512-NEXT: addb %dl, %cl -; AVX512-NEXT: movb $-1, %dl -; AVX512-NEXT: jb .LBB1_60 -; AVX512-NEXT: # %bb.59: -; AVX512-NEXT: movl %ecx, %edx -; AVX512-NEXT: .LBB1_60: -; AVX512-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl %bpl, %eax -; AVX512-NEXT: movzbl %dl, %ecx -; AVX512-NEXT: vpinsrb $13, %ecx, %xmm3, %xmm3 -; AVX512-NEXT: vpextrb $14, %xmm1, %edx -; AVX512-NEXT: vpextrb $14, %xmm0, %ecx -; AVX512-NEXT: addb %dl, %cl -; AVX512-NEXT: movb $-1, %dl -; AVX512-NEXT: jb .LBB1_62 -; AVX512-NEXT: # %bb.61: -; AVX512-NEXT: movl %ecx, %edx -; AVX512-NEXT: .LBB1_62: -; AVX512-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl %dl, %eax -; AVX512-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 -; AVX512-NEXT: vpextrb $15, %xmm1, %ecx -; AVX512-NEXT: vpextrb $15, %xmm0, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: movb $-1, %cl -; AVX512-NEXT: jb .LBB1_64 -; AVX512-NEXT: # %bb.63: -; AVX512-NEXT: movl %eax, %ecx -; AVX512-NEXT: .LBB1_64: -; AVX512-NEXT: movzbl %cl, %eax -; AVX512-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; 
AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vpaddusb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%z = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %x, <32 x i8> %y)
ret <32 x i8> %z
}

define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
-; SSE2-LABEL: v64i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pushq %rbp
-; SSE2-NEXT: pushq %r15
-; SSE2-NEXT: pushq %r14
-; SSE2-NEXT: pushq %r13
-; SSE2-NEXT: pushq %r12
-; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: subq $648, %rsp # imm = 0x288
-; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm5, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm6, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm7, (%rsp)
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: movb $-1, %cl
-; SSE2-NEXT: jb .LBB2_2
-; SSE2-NEXT: # %bb.1:
-; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: .LBB2_2:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: movb $-1, %dil
-; SSE2-NEXT: jb .LBB2_4
-; SSE2-NEXT: # %bb.3:
-; SSE2-NEXT: movl %eax, %edi
-; SSE2-NEXT: .LBB2_4:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: movb $-1, %r8b
-; SSE2-NEXT: jb .LBB2_6
-; SSE2-NEXT: # %bb.5:
-; SSE2-NEXT: movl %eax, %r8d
-; SSE2-NEXT: .LBB2_6:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: movb $-1, %r9b
-; SSE2-NEXT: jb .LBB2_8
-; SSE2-NEXT: # %bb.7:
-; SSE2-NEXT: movl %eax, %r9d
-; SSE2-NEXT: .LBB2_8:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: movb $-1, %r10b
-; SSE2-NEXT: jb .LBB2_10
-; SSE2-NEXT: # %bb.9:
-; SSE2-NEXT: movl %eax, %r10d
-; SSE2-NEXT: .LBB2_10:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: movb $-1, %r11b
-; SSE2-NEXT: jb .LBB2_12
-; SSE2-NEXT: # %bb.11:
-; SSE2-NEXT: movl %eax, %r11d
-; SSE2-NEXT: .LBB2_12:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: movb $-1, %bpl
-; SSE2-NEXT: jb .LBB2_14
-; SSE2-NEXT: # %bb.13:
-; SSE2-NEXT: movl %eax, %ebp
-; SSE2-NEXT: .LBB2_14:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: movb $-1, %r14b
-; SSE2-NEXT: jb .LBB2_16
-; SSE2-NEXT: # %bb.15:
-; SSE2-NEXT: movl %eax, %r14d
-; SSE2-NEXT: .LBB2_16:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: movb $-1, %dl
-; SSE2-NEXT: jb .LBB2_18
-; SSE2-NEXT: # %bb.17:
-; SSE2-NEXT: movl %eax, %edx
-; SSE2-NEXT: .LBB2_18:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: movb $-1, %r15b
-; SSE2-NEXT: jb .LBB2_20
-; SSE2-NEXT: # %bb.19:
-; SSE2-NEXT: movl %eax, %r15d
-; SSE2-NEXT: .LBB2_20:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: movb $-1, %r12b
-; SSE2-NEXT: jb .LBB2_22
-; SSE2-NEXT: # %bb.21:
-; SSE2-NEXT: movl %eax, %r12d
-; SSE2-NEXT: .LBB2_22:
-; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: movb $-1, %dl
-; SSE2-NEXT: jb .LBB2_24
-;
SSE2-NEXT: # %bb.23: -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB2_24: -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %dl -; SSE2-NEXT: jb .LBB2_26 -; SSE2-NEXT: # %bb.25: -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB2_26: -; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %sil -; SSE2-NEXT: jb .LBB2_28 -; SSE2-NEXT: # %bb.27: -; SSE2-NEXT: movl %eax, %esi -; SSE2-NEXT: .LBB2_28: -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_30 -; SSE2-NEXT: # %bb.29: -; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_30: -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %dl -; SSE2-NEXT: jb .LBB2_32 -; SSE2-NEXT: # %bb.31: -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB2_32: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB2_34 -; SSE2-NEXT: # %bb.33: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB2_34: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_36 -; SSE2-NEXT: # %bb.35: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_36: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_38 -; SSE2-NEXT: # %bb.37: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_38: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_40 -; SSE2-NEXT: # %bb.39: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_40: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_42 -; SSE2-NEXT: # %bb.41: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_42: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_44 -; SSE2-NEXT: # %bb.43: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_44: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_46 -; SSE2-NEXT: # %bb.45: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_46: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_48 -; SSE2-NEXT: # %bb.47: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_48: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb 
-{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_50 -; SSE2-NEXT: # %bb.49: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_50: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_52 -; SSE2-NEXT: # %bb.51: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_52: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_54 -; SSE2-NEXT: # %bb.53: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_54: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_56 -; SSE2-NEXT: # %bb.55: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_56: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_58 -; SSE2-NEXT: # %bb.57: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_58: -; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB2_60 -; SSE2-NEXT: # %bb.59: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB2_60: -; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_62 -; SSE2-NEXT: # %bb.61: -; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_62: -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %dl -; SSE2-NEXT: jb .LBB2_64 -; SSE2-NEXT: # %bb.63: -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB2_64: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB2_66 -; SSE2-NEXT: # %bb.65: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB2_66: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_68 -; SSE2-NEXT: # %bb.67: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_68: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_70 -; SSE2-NEXT: # %bb.69: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_70: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_72 -; SSE2-NEXT: # %bb.71: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_72: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 
1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_74 -; SSE2-NEXT: # %bb.73: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_74: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_76 -; SSE2-NEXT: # %bb.75: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_76: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_78 -; SSE2-NEXT: # %bb.77: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_78: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_80 -; SSE2-NEXT: # %bb.79: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_80: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_82 -; SSE2-NEXT: # %bb.81: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_82: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_84 -; SSE2-NEXT: # %bb.83: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_84: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_86 -; SSE2-NEXT: # %bb.85: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_86: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %r13b -; SSE2-NEXT: jb .LBB2_88 -; SSE2-NEXT: # %bb.87: -; SSE2-NEXT: movl %eax, %r13d -; SSE2-NEXT: .LBB2_88: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %bl -; SSE2-NEXT: jb .LBB2_90 -; SSE2-NEXT: # %bb.89: -; SSE2-NEXT: movl %eax, %ebx -; SSE2-NEXT: .LBB2_90: -; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB2_92 -; SSE2-NEXT: # %bb.91: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB2_92: -; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_94 -; SSE2-NEXT: # %bb.93: -; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_94: -; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB2_96 -; SSE2-NEXT: # %bb.95: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB2_96: -; SSE2-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: movl %esi, %r13d -; SSE2-NEXT: jb .LBB2_98 -; 
SSE2-NEXT: # %bb.97: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_98: -; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %sil -; SSE2-NEXT: jb .LBB2_100 -; SSE2-NEXT: # %bb.99: -; SSE2-NEXT: movl %eax, %esi -; SSE2-NEXT: .LBB2_100: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB2_102 -; SSE2-NEXT: # %bb.101: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB2_102: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %bl -; SSE2-NEXT: jb .LBB2_104 -; SSE2-NEXT: # %bb.103: -; SSE2-NEXT: movl %eax, %ebx -; SSE2-NEXT: .LBB2_104: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_106 -; SSE2-NEXT: # %bb.105: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_106: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_108 -; SSE2-NEXT: # %bb.107: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_108: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_110 -; SSE2-NEXT: # %bb.109: -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_110: -; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB2_112 -; SSE2-NEXT: # %bb.111: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB2_112: -; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB2_114 -; SSE2-NEXT: # %bb.113: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB2_114: -; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB2_116 -; SSE2-NEXT: # %bb.115: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB2_116: -; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %dil, %eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %r8b, %eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %r9b, %edi -; SSE2-NEXT: movzbl %r10b, %r8d -; SSE2-NEXT: movzbl %r11b, %r9d -; SSE2-NEXT: movzbl %bpl, %r10d -; SSE2-NEXT: movzbl %r14b, %r11d -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SSE2-NEXT: movzbl %r15b, %ebp -; SSE2-NEXT: movzbl %r12b, %r14d -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; 
SSE2-NEXT: movzbl %r13b, %r13d -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %dl, %edx -; SSE2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSE2-NEXT: jb .LBB2_118 -; SSE2-NEXT: # %bb.117: -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_118: -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd %edi, %xmm3 -; SSE2-NEXT: movd %r8d, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd %r9d, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd %r10d, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd %r11d, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd %ebx, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd %ebp, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd %r14d, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd %r15d, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd %r12d, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd %r13d, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; 
SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SSE2-NEXT: movb $-1, %bl -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload -; SSE2-NEXT: # xmm11 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload -; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload -; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload -; SSE2-NEXT: # xmm15 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload -; SSE2-NEXT: # xmm9 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload -; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload -; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload -; SSE2-NEXT: # xmm14 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 4-byte Folded Reload -; SSE2-NEXT: # xmm5 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload -; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload -; SSE2-NEXT: # xmm6 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload -; SSE2-NEXT: # xmm13 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload -; SSE2-NEXT: # xmm10 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload -; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload -; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload -; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload -; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload -; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload -; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload -; SSE2-NEXT: # xmm12 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload -; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload -; SSE2-NEXT: # xmm2 = 
mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload -; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload -; SSE2-NEXT: # xmm7 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload -; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload -; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload -; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload -; SSE2-NEXT: # xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload -; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload -; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload -; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload -; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: jb .LBB2_120 -; SSE2-NEXT: # %bb.119: -; SSE2-NEXT: movl %ecx, %ebx -; SSE2-NEXT: .LBB2_120: -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE2-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE2-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE2-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1],xmm15[2],mem[2],xmm15[3],mem[3],xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE2-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3],xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE2-NEXT: # xmm12 = 
xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3],xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE2-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3],xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE2-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE2-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE2-NEXT: movd %esi, %xmm3 -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd %edx, %xmm10 -; SSE2-NEXT: movd %edi, %xmm3 -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd %ebp, %xmm9 -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd %r10d, %xmm11 -; SSE2-NEXT: movd %r9d, %xmm3 -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd %r8d, %xmm4 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movzbl %bl, %ecx -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movb $-1, %al -; SSE2-NEXT: jb .LBB2_122 -; SSE2-NEXT: # %bb.121: -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: .LBB2_122: -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte 
Reload -; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE2-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1],xmm15[2],mem[2],xmm15[3],mem[3] -; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE2-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE2-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3] -; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE2-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] -; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE2-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] -; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE2-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE2-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3],xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE2-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3],xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE2-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE2-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE2-NEXT: movd %edi, %xmm3 -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd %edx, %xmm3 -; SSE2-NEXT: movd %esi, %xmm5 -; SSE2-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd %ecx, %xmm5 -; SSE2-NEXT: movzbl %al, %eax -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB2_124 -; SSE2-NEXT: # %bb.123: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB2_124: -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE2-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1] -; SSE2-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm13[0],xmm1[1],xmm13[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3] -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE2-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3],xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; SSE2-NEXT: movd %eax, %xmm8 -; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm6 -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB2_126 -; SSE2-NEXT: # %bb.125: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB2_126: -; SSE2-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: # xmm0 = xmm0[0],mem[0] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm14[0] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE2-NEXT: movzbl %cl, %ecx -; SSE2-NEXT: movd %ecx, %xmm7 -; SSE2-NEXT: addb (%rsp), %al -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB2_128 -; SSE2-NEXT: # %bb.127: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB2_128: -; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; SSE2-NEXT: addq $648, %rsp # imm = 0x288 -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v64i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: subq $648, %rsp # imm = 0x288 -; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm5, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm6, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm7, (%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB2_2 -; SSSE3-NEXT: # %bb.1: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB2_2: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; 
SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %dil -; SSSE3-NEXT: jb .LBB2_4 -; SSSE3-NEXT: # %bb.3: -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB2_4: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %r8b -; SSSE3-NEXT: jb .LBB2_6 -; SSSE3-NEXT: # %bb.5: -; SSSE3-NEXT: movl %eax, %r8d -; SSSE3-NEXT: .LBB2_6: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %r9b -; SSSE3-NEXT: jb .LBB2_8 -; SSSE3-NEXT: # %bb.7: -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB2_8: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %r10b -; SSSE3-NEXT: jb .LBB2_10 -; SSSE3-NEXT: # %bb.9: -; SSSE3-NEXT: movl %eax, %r10d -; SSSE3-NEXT: .LBB2_10: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %r11b -; SSSE3-NEXT: jb .LBB2_12 -; SSSE3-NEXT: # %bb.11: -; SSSE3-NEXT: movl %eax, %r11d -; SSSE3-NEXT: .LBB2_12: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %bpl -; SSSE3-NEXT: jb .LBB2_14 -; SSSE3-NEXT: # %bb.13: -; SSSE3-NEXT: movl %eax, %ebp -; SSSE3-NEXT: .LBB2_14: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %r14b -; SSSE3-NEXT: jb .LBB2_16 -; SSSE3-NEXT: # %bb.15: -; SSSE3-NEXT: movl %eax, %r14d -; SSSE3-NEXT: .LBB2_16: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %dl -; SSSE3-NEXT: jb .LBB2_18 -; SSSE3-NEXT: # %bb.17: -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB2_18: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %r15b -; SSSE3-NEXT: jb .LBB2_20 -; SSSE3-NEXT: # %bb.19: -; SSSE3-NEXT: movl %eax, %r15d -; SSSE3-NEXT: .LBB2_20: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %r12b -; SSSE3-NEXT: jb .LBB2_22 -; SSSE3-NEXT: # %bb.21: -; SSSE3-NEXT: movl %eax, %r12d -; SSSE3-NEXT: .LBB2_22: -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %dl -; SSSE3-NEXT: jb .LBB2_24 -; SSSE3-NEXT: # %bb.23: -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB2_24: -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %dl -; SSSE3-NEXT: jb .LBB2_26 -; SSSE3-NEXT: # %bb.25: -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB2_26: -; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %sil -; SSSE3-NEXT: jb .LBB2_28 -; SSSE3-NEXT: # %bb.27: -; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: .LBB2_28: -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_30 -; SSSE3-NEXT: # %bb.29: -; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_30: -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %dl -; SSSE3-NEXT: 
jb .LBB2_32 -; SSSE3-NEXT: # %bb.31: -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB2_32: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB2_34 -; SSSE3-NEXT: # %bb.33: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB2_34: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_36 -; SSSE3-NEXT: # %bb.35: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_36: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_38 -; SSSE3-NEXT: # %bb.37: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_38: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_40 -; SSSE3-NEXT: # %bb.39: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_40: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_42 -; SSSE3-NEXT: # %bb.41: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_42: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_44 -; SSSE3-NEXT: # %bb.43: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_44: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_46 -; SSSE3-NEXT: # %bb.45: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_46: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_48 -; SSSE3-NEXT: # %bb.47: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_48: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_50 -; SSSE3-NEXT: # %bb.49: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_50: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_52 -; SSSE3-NEXT: # %bb.51: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_52: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_54 -; SSSE3-NEXT: # %bb.53: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_54: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_56 -; SSSE3-NEXT: # %bb.55: -; SSSE3-NEXT: movb %al, 
{{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_56: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_58 -; SSSE3-NEXT: # %bb.57: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_58: -; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB2_60 -; SSSE3-NEXT: # %bb.59: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB2_60: -; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_62 -; SSSE3-NEXT: # %bb.61: -; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_62: -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %dl -; SSSE3-NEXT: jb .LBB2_64 -; SSSE3-NEXT: # %bb.63: -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB2_64: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB2_66 -; SSSE3-NEXT: # %bb.65: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB2_66: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_68 -; SSSE3-NEXT: # %bb.67: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_68: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_70 -; SSSE3-NEXT: # %bb.69: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_70: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_72 -; SSSE3-NEXT: # %bb.71: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_72: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_74 -; SSSE3-NEXT: # %bb.73: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_74: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_76 -; SSSE3-NEXT: # %bb.75: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_76: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_78 -; SSSE3-NEXT: # %bb.77: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_78: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_80 -; SSSE3-NEXT: # %bb.79: -; SSSE3-NEXT: 
movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_80: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_82 -; SSSE3-NEXT: # %bb.81: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_82: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_84 -; SSSE3-NEXT: # %bb.83: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_84: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_86 -; SSSE3-NEXT: # %bb.85: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_86: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %r13b -; SSSE3-NEXT: jb .LBB2_88 -; SSSE3-NEXT: # %bb.87: -; SSSE3-NEXT: movl %eax, %r13d -; SSSE3-NEXT: .LBB2_88: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %bl -; SSSE3-NEXT: jb .LBB2_90 -; SSSE3-NEXT: # %bb.89: -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB2_90: -; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB2_92 -; SSSE3-NEXT: # %bb.91: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB2_92: -; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_94 -; SSSE3-NEXT: # %bb.93: -; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_94: -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB2_96 -; SSSE3-NEXT: # %bb.95: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB2_96: -; SSSE3-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SSSE3-NEXT: movl %esi, %r13d -; SSSE3-NEXT: jb .LBB2_98 -; SSSE3-NEXT: # %bb.97: -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_98: -; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %sil -; SSSE3-NEXT: jb .LBB2_100 -; SSSE3-NEXT: # %bb.99: -; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: .LBB2_100: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB2_102 -; SSSE3-NEXT: # %bb.101: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB2_102: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %bl -; SSSE3-NEXT: jb .LBB2_104 -; SSSE3-NEXT: # %bb.103: -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB2_104: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; 
SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %al
-; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
-; SSSE3-NEXT: jb .LBB2_106
-; SSSE3-NEXT: # %bb.105:
-; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSSE3-NEXT: .LBB2_106:
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %al
-; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
-; SSSE3-NEXT: jb .LBB2_108
-; SSSE3-NEXT: # %bb.107:
-; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSSE3-NEXT: .LBB2_108:
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %al
-; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
-; SSSE3-NEXT: jb .LBB2_110
-; SSSE3-NEXT: # %bb.109:
-; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSSE3-NEXT: .LBB2_110:
-; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %al
-; SSSE3-NEXT: movb $-1, %cl
-; SSSE3-NEXT: jb .LBB2_112
-; SSSE3-NEXT: # %bb.111:
-; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: .LBB2_112:
-; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %al
-; SSSE3-NEXT: movb $-1, %cl
-; SSSE3-NEXT: jb .LBB2_114
-; SSSE3-NEXT: # %bb.113:
-; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: .LBB2_114:
-; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %al
-; SSSE3-NEXT: movb $-1, %cl
-; SSSE3-NEXT: jb .LBB2_116
-; SSSE3-NEXT: # %bb.115:
-; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: .LBB2_116:
-; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl %dil, %eax
-; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl %r8b, %eax
-; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl %r9b, %edi
-; SSSE3-NEXT: movzbl %r10b, %r8d
-; SSSE3-NEXT: movzbl %r11b, %r9d
-; SSSE3-NEXT: movzbl %bpl, %r10d
-; SSSE3-NEXT: movzbl %r14b, %r11d
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload
-; SSSE3-NEXT: movzbl %r15b, %ebp
-; SSSE3-NEXT: movzbl %r12b, %r14d
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSSE3-NEXT: movzbl %r13b, %r13d
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl %dl, %edx
-; SSSE3-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl
-; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %dl
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
-; SSSE3-NEXT: jb .LBB2_118
-; SSSE3-NEXT: # %bb.117:
-; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSSE3-NEXT: .LBB2_118:
-; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movd %edi, %xmm3
-; SSSE3-NEXT: movd %r8d, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movd %r9d, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movd %r10d, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movd %r11d, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movd %ebx, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movd %ebp, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movd %r14d, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movd %r15d, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movd %r12d, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movd %r13d, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %cl
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
-; SSSE3-NEXT: movb $-1, %bl
-; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload
-; SSSE3-NEXT: # xmm11 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
-; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
-; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload
-; SSSE3-NEXT: # xmm15 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload
-; SSSE3-NEXT: # xmm9 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
-; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
-; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload
-; SSSE3-NEXT: # xmm14 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 4-byte Folded Reload
-; SSSE3-NEXT: # xmm5 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
-; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload
-; SSSE3-NEXT: # xmm6 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload
-; SSSE3-NEXT: # xmm13 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload
-; SSSE3-NEXT: # xmm10 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
-; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
-; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
-; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
-; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
-; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
-; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload
-; SSSE3-NEXT: # xmm12 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
-; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
-; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
-; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload
-; SSSE3-NEXT: # xmm7 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
-; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
-; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
-; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload
-; SSSE3-NEXT: # xmm8 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
-; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
-; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
-; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload
-; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: jb .LBB2_120
-; SSSE3-NEXT: # %bb.119:
-; SSSE3-NEXT: movl %ecx, %ebx
-; SSSE3-NEXT: .LBB2_120:
-; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
-; SSSE3-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
-; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
-; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
-; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
-; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
-; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
-; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
-; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1],xmm15[2],mem[2],xmm15[3],mem[3],xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7]
-; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
-; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3],xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7]
-; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; SSSE3-NEXT: movdqa %xmm3, %xmm5
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7]
-; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
-; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
-; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3],xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7]
-; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
-; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3],xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7]
-; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
-; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7]
-; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
-; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
-; SSSE3-NEXT: movd %esi, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movd %edx, %xmm10
-; SSSE3-NEXT: movd %edi, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movd %ebp, %xmm9
-; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movd %r10d, %xmm11
-; SSSE3-NEXT: movd %r9d, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movd %r8d, %xmm4
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSSE3-NEXT: movzbl %bl, %ecx
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl
-; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %bl
-; SSSE3-NEXT: movb $-1, %al
-; SSSE3-NEXT: jb .LBB2_122
-; SSSE3-NEXT: # %bb.121:
-; SSSE3-NEXT: movl %ebx, %eax
-; SSSE3-NEXT: .LBB2_122:
-; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
-; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
-; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
-; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1],xmm15[2],mem[2],xmm15[3],mem[3]
-; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3]
-; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3]
-; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3]
-; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
-; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3],xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7]
-; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3],xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7]
-; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7]
-; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
-; SSSE3-NEXT: movd %edi, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movd %edx, %xmm3
-; SSSE3-NEXT: movd %esi, %xmm5
-; SSSE3-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: movd %ecx, %xmm5
-; SSSE3-NEXT: movzbl %al, %eax
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl
-; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %dl
-; SSSE3-NEXT: movb $-1, %cl
-; SSSE3-NEXT: jb .LBB2_124
-; SSSE3-NEXT: # %bb.123:
-; SSSE3-NEXT: movl %edx, %ecx
-; SSSE3-NEXT: .LBB2_124:
-; SSSE3-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSSE3-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1]
-; SSSE3-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSSE3-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3]
-; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
-; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3],xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
-; SSSE3-NEXT: movd %eax, %xmm8
-; SSSE3-NEXT: movzbl %cl, %eax
-; SSSE3-NEXT: movd %eax, %xmm6
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl
-; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %dl
-; SSSE3-NEXT: movb $-1, %cl
-; SSSE3-NEXT: jb .LBB2_126
-; SSSE3-NEXT: # %bb.125:
-; SSSE3-NEXT: movl %edx, %ecx
-; SSSE3-NEXT: .LBB2_126:
-; SSSE3-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSSE3-NEXT: # xmm0 = xmm0[0],mem[0]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm14[0]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
-; SSSE3-NEXT: movzbl %cl, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm7
-; SSSE3-NEXT: addb (%rsp), %al
-; SSSE3-NEXT: movb $-1, %cl
-; SSSE3-NEXT: jb .LBB2_128
-; SSSE3-NEXT: # %bb.127:
-; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: .LBB2_128:
-; SSSE3-NEXT: movzbl %cl, %eax
-; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; SSSE3-NEXT: addq $648, %rsp # imm = 0x288
-; SSSE3-NEXT: popq %rbx
-; SSSE3-NEXT: popq %r12
-; SSSE3-NEXT: popq %r13
-; SSSE3-NEXT: popq %r14
-; SSSE3-NEXT: popq %r15
-; SSSE3-NEXT: popq %rbp
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: v64i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pextrb $1, %xmm4, %ecx
-; SSE41-NEXT: pextrb $1, %xmm0, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_2
-; SSE41-NEXT: # %bb.1:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_2:
-; SSE41-NEXT: pushq %rbp
-; SSE41-NEXT: pushq %r15
-; SSE41-NEXT: pushq %r14
-; SSE41-NEXT: pushq %r13
-; SSE41-NEXT: pushq %r12
-; SSE41-NEXT: pushq %rbx
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $0, %xmm4, %ecx
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %r10b
-; SSE41-NEXT: jb .LBB2_4
-; SSE41-NEXT: # %bb.3:
-; SSE41-NEXT: movl %eax, %r10d
-; SSE41-NEXT: .LBB2_4:
-; SSE41-NEXT: pextrb $2, %xmm4, %ecx
-; SSE41-NEXT: pextrb $2, %xmm0, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_6
-; SSE41-NEXT: # %bb.5:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_6:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $3, %xmm4, %ecx
-; SSE41-NEXT: pextrb $3, %xmm0, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_8
-; SSE41-NEXT: # %bb.7:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_8:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $4, %xmm4, %ecx
-; SSE41-NEXT: pextrb $4, %xmm0, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_10
-; SSE41-NEXT: # %bb.9:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_10:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $5, %xmm4, %ecx
-; SSE41-NEXT: pextrb $5, %xmm0, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_12
-; SSE41-NEXT: # %bb.11:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_12:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $6, %xmm4, %ecx
-; SSE41-NEXT: pextrb $6, %xmm0, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_14
-; SSE41-NEXT: # %bb.13:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_14:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $7, %xmm4, %ecx
-; SSE41-NEXT: pextrb $7, %xmm0, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_16
-; SSE41-NEXT: # %bb.15:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_16:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $8, %xmm4, %ecx
-; SSE41-NEXT: pextrb $8, %xmm0, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_18
-; SSE41-NEXT: # %bb.17:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_18:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $9, %xmm4, %ecx
-; SSE41-NEXT: pextrb $9, %xmm0, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_20
-; SSE41-NEXT: # %bb.19:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_20:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $10, %xmm4, %ecx
-; SSE41-NEXT: pextrb $10, %xmm0, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_22
-; SSE41-NEXT: # %bb.21:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_22:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $11, %xmm4, %ecx
-; SSE41-NEXT: pextrb $11, %xmm0, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_24
-; SSE41-NEXT: # %bb.23:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_24:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $12, %xmm4, %ecx
-; SSE41-NEXT: pextrb $12, %xmm0, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_26
-; SSE41-NEXT: # %bb.25:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_26:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $13, %xmm4, %ecx
-; SSE41-NEXT: pextrb $13, %xmm0, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_28
-; SSE41-NEXT: # %bb.27:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_28:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $14, %xmm4, %ecx
-; SSE41-NEXT: pextrb $14, %xmm0, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_30
-; SSE41-NEXT: # %bb.29:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_30:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $15, %xmm4, %ecx
-; SSE41-NEXT: pextrb $15, %xmm0, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_32
-; SSE41-NEXT: # %bb.31:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_32:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $1, %xmm5, %ecx
-; SSE41-NEXT: pextrb $1, %xmm1, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %bl
-; SSE41-NEXT: jb .LBB2_34
-; SSE41-NEXT: # %bb.33:
-; SSE41-NEXT: movl %eax, %ebx
-; SSE41-NEXT: .LBB2_34:
-; SSE41-NEXT: pextrb $0, %xmm5, %ecx
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %bpl
-; SSE41-NEXT: jb .LBB2_36
-; SSE41-NEXT: # %bb.35:
-; SSE41-NEXT: movl %eax, %ebp
-; SSE41-NEXT: .LBB2_36:
-; SSE41-NEXT: pextrb $2, %xmm5, %ecx
-; SSE41-NEXT: pextrb $2, %xmm1, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_38
-; SSE41-NEXT: # %bb.37:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_38:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $3, %xmm5, %ecx
-; SSE41-NEXT: pextrb $3, %xmm1, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_40
-; SSE41-NEXT: # %bb.39:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_40:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $4, %xmm5, %ecx
-; SSE41-NEXT: pextrb $4, %xmm1, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_42
-; SSE41-NEXT: # %bb.41:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_42:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $5, %xmm5, %ecx
-; SSE41-NEXT: pextrb $5, %xmm1, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_44
-; SSE41-NEXT: # %bb.43:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_44:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $6, %xmm5, %ecx
-; SSE41-NEXT: pextrb $6, %xmm1, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_46
-; SSE41-NEXT: # %bb.45:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_46:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $7, %xmm5, %ecx
-; SSE41-NEXT: pextrb $7, %xmm1, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_48
-; SSE41-NEXT: # %bb.47:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_48:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $8, %xmm5, %ecx
-; SSE41-NEXT: pextrb $8, %xmm1, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_50
-; SSE41-NEXT: # %bb.49:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_50:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $9, %xmm5, %ecx
-; SSE41-NEXT: pextrb $9, %xmm1, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_52
-; SSE41-NEXT: # %bb.51:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_52:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $10, %xmm5, %ecx
-; SSE41-NEXT: pextrb $10, %xmm1, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_54
-; SSE41-NEXT: # %bb.53:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_54:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $11, %xmm5, %ecx
-; SSE41-NEXT: pextrb $11, %xmm1, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_56
-; SSE41-NEXT: # %bb.55:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_56:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $12, %xmm5, %ecx
-; SSE41-NEXT: pextrb $12, %xmm1, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_58
-; SSE41-NEXT: # %bb.57:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_58:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $13, %xmm5, %ecx
-; SSE41-NEXT: pextrb $13, %xmm1, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_60
-; SSE41-NEXT: # %bb.59:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_60:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $14, %xmm5, %ecx
-; SSE41-NEXT: pextrb $14, %xmm1, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_62
-; SSE41-NEXT: # %bb.61:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_62:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $15, %xmm5, %ecx
-; SSE41-NEXT: pextrb $15, %xmm1, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_64
-; SSE41-NEXT: # %bb.63:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_64:
-; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $1, %xmm6, %ecx
-; SSE41-NEXT: pextrb $1, %xmm2, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %dil
-; SSE41-NEXT: jb .LBB2_66
-; SSE41-NEXT: # %bb.65:
-; SSE41-NEXT: movl %eax, %edi
-; SSE41-NEXT: .LBB2_66:
-; SSE41-NEXT: pextrb $0, %xmm6, %ecx
-; SSE41-NEXT: pextrb $0, %xmm2, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: jb .LBB2_68
-; SSE41-NEXT: # %bb.67:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_68:
-; SSE41-NEXT: pextrb $2, %xmm6, %edx
-; SSE41-NEXT: pextrb $2, %xmm2, %eax
-; SSE41-NEXT: addb %dl, %al
-; SSE41-NEXT: movb $-1, %r8b
-; SSE41-NEXT: jb .LBB2_70
-; SSE41-NEXT: # %bb.69:
-; SSE41-NEXT: movl %eax, %r8d
-; SSE41-NEXT: .LBB2_70:
-; SSE41-NEXT: pextrb $3, %xmm6, %edx
-; SSE41-NEXT: pextrb $3, %xmm2, %eax
-; SSE41-NEXT: addb %dl, %al
-; SSE41-NEXT: movb $-1, %sil
-; SSE41-NEXT: jb .LBB2_72
-; SSE41-NEXT: # %bb.71:
-; SSE41-NEXT: movl %eax, %esi
-; SSE41-NEXT: .LBB2_72:
-; SSE41-NEXT: pextrb $4, %xmm6, %eax
-; SSE41-NEXT: pextrb $4, %xmm2, %edx
-; SSE41-NEXT: addb %al, %dl
-; SSE41-NEXT: movb $-1, %r15b
-; SSE41-NEXT: jb .LBB2_74
-; SSE41-NEXT: # %bb.73:
-; SSE41-NEXT: movl %edx, %r15d
-; SSE41-NEXT: .LBB2_74:
-; SSE41-NEXT: pextrb $5, %xmm6, %eax
-; SSE41-NEXT: pextrb $5, %xmm2, %edx
-; SSE41-NEXT: addb %al, %dl
-; SSE41-NEXT: movb $-1, %r14b
-; SSE41-NEXT: jb .LBB2_76
-; SSE41-NEXT: # %bb.75:
-; SSE41-NEXT: movl %edx, %r14d
-; SSE41-NEXT: .LBB2_76:
-; SSE41-NEXT: pextrb $6, %xmm6, %eax
-; SSE41-NEXT: pextrb $6, %xmm2, %edx
-; SSE41-NEXT: addb %al, %dl
-; SSE41-NEXT: movb $-1, %r11b
-; SSE41-NEXT: jb .LBB2_78
-; SSE41-NEXT: # %bb.77:
-; SSE41-NEXT: movl %edx, %r11d
-; SSE41-NEXT: .LBB2_78:
-; SSE41-NEXT: pextrb $7, %xmm6, %eax
-; SSE41-NEXT: pextrb $7, %xmm2, %edx
-; SSE41-NEXT: addb %al, %dl
-; SSE41-NEXT: movb $-1, %r9b
-; SSE41-NEXT: jb .LBB2_80
-; SSE41-NEXT: # %bb.79:
-; SSE41-NEXT: movl %edx, %r9d
-; SSE41-NEXT: .LBB2_80:
-; SSE41-NEXT: pextrb $8, %xmm6, %eax
-; SSE41-NEXT: pextrb $8, %xmm2, %edx
-; SSE41-NEXT: addb %al, %dl
-; SSE41-NEXT: movb $-1, %r13b
-; SSE41-NEXT: jb .LBB2_82
-; SSE41-NEXT: # %bb.81:
-; SSE41-NEXT: movl %edx, %r13d
-; SSE41-NEXT: .LBB2_82:
-; SSE41-NEXT: pextrb $9, %xmm6, %eax
-; SSE41-NEXT: pextrb $9, %xmm2, %edx
-; SSE41-NEXT: addb %al, %dl
-; SSE41-NEXT: movb $-1, %r12b
-; SSE41-NEXT: jb .LBB2_84
-; SSE41-NEXT: # %bb.83:
-; SSE41-NEXT: movl %edx, %r12d
-; SSE41-NEXT: .LBB2_84:
-; SSE41-NEXT: pextrb $10, %xmm6, %eax
-; SSE41-NEXT: pextrb $10, %xmm2, %edx
-; SSE41-NEXT: addb %al, %dl
-; SSE41-NEXT: movb $-1, %al
-; SSE41-NEXT: jb .LBB2_86
-; SSE41-NEXT: # %bb.85:
-; SSE41-NEXT: movl %edx, %eax
-; SSE41-NEXT: .LBB2_86:
-; SSE41-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $11, %xmm6, %eax
-; SSE41-NEXT: pextrb $11, %xmm2, %edx
-; SSE41-NEXT: addb %al, %dl
-; SSE41-NEXT: movb $-1, %al
-; SSE41-NEXT: jb .LBB2_88
-; SSE41-NEXT: # %bb.87:
-; SSE41-NEXT: movl %edx, %eax
-; SSE41-NEXT: .LBB2_88:
-; SSE41-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $12, %xmm6, %eax
-; SSE41-NEXT: pextrb $12, %xmm2, %edx
-; SSE41-NEXT: addb %al, %dl
-; SSE41-NEXT: movb $-1, %al
-; SSE41-NEXT: jb .LBB2_90
-; SSE41-NEXT: # %bb.89:
-; SSE41-NEXT: movl %edx, %eax
-; SSE41-NEXT: .LBB2_90:
-; SSE41-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $13, %xmm6, %eax
-; SSE41-NEXT: pextrb $13, %xmm2, %edx
-; SSE41-NEXT: addb %al, %dl
-; SSE41-NEXT: movb $-1, %al
-; SSE41-NEXT: jb .LBB2_92
-; SSE41-NEXT: # %bb.91:
-; SSE41-NEXT: movl %edx, %eax
-; SSE41-NEXT: .LBB2_92:
-; SSE41-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: pextrb $14, %xmm6, %eax
-; SSE41-NEXT: pextrb $14, %xmm2, %edx
-; SSE41-NEXT: addb %al, %dl
-; SSE41-NEXT: movb $-1, %al
-; SSE41-NEXT: jb .LBB2_94
-; SSE41-NEXT: # %bb.93:
-; SSE41-NEXT: movl %edx, %eax
-; SSE41-NEXT: .LBB2_94:
-; SSE41-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: movzbl %r10b, %edx
-; SSE41-NEXT: movzbl %bpl, %r10d
-; SSE41-NEXT: movzbl %cl, %ebp
-; SSE41-NEXT: pextrb $15, %xmm6, %eax
-; SSE41-NEXT: pextrb $15, %xmm2, %ecx
-; SSE41-NEXT: addb %al, %cl
-; SSE41-NEXT: movb $-1, %al
-; SSE41-NEXT: jb .LBB2_96
-; SSE41-NEXT: # %bb.95:
-; SSE41-NEXT: movl %ecx, %eax
-; SSE41-NEXT: .LBB2_96:
-; SSE41-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; SSE41-NEXT: movd %edx, %xmm0
-; SSE41-NEXT: movzbl %bl, %edx
-; SSE41-NEXT: movd %r10d, %xmm1
-; SSE41-NEXT: movzbl %dil, %edi
-; SSE41-NEXT: movd %ebp, %xmm2
-; SSE41-NEXT: pextrb $1, %xmm7, %eax
-; SSE41-NEXT: pextrb $1, %xmm3, %ebp
-; SSE41-NEXT: addb %al, %bpl
-; SSE41-NEXT: movb $-1, %bl
-; SSE41-NEXT: jb .LBB2_98
-; SSE41-NEXT: # %bb.97:
-; SSE41-NEXT: movl %ebp, %ebx
-; SSE41-NEXT: .LBB2_98:
-; SSE41-NEXT: pinsrb $1, %ecx, %xmm0
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $1, %edx, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $1, %edi, %xmm2
-; SSE41-NEXT: movzbl %r8b, %edx
-; SSE41-NEXT: movzbl %bl, %edi
-; SSE41-NEXT: pextrb $0, %xmm7, %ebx
-; SSE41-NEXT: pextrb $0, %xmm3, %eax
-; SSE41-NEXT: addb %bl, %al
-; SSE41-NEXT: movb $-1, %bl
-; SSE41-NEXT: jb .LBB2_100
-; SSE41-NEXT: # %bb.99:
-; SSE41-NEXT: movl %eax, %ebx
-; SSE41-NEXT: .LBB2_100:
-; SSE41-NEXT: pinsrb $2, %ecx, %xmm0
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $2, %ebp, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $2, %edx, %xmm2
-; SSE41-NEXT: movzbl %sil, %esi
-; SSE41-NEXT: movzbl %bl, %eax
-; SSE41-NEXT: movd %eax, %xmm4
-; SSE41-NEXT: pinsrb $1, %edi, %xmm4
-; SSE41-NEXT: pextrb $2, %xmm7, %edx
-; SSE41-NEXT: pextrb $2, %xmm3, %eax
-; SSE41-NEXT: addb %dl, %al
-; SSE41-NEXT: movb $-1, %dl
-; SSE41-NEXT: jb .LBB2_102
-; SSE41-NEXT: # %bb.101:
-; SSE41-NEXT: movl %eax, %edx
-; SSE41-NEXT: .LBB2_102:
-; SSE41-NEXT: pinsrb $3, %ecx, %xmm0
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $3, %ebp, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $3, %esi, %xmm2
-; SSE41-NEXT: movzbl %r15b, %esi
-; SSE41-NEXT: movzbl %dl, %eax
-; SSE41-NEXT: pinsrb $2, %eax, %xmm4
-; SSE41-NEXT: pextrb $3, %xmm7, %edx
-; SSE41-NEXT: pextrb $3, %xmm3, %eax
-; SSE41-NEXT: addb %dl, %al
-; SSE41-NEXT: movb $-1, %dl
-; SSE41-NEXT: jb .LBB2_104
-; SSE41-NEXT: # %bb.103:
-; SSE41-NEXT: movl %eax, %edx
-; SSE41-NEXT: .LBB2_104:
-; SSE41-NEXT: pinsrb $4, %ecx, %xmm0
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $4, %edi, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $4, %esi, %xmm2
-; SSE41-NEXT: movzbl %r14b, %esi
-; SSE41-NEXT: movzbl %dl, %edx
-; SSE41-NEXT: pinsrb $3, %edx, %xmm4
-; SSE41-NEXT: pextrb $4, %xmm7, %edi
-; SSE41-NEXT: pextrb $4, %xmm3, %edx
-; SSE41-NEXT: addb %dil, %dl
-; SSE41-NEXT: movb $-1, %bl
-; SSE41-NEXT: jb .LBB2_106
-; SSE41-NEXT: # %bb.105:
-; SSE41-NEXT: movl %edx, %ebx
-; SSE41-NEXT: .LBB2_106:
-; SSE41-NEXT: pinsrb $5, %eax, %xmm0
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $5, %ecx, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $5, %esi, %xmm2
-; SSE41-NEXT: movzbl %r11b, %edx
-; SSE41-NEXT: movzbl %bl, %esi
-; SSE41-NEXT: pinsrb $4, %esi, %xmm4
-; SSE41-NEXT: pextrb $5, %xmm7, %edi
-; SSE41-NEXT: pextrb $5, %xmm3, %esi
-; SSE41-NEXT: addb %dil, %sil
-; SSE41-NEXT: movb $-1, %bl
-; SSE41-NEXT: jb .LBB2_108
-; SSE41-NEXT: # %bb.107:
-; SSE41-NEXT: movl %esi, %ebx
-; SSE41-NEXT: .LBB2_108:
-; SSE41-NEXT: pinsrb $6, %eax, %xmm0
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $6, %ecx, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $6, %edx, %xmm2
-; SSE41-NEXT: movzbl %r9b, %edx
-; SSE41-NEXT: movzbl %bl, %esi
-; SSE41-NEXT: pinsrb $5, %esi, %xmm4
-; SSE41-NEXT: pextrb $6, %xmm7, %edi
-; SSE41-NEXT: pextrb $6, %xmm3, %esi
-; SSE41-NEXT: addb %dil, %sil
-; SSE41-NEXT: movb $-1, %bl
-; SSE41-NEXT: jb .LBB2_110
-; SSE41-NEXT: # %bb.109:
-; SSE41-NEXT: movl %esi, %ebx
-; SSE41-NEXT: .LBB2_110:
-; SSE41-NEXT: pinsrb $7, %eax, %xmm0
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $7, %ecx, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $7, %edx, %xmm2
-; SSE41-NEXT: movzbl %r13b, %edx
-; SSE41-NEXT: movzbl %bl, %esi
-; SSE41-NEXT: pinsrb $6, %esi, %xmm4
-; SSE41-NEXT: pextrb $7, %xmm7, %edi
-; SSE41-NEXT: pextrb $7, %xmm3, %esi
-; SSE41-NEXT: addb %dil, %sil
-; SSE41-NEXT: movb $-1, %bl
-; SSE41-NEXT: jb .LBB2_112
-; SSE41-NEXT: # %bb.111:
-; SSE41-NEXT: movl %esi, %ebx
-; SSE41-NEXT: .LBB2_112:
-; SSE41-NEXT: pinsrb $8, %eax, %xmm0
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $8, %ecx, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $8, %edx, %xmm2
-; SSE41-NEXT: movzbl %r12b, %edx
-; SSE41-NEXT: movzbl %bl, %esi
-; SSE41-NEXT: pinsrb $7, %esi, %xmm4
-; SSE41-NEXT: pextrb $8, %xmm7, %edi
-; SSE41-NEXT: pextrb $8, %xmm3, %esi
-; SSE41-NEXT: addb %dil, %sil
-; SSE41-NEXT: movb $-1, %bl
-; SSE41-NEXT: jb .LBB2_114
-; SSE41-NEXT: # %bb.113:
-; SSE41-NEXT: movl %esi, %ebx
-; SSE41-NEXT: .LBB2_114:
-; SSE41-NEXT: pinsrb $9, %eax, %xmm0
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $9, %ecx, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $9, %edx, %xmm2
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; SSE41-NEXT: movzbl %bl, %esi
-; SSE41-NEXT: pinsrb $8, %esi, %xmm4
-; SSE41-NEXT: pextrb $9, %xmm7, %edi
-; SSE41-NEXT: pextrb $9, %xmm3, %esi
-; SSE41-NEXT: addb %dil, %sil
-; SSE41-NEXT: movb $-1, %bl
-; SSE41-NEXT: jb .LBB2_116
-; SSE41-NEXT: # %bb.115:
-; SSE41-NEXT: movl %esi, %ebx
-; SSE41-NEXT: .LBB2_116:
-; SSE41-NEXT: pinsrb $10, %eax, %xmm0
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $10, %ecx, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $10, %edx, %xmm2
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; SSE41-NEXT: movzbl %bl, %esi
-; SSE41-NEXT: pinsrb $9, %esi, %xmm4
-; SSE41-NEXT: pextrb $10, %xmm7, %edi
-; SSE41-NEXT: pextrb $10, %xmm3, %esi
-; SSE41-NEXT: addb %dil, %sil
-; SSE41-NEXT: movb $-1, %bl
-; SSE41-NEXT: jb .LBB2_118
-; SSE41-NEXT: # %bb.117:
-; SSE41-NEXT: movl %esi, %ebx
-; SSE41-NEXT: .LBB2_118:
-; SSE41-NEXT: pinsrb $11, %eax, %xmm0
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $11, %ecx, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $11, %edx, %xmm2
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; SSE41-NEXT: movzbl %bl, %esi
-; SSE41-NEXT: pinsrb $10, %esi, %xmm4
-; SSE41-NEXT: pextrb $11, %xmm7, %edi
-; SSE41-NEXT: pextrb $11, %xmm3, %esi
-; SSE41-NEXT: addb %dil, %sil
-; SSE41-NEXT: movb $-1, %bl
-; SSE41-NEXT: jb .LBB2_120
-; SSE41-NEXT: # %bb.119:
-; SSE41-NEXT: movl %esi, %ebx
-; SSE41-NEXT: .LBB2_120:
-; SSE41-NEXT: pinsrb $12, %eax, %xmm0
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $12, %ecx, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $12, %edx, %xmm2
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; SSE41-NEXT: movzbl %bl, %esi
-; SSE41-NEXT: pinsrb $11, %esi, %xmm4
-; SSE41-NEXT: pextrb $12, %xmm7, %edi
-; SSE41-NEXT: pextrb $12, %xmm3, %esi
-; SSE41-NEXT: addb %dil, %sil
-; SSE41-NEXT: movb $-1, %bl
-; SSE41-NEXT: jb .LBB2_122
-; SSE41-NEXT: # %bb.121:
-; SSE41-NEXT: movl %esi, %ebx
-; SSE41-NEXT: .LBB2_122:
-; SSE41-NEXT: pinsrb $13, %eax, %xmm0
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $13, %ecx, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $13, %edx, %xmm2
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; SSE41-NEXT: movzbl %bl, %esi
-; SSE41-NEXT: pinsrb $12, %esi, %xmm4
-; SSE41-NEXT: pextrb $13, %xmm7, %edi
-; SSE41-NEXT: pextrb $13, %xmm3, %esi
-; SSE41-NEXT: addb %dil, %sil
-; SSE41-NEXT: movb $-1, %bl
-; SSE41-NEXT: jb .LBB2_124
-; SSE41-NEXT: # %bb.123:
-; SSE41-NEXT: movl %esi, %ebx
-; SSE41-NEXT: .LBB2_124:
-; SSE41-NEXT: pinsrb $14, %eax, %xmm0
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $14, %ecx, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $14, %edx, %xmm2
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; SSE41-NEXT: movzbl %bl, %esi
-; SSE41-NEXT: pinsrb $13, %esi, %xmm4
-; SSE41-NEXT: pextrb $14, %xmm7, %edi
-; SSE41-NEXT: pextrb $14, %xmm3, %esi
-; SSE41-NEXT: addb %dil, %sil
-; SSE41-NEXT: movb $-1, %bl
-; SSE41-NEXT: jb .LBB2_126
-; SSE41-NEXT: # %bb.125:
-; SSE41-NEXT: movl %esi, %ebx
-; SSE41-NEXT: .LBB2_126:
-; SSE41-NEXT: pinsrb $15, %eax, %xmm0
-; SSE41-NEXT: pinsrb $15, %ecx, %xmm1
-; SSE41-NEXT: pinsrb $15, %edx, %xmm2
-; SSE41-NEXT: movzbl %bl, %eax
-; SSE41-NEXT: pinsrb $14, %eax, %xmm4
-; SSE41-NEXT: pextrb $15, %xmm7, %ecx
-; SSE41-NEXT: pextrb $15, %xmm3, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb $-1, %cl
-; SSE41-NEXT: popq %rbx
-; SSE41-NEXT: popq %r12
-; SSE41-NEXT: popq %r13
-; SSE41-NEXT: popq %r14
-; SSE41-NEXT: popq %r15
-; SSE41-NEXT: popq %rbp
-; SSE41-NEXT: jb .LBB2_128
-; SSE41-NEXT: # %bb.127:
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: .LBB2_128:
-; SSE41-NEXT: movzbl %cl, %eax
-; SSE41-NEXT: pinsrb $15, %eax, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: retq
+; SSE-LABEL: v64i8:
+; SSE: # %bb.0:
+; SSE-NEXT: paddusb %xmm4, %xmm0
+; SSE-NEXT: paddusb %xmm5, %xmm1
+; SSE-NEXT: paddusb %xmm6, %xmm2
+; SSE-NEXT: paddusb %xmm7, %xmm3
+; SSE-NEXT: retq
;
; AVX1-LABEL: v64i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vpextrb $1, %xmm4, %ecx
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpextrb $1, %xmm5, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_2
-; AVX1-NEXT: # %bb.1:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_2:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $0, %xmm4, %ecx
-; AVX1-NEXT: vpextrb $0, %xmm5, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_4
-; AVX1-NEXT: # %bb.3:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_4:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $2, %xmm4, %ecx
-; AVX1-NEXT: vpextrb $2, %xmm5, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_6
-; AVX1-NEXT: # %bb.5:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_6:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $3, %xmm4, %ecx
-; AVX1-NEXT: vpextrb $3, %xmm5, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_8
-; AVX1-NEXT: # %bb.7:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_8:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $4, %xmm4, %ecx
-; AVX1-NEXT: vpextrb $4, %xmm5, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_10
-; AVX1-NEXT: # %bb.9:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_10:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $5, %xmm4, %ecx
-; AVX1-NEXT: vpextrb $5, %xmm5, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_12
-; AVX1-NEXT: # %bb.11:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_12:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $6, %xmm4, %ecx
-; AVX1-NEXT: vpextrb $6, %xmm5, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_14
-; AVX1-NEXT: # %bb.13:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_14:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $7, %xmm4, %ecx
-; AVX1-NEXT: vpextrb $7, %xmm5, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_16
-; AVX1-NEXT: # %bb.15:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_16:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $8, %xmm4, %ecx
-; AVX1-NEXT: vpextrb $8, %xmm5, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_18
-; AVX1-NEXT: # %bb.17:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_18:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $9, %xmm4, %ecx
-; AVX1-NEXT: vpextrb $9, %xmm5, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_20
-; AVX1-NEXT: # %bb.19:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_20:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $10, %xmm4, %ecx
-; AVX1-NEXT: vpextrb $10, %xmm5, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_22
-; AVX1-NEXT: # %bb.21:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_22:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $11, %xmm4, %ecx
-; AVX1-NEXT: vpextrb $11, %xmm5, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_24
-; AVX1-NEXT: # %bb.23:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_24:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $12, %xmm4, %ecx
-; AVX1-NEXT: vpextrb $12, %xmm5, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_26
-; AVX1-NEXT: # %bb.25:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_26:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $13, %xmm4, %ecx
-; AVX1-NEXT: vpextrb $13, %xmm5, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_28
-; AVX1-NEXT: # %bb.27:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_28:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $14, %xmm4, %ecx
-; AVX1-NEXT: vpextrb $14, %xmm5, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_30
-; AVX1-NEXT: # %bb.29:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_30:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $15, %xmm4, %ecx
-; AVX1-NEXT: vpextrb $15, %xmm5, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_32
-; AVX1-NEXT: # %bb.31:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_32:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $1, %xmm2, %ecx
-; AVX1-NEXT: vpextrb $1, %xmm0, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %r11b
-; AVX1-NEXT: jb .LBB2_34
-; AVX1-NEXT: # %bb.33:
-; AVX1-NEXT: movl %eax, %r11d
-; AVX1-NEXT: .LBB2_34:
-; AVX1-NEXT: vpextrb $0, %xmm2, %ecx
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %r8b
-; AVX1-NEXT: jb .LBB2_36
-; AVX1-NEXT: # %bb.35:
-; AVX1-NEXT: movl %eax, %r8d
-; AVX1-NEXT: .LBB2_36:
-; AVX1-NEXT: vpextrb $2, %xmm2, %ecx
-; AVX1-NEXT: vpextrb $2, %xmm0, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_38
-; AVX1-NEXT: # %bb.37:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_38:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $3, %xmm2, %ecx
-; AVX1-NEXT: vpextrb $3, %xmm0, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_40
-; AVX1-NEXT: # %bb.39:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_40:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $4, %xmm2, %ecx
-; AVX1-NEXT: vpextrb $4, %xmm0, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_42
-; AVX1-NEXT: # %bb.41:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_42:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $5, %xmm2, %ecx
-; AVX1-NEXT: vpextrb $5, %xmm0, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_44
-; AVX1-NEXT: # %bb.43:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_44:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $6, %xmm2, %ecx
-; AVX1-NEXT: vpextrb $6, %xmm0, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_46
-; AVX1-NEXT: # %bb.45:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_46:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $7, %xmm2, %ecx
-; AVX1-NEXT: vpextrb $7, %xmm0, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_48
-; AVX1-NEXT: # %bb.47:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_48:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $8, %xmm2, %ecx
-; AVX1-NEXT: vpextrb $8, %xmm0, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_50
-; AVX1-NEXT: # %bb.49:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_50:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $9, %xmm2, %ecx
-; AVX1-NEXT: vpextrb $9, %xmm0, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_52
-; AVX1-NEXT: # %bb.51:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_52:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $10, %xmm2, %ecx
-; AVX1-NEXT: vpextrb $10, %xmm0, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_54
-; AVX1-NEXT: # %bb.53:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_54:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $11, %xmm2, %ecx
-; AVX1-NEXT: vpextrb $11, %xmm0, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_56
-; AVX1-NEXT: # %bb.55:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_56:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $12, %xmm2, %ecx
-; AVX1-NEXT: vpextrb $12, %xmm0, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_58
-; AVX1-NEXT: # %bb.57:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_58:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $13, %xmm2, %ecx
-; AVX1-NEXT: vpextrb $13, %xmm0, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_60
-; AVX1-NEXT: # %bb.59:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_60:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $14, %xmm2, %ecx
-; AVX1-NEXT: vpextrb $14, %xmm0, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_62
-; AVX1-NEXT: # %bb.61:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_62:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $15, %xmm2, %ecx
-; AVX1-NEXT: vpextrb $15, %xmm0, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %cl
-; AVX1-NEXT: jb .LBB2_64
-; AVX1-NEXT: # %bb.63:
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: .LBB2_64:
-; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0
-; AVX1-NEXT: vpextrb $1, %xmm0, %ecx
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpextrb $1, %xmm2, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: movb $-1, %dil
-; AVX1-NEXT: jb .LBB2_66
-; AVX1-NEXT: # %bb.65:
-; AVX1-NEXT: movl %eax, %edi
-; AVX1-NEXT: .LBB2_66:
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: vpextrb $0, %xmm2, %ecx
-; AVX1-NEXT: addb %al, %cl
-; AVX1-NEXT: movb $-1, %al
-; AVX1-NEXT: jb .LBB2_68
-; AVX1-NEXT: # %bb.67:
-; AVX1-NEXT: movl %ecx, %eax
-; AVX1-NEXT: .LBB2_68:
-; AVX1-NEXT: vpextrb $2, %xmm0, %esi
-; AVX1-NEXT: vpextrb $2, %xmm2, %ecx
-; AVX1-NEXT: addb %sil, %cl
-; AVX1-NEXT: movb $-1, %r15b
-; AVX1-NEXT: jb .LBB2_70
-; AVX1-NEXT: # %bb.69:
-; AVX1-NEXT: movl %ecx, %r15d
-; AVX1-NEXT: .LBB2_70:
-; AVX1-NEXT: vpextrb $3, %xmm0, %esi
-; AVX1-NEXT: vpextrb $3, %xmm2, %ecx
-; AVX1-NEXT: addb %sil, %cl
-; AVX1-NEXT: movb $-1, %sil
-; AVX1-NEXT: jb .LBB2_72
-; AVX1-NEXT: # %bb.71:
-; AVX1-NEXT: movl %ecx, %esi
-; AVX1-NEXT: .LBB2_72:
-; AVX1-NEXT: vpextrb $4, %xmm0, %ebp
-; AVX1-NEXT: vpextrb $4, %xmm2, %ecx
-; AVX1-NEXT: addb %bpl, %cl
-; AVX1-NEXT: movb $-1, %r14b
-; AVX1-NEXT: jb .LBB2_74
-; AVX1-NEXT: # %bb.73:
-; AVX1-NEXT: movl %ecx, %r14d
-; AVX1-NEXT: .LBB2_74:
-; AVX1-NEXT: vpextrb $5, %xmm0, %ebp
-; AVX1-NEXT: vpextrb $5, %xmm2, %ecx
-; AVX1-NEXT: addb %bpl, %cl
-; AVX1-NEXT: movb $-1, %r10b
-; AVX1-NEXT: jb .LBB2_76
-; AVX1-NEXT: # %bb.75:
-; AVX1-NEXT: movl %ecx, %r10d
-; AVX1-NEXT: .LBB2_76:
-; AVX1-NEXT: vpextrb $6, %xmm0, %ebp
-; AVX1-NEXT: vpextrb $6, %xmm2, %ecx
-; AVX1-NEXT: addb %bpl, %cl
-; AVX1-NEXT: movb $-1, %r9b
-; AVX1-NEXT: jb .LBB2_78
-; AVX1-NEXT: # %bb.77:
-; AVX1-NEXT: movl %ecx, %r9d
-; AVX1-NEXT: .LBB2_78:
-; AVX1-NEXT: vpextrb $7, %xmm0, %ebx
-; AVX1-NEXT: vpextrb $7, %xmm2, %ecx
-; AVX1-NEXT: addb %bl, %cl
-; AVX1-NEXT: movb $-1, %r13b
-; AVX1-NEXT: jb .LBB2_80
-; AVX1-NEXT: # %bb.79:
-; AVX1-NEXT: movl %ecx, %r13d
-; AVX1-NEXT: .LBB2_80:
-; AVX1-NEXT: vpextrb $8, %xmm0, %ebx
-; AVX1-NEXT: vpextrb $8, %xmm2, %ecx
-; AVX1-NEXT: addb %bl, %cl
-; AVX1-NEXT: movb $-1, %r12b
-; AVX1-NEXT: jb .LBB2_82
-; AVX1-NEXT: # %bb.81:
-; AVX1-NEXT: movl %ecx, %r12d
-; AVX1-NEXT: .LBB2_82:
-; AVX1-NEXT: vpextrb $9, %xmm0, %ebx
-; AVX1-NEXT: vpextrb $9, %xmm2, %ecx
-; AVX1-NEXT: addb %bl, %cl
-; AVX1-NEXT: movb $-1, %dl
-; AVX1-NEXT: jb .LBB2_84
-; AVX1-NEXT: # %bb.83:
-; AVX1-NEXT: movl %ecx, %edx
-; AVX1-NEXT: .LBB2_84:
-; AVX1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $10, %xmm0, %ebx
-; AVX1-NEXT: vpextrb $10, %xmm2, %ecx
-; AVX1-NEXT: addb %bl, %cl
-; AVX1-NEXT: movb $-1, %dl
-; AVX1-NEXT: jb .LBB2_86
-; AVX1-NEXT: # %bb.85:
-; AVX1-NEXT: movl %ecx, %edx
-; AVX1-NEXT: .LBB2_86:
-; AVX1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpextrb $11, %xmm0, %ebx
-; AVX1-NEXT: vpextrb $11, %xmm2, %ecx
-; AVX1-NEXT: addb %bl, %cl
-; AVX1-NEXT: movb $-1, %dl
-; AVX1-NEXT: jb .LBB2_88
-; AVX1-NEXT: # %bb.87:
-; AVX1-NEXT: movl %ecx, %edx
-; AVX1-NEXT: .LBB2_88:
-; AVX1-NEXT: vpextrb $12, %xmm0, %ebx
-; AVX1-NEXT: vpextrb $12, %xmm2, %ecx
-; AVX1-NEXT: addb %bl, %cl
-; AVX1-NEXT: movb $-1, %bl
-; AVX1-NEXT: jb .LBB2_90
-; AVX1-NEXT: # %bb.89:
-; AVX1-NEXT: movl %ecx, %ebx
-; AVX1-NEXT: .LBB2_90:
-; AVX1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: movzbl %r8b, %ecx
-; AVX1-NEXT: vpextrb $13, %xmm0, %ebx
-; AVX1-NEXT: vpextrb $13, %xmm2, %edx
-; AVX1-NEXT: addb %bl, %dl
-; AVX1-NEXT: movb $-1, %bl
-; AVX1-NEXT: jb .LBB2_92
-; AVX1-NEXT: # %bb.91:
-; AVX1-NEXT: movl %edx, %ebx
-; AVX1-NEXT: .LBB2_92:
-; AVX1-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX1-NEXT: movzbl %r11b, %ebp
-; AVX1-NEXT: vmovd %ecx, %xmm4
-; AVX1-NEXT: vpextrb $14, %xmm0, %ebx
-; AVX1-NEXT: vpextrb $14, %xmm2, %ecx
-; AVX1-NEXT: addb %bl, %cl
-; AVX1-NEXT: movb $-1, %r8b
-; AVX1-NEXT: jb .LBB2_94
-; AVX1-NEXT: # %bb.93:
-; AVX1-NEXT: movl %ecx, %r8d
-; AVX1-NEXT: .LBB2_94:
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX1-NEXT: vmovd %edx, %xmm5
-; AVX1-NEXT: vpinsrb $1, %ebp, %xmm4, %xmm4
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX1-NEXT: movzbl %al, %r11d
-; AVX1-NEXT: vpextrb $15, %xmm0, %ebx
-; AVX1-NEXT: vpextrb $15, %xmm2, %eax
-; AVX1-NEXT: addb %bl, %al
-; AVX1-NEXT: movb $-1, %bl
-; AVX1-NEXT: jb .LBB2_96
-; AVX1-NEXT: # %bb.95:
-; AVX1-NEXT: movl %eax, %ebx
-; AVX1-NEXT: .LBB2_96:
-; AVX1-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX1-NEXT: vpinsrb $1, %ecx, %xmm5, %xmm0
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $2, %edx, %xmm4, %xmm2
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX1-NEXT: movzbl %dil, %eax
-; AVX1-NEXT: vmovd %r11d, %xmm4
-; AVX1-NEXT: vpextrb $1, %xmm3, %ebp
-; AVX1-NEXT: vpextrb $1, %xmm1, %edi
-; AVX1-NEXT: addb %bpl, %dil
-; AVX1-NEXT: movb $-1, %bl
-; AVX1-NEXT: jb .LBB2_98
-; AVX1-NEXT: # %bb.97:
-; AVX1-NEXT: movl %edi, %ebx
-; AVX1-NEXT: .LBB2_98:
-; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $3, %edx, %xmm2, %xmm2
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX1-NEXT: movzbl %r15b, %ebp
-; AVX1-NEXT: movzbl %bl, %edi
-; AVX1-NEXT: vpextrb $0, %xmm3, %ebx
-; AVX1-NEXT: vpextrb $0, %xmm1, %eax
-; AVX1-NEXT: addb %bl, %al
-; AVX1-NEXT: movb $-1, %bl
-; AVX1-NEXT: jb .LBB2_100
-; AVX1-NEXT: # %bb.99:
-; AVX1-NEXT: movl %eax, %ebx
-; AVX1-NEXT: .LBB2_100:
-; AVX1-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $4, %edx, %xmm2, %xmm2
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $2, %ebp, %xmm4, %xmm4
-; AVX1-NEXT: movzbl %sil, %esi
-; AVX1-NEXT: movzbl %bl, %edx
-; AVX1-NEXT: vmovd %edx, %xmm5
-; AVX1-NEXT: vpinsrb $1, %edi, %xmm5, %xmm5
-; AVX1-NEXT: vpextrb $2, %xmm3, %edi
-; AVX1-NEXT: vpextrb $2, %xmm1, %edx
-; AVX1-NEXT: addb %dil, %dl
-; AVX1-NEXT: movb $-1, %bl
-; AVX1-NEXT: jb .LBB2_102
-; AVX1-NEXT: # %bb.101:
-; AVX1-NEXT: movl %edx, %ebx
-; AVX1-NEXT: .LBB2_102:
-; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $3, %esi, %xmm4, %xmm4
-; AVX1-NEXT: movzbl %r14b, %edx
-; AVX1-NEXT: movzbl %bl, %esi
-; AVX1-NEXT: vpinsrb $2, %esi, %xmm5, %xmm5
-; AVX1-NEXT: vpextrb $3, %xmm3, %edi
-; AVX1-NEXT: vpextrb $3, %xmm1, %esi
-; AVX1-NEXT: addb %dil, %sil
-; AVX1-NEXT: movb $-1, %bl
-; AVX1-NEXT: jb .LBB2_104
-; AVX1-NEXT: # %bb.103:
-; AVX1-NEXT: movl %esi, %ebx
-; AVX1-NEXT: .LBB2_104:
-; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $4, %edx, %xmm4, %xmm4
-; AVX1-NEXT: movzbl %r10b, %edx
-; AVX1-NEXT: movzbl %bl, %esi
-; AVX1-NEXT: vpinsrb $3, %esi, %xmm5, %xmm5
-; AVX1-NEXT: vpextrb $4, %xmm3, %edi
-; AVX1-NEXT: vpextrb $4, %xmm1, %esi
-; AVX1-NEXT: addb %dil, %sil
-; AVX1-NEXT: movb $-1, %bl
-; AVX1-NEXT: jb .LBB2_106
-; AVX1-NEXT: # %bb.105:
-; AVX1-NEXT: movl %esi, %ebx
-; AVX1-NEXT: .LBB2_106:
-; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $5, %edx, %xmm4, %xmm4
-; AVX1-NEXT: movzbl %r9b, %edx
-; AVX1-NEXT: movzbl %bl, %esi
-; AVX1-NEXT: vpinsrb $4, %esi, %xmm5, %xmm5
-; AVX1-NEXT: vpextrb $5, %xmm3, %edi
-; AVX1-NEXT: vpextrb $5, %xmm1, %esi
-; AVX1-NEXT: addb %dil, %sil
-; AVX1-NEXT: movb $-1, %bl
-; AVX1-NEXT: jb .LBB2_108
-; AVX1-NEXT: # %bb.107:
-; AVX1-NEXT: movl %esi, %ebx
-; AVX1-NEXT: .LBB2_108:
-; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4
-; AVX1-NEXT: movzbl %r13b, %edx
-; AVX1-NEXT: movzbl %bl, %esi
-; AVX1-NEXT: vpinsrb $5, %esi, %xmm5, %xmm5
-; AVX1-NEXT: vpextrb $6, %xmm3, %edi
-; AVX1-NEXT: vpextrb $6, %xmm1, %esi
-; AVX1-NEXT: addb %dil, %sil
-; AVX1-NEXT: movb $-1, %bl
-; AVX1-NEXT: jb .LBB2_110
-; AVX1-NEXT: # %bb.109:
-; AVX1-NEXT: movl %esi, %ebx
-; AVX1-NEXT: .LBB2_110:
-; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $7, %edx, %xmm4, %xmm4
-; AVX1-NEXT: movzbl %r12b, %edx
-; AVX1-NEXT: movzbl %bl, %esi
-; AVX1-NEXT: vpinsrb $6, %esi, %xmm5, %xmm5
-; AVX1-NEXT: vpextrb $7, %xmm3, %edi
-; AVX1-NEXT: vpextrb $7, %xmm1, %esi
-; AVX1-NEXT: addb %dil, %sil
-; AVX1-NEXT: movb $-1, %bl
-; AVX1-NEXT: jb .LBB2_112
-; AVX1-NEXT: # %bb.111:
-; AVX1-NEXT:
movl %esi, %ebx -; AVX1-NEXT: .LBB2_112: -; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $8, %edx, %xmm4, %xmm4 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; AVX1-NEXT: movzbl %bl, %esi -; AVX1-NEXT: vpinsrb $7, %esi, %xmm5, %xmm5 -; AVX1-NEXT: vpextrb $8, %xmm3, %edi -; AVX1-NEXT: vpextrb $8, %xmm1, %esi -; AVX1-NEXT: addb %dil, %sil -; AVX1-NEXT: movb $-1, %bl -; AVX1-NEXT: jb .LBB2_114 -; AVX1-NEXT: # %bb.113: -; AVX1-NEXT: movl %esi, %ebx -; AVX1-NEXT: .LBB2_114: -; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $9, %edx, %xmm4, %xmm4 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; AVX1-NEXT: movzbl %bl, %esi -; AVX1-NEXT: vpinsrb $8, %esi, %xmm5, %xmm5 -; AVX1-NEXT: vpextrb $9, %xmm3, %edi -; AVX1-NEXT: vpextrb $9, %xmm1, %esi -; AVX1-NEXT: addb %dil, %sil -; AVX1-NEXT: movb $-1, %bl -; AVX1-NEXT: jb .LBB2_116 -; AVX1-NEXT: # %bb.115: -; AVX1-NEXT: movl %esi, %ebx -; AVX1-NEXT: .LBB2_116: -; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $10, %edx, %xmm4, %xmm4 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; AVX1-NEXT: movzbl %bl, %esi -; AVX1-NEXT: vpinsrb $9, %esi, %xmm5, %xmm5 -; AVX1-NEXT: vpextrb $10, %xmm3, %edi -; AVX1-NEXT: vpextrb $10, %xmm1, %esi -; AVX1-NEXT: addb %dil, %sil -; AVX1-NEXT: movb $-1, %bl -; AVX1-NEXT: jb .LBB2_118 -; AVX1-NEXT: # %bb.117: -; AVX1-NEXT: movl %esi, %ebx -; AVX1-NEXT: .LBB2_118: -; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $11, %edx, %xmm4, %xmm4 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; AVX1-NEXT: movzbl %bl, %esi -; AVX1-NEXT: vpinsrb $10, %esi, %xmm5, %xmm5 -; AVX1-NEXT: vpextrb $11, %xmm3, %edi -; AVX1-NEXT: vpextrb $11, %xmm1, %esi -; AVX1-NEXT: addb %dil, %sil -; AVX1-NEXT: movb $-1, %bl -; AVX1-NEXT: jb .LBB2_120 -; AVX1-NEXT: # %bb.119: -; AVX1-NEXT: movl %esi, %ebx -; AVX1-NEXT: .LBB2_120: -; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $12, %edx, %xmm4, %xmm6 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; AVX1-NEXT: movzbl %bl, %esi -; AVX1-NEXT: vpinsrb $11, %esi, %xmm5, %xmm5 -; AVX1-NEXT: vpextrb $12, %xmm3, %edi -; AVX1-NEXT: vpextrb $12, %xmm1, %esi -; AVX1-NEXT: addb %dil, %sil -; AVX1-NEXT: movb $-1, %bl -; AVX1-NEXT: jb .LBB2_122 -; AVX1-NEXT: # %bb.121: -; AVX1-NEXT: movl %esi, %ebx -; AVX1-NEXT: .LBB2_122: -; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm4 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte 
Folded Reload -; AVX1-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm0 -; AVX1-NEXT: vpinsrb $13, %edx, %xmm6, %xmm6 -; AVX1-NEXT: movzbl %r8b, %ecx -; AVX1-NEXT: movzbl %bl, %edx -; AVX1-NEXT: vpinsrb $12, %edx, %xmm5, %xmm5 -; AVX1-NEXT: vpextrb $13, %xmm3, %edx -; AVX1-NEXT: vpextrb $13, %xmm1, %esi -; AVX1-NEXT: addb %dl, %sil -; AVX1-NEXT: movb $-1, %dl -; AVX1-NEXT: jb .LBB2_124 -; AVX1-NEXT: # %bb.123: -; AVX1-NEXT: movl %esi, %edx -; AVX1-NEXT: .LBB2_124: -; AVX1-NEXT: vpinsrb $15, %eax, %xmm4, %xmm2 -; AVX1-NEXT: vpinsrb $14, %ecx, %xmm6, %xmm4 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: movzbl %dl, %ecx -; AVX1-NEXT: vpinsrb $13, %ecx, %xmm5, %xmm5 -; AVX1-NEXT: vpextrb $14, %xmm3, %edx -; AVX1-NEXT: vpextrb $14, %xmm1, %ecx -; AVX1-NEXT: addb %dl, %cl -; AVX1-NEXT: movb $-1, %dl -; AVX1-NEXT: popq %rbx -; AVX1-NEXT: popq %r12 -; AVX1-NEXT: popq %r13 -; AVX1-NEXT: popq %r14 -; AVX1-NEXT: popq %r15 -; AVX1-NEXT: popq %rbp -; AVX1-NEXT: jb .LBB2_126 -; AVX1-NEXT: # %bb.125: -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: .LBB2_126: -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vpinsrb $15, %eax, %xmm4, %xmm2 -; AVX1-NEXT: movzbl %dl, %eax -; AVX1-NEXT: vpinsrb $14, %eax, %xmm5, %xmm4 -; AVX1-NEXT: vpextrb $15, %xmm3, %ecx -; AVX1-NEXT: vpextrb $15, %xmm1, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb $-1, %cl -; AVX1-NEXT: jb .LBB2_128 -; AVX1-NEXT: # %bb.127: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_128: -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $15, %eax, %xmm4, %xmm1 +; AVX1-NEXT: vpaddusb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpaddusb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpaddusb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpaddusb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v64i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-NEXT: vpextrb $1, %xmm4, %ecx -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-NEXT: vpextrb $1, %xmm5, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb $-1, %cl -; AVX2-NEXT: jb .LBB2_2 -; AVX2-NEXT: # %bb.1: -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: .LBB2_2: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; AVX2-NEXT: vpextrb $0, %xmm4, %ecx -; AVX2-NEXT: vpextrb $0, %xmm5, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb $-1, %cl -; AVX2-NEXT: jb .LBB2_4 -; AVX2-NEXT: # %bb.3: -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: .LBB2_4: -; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; AVX2-NEXT: vpextrb $2, %xmm4, %ecx -; AVX2-NEXT: vpextrb $2, %xmm5, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb $-1, %cl -; AVX2-NEXT: jb .LBB2_6 -; AVX2-NEXT: # %bb.5: -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: .LBB2_6: -; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; AVX2-NEXT: vpextrb $3, %xmm4, %ecx -; AVX2-NEXT: vpextrb $3, %xmm5, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb $-1, %cl -; AVX2-NEXT: jb .LBB2_8 -; AVX2-NEXT: # %bb.7: -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: .LBB2_8: -; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; AVX2-NEXT: vpextrb $4, %xmm4, %ecx -; AVX2-NEXT: vpextrb $4, %xmm5, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb $-1, %cl -; 
-; AVX2-NEXT: jb .LBB2_10
-; AVX2-NEXT: # %bb.9:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_10:
-; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $5, %xmm4, %ecx
-; AVX2-NEXT: vpextrb $5, %xmm5, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB2_12
-; AVX2-NEXT: # %bb.11:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_12:
-; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $6, %xmm4, %ecx
-; AVX2-NEXT: vpextrb $6, %xmm5, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB2_14
-; AVX2-NEXT: # %bb.13:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_14:
-; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $7, %xmm4, %ecx
-; AVX2-NEXT: vpextrb $7, %xmm5, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB2_16
-; AVX2-NEXT: # %bb.15:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_16:
-; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $8, %xmm4, %ecx
-; AVX2-NEXT: vpextrb $8, %xmm5, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB2_18
-; AVX2-NEXT: # %bb.17:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_18:
-; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $9, %xmm4, %ecx
-; AVX2-NEXT: vpextrb $9, %xmm5, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB2_20
-; AVX2-NEXT: # %bb.19:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_20:
-; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $10, %xmm4, %ecx
-; AVX2-NEXT: vpextrb $10, %xmm5, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB2_22
-; AVX2-NEXT: # %bb.21:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_22:
-; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $11, %xmm4, %ecx
-; AVX2-NEXT: vpextrb $11, %xmm5, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB2_24
-; AVX2-NEXT: # %bb.23:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_24:
-; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $12, %xmm4, %ecx
-; AVX2-NEXT: vpextrb $12, %xmm5, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB2_26
-; AVX2-NEXT: # %bb.25:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_26:
-; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $13, %xmm4, %ecx
-; AVX2-NEXT: vpextrb $13, %xmm5, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB2_28
-; AVX2-NEXT: # %bb.27:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_28:
-; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $14, %xmm4, %ecx
-; AVX2-NEXT: vpextrb $14, %xmm5, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB2_30
-; AVX2-NEXT: # %bb.29:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_30:
-; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $15, %xmm4, %ecx
-; AVX2-NEXT: vpextrb $15, %xmm5, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB2_32
-; AVX2-NEXT: # %bb.31:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_32:
-; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $1, %xmm2, %ecx
-; AVX2-NEXT: vpextrb $1, %xmm0, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %r11b
-; AVX2-NEXT: jb .LBB2_34
-; AVX2-NEXT: # %bb.33:
-; AVX2-NEXT: movl %eax, %r11d
-; AVX2-NEXT: .LBB2_34:
-; AVX2-NEXT: vpextrb $0, %xmm2, %ecx
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %r8b
-; AVX2-NEXT: jb .LBB2_36
-; AVX2-NEXT: # %bb.35:
-; AVX2-NEXT: movl %eax, %r8d
-; AVX2-NEXT: .LBB2_36:
-; AVX2-NEXT: vpextrb $2, %xmm2, %ecx
-; AVX2-NEXT: vpextrb $2, %xmm0, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB2_38
-; AVX2-NEXT: # %bb.37:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_38:
-; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $3, %xmm2, %ecx
-; AVX2-NEXT: vpextrb $3, %xmm0, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB2_40
-; AVX2-NEXT: # %bb.39:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_40:
-; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $4, %xmm2, %ecx
-; AVX2-NEXT: vpextrb $4, %xmm0, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB2_42
-; AVX2-NEXT: # %bb.41:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_42:
-; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $5, %xmm2, %ecx
-; AVX2-NEXT: vpextrb $5, %xmm0, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB2_44
-; AVX2-NEXT: # %bb.43:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_44:
-; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $6, %xmm2, %ecx
-; AVX2-NEXT: vpextrb $6, %xmm0, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB2_46
-; AVX2-NEXT: # %bb.45:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_46:
-; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $7, %xmm2, %ecx
-; AVX2-NEXT: vpextrb $7, %xmm0, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB2_48
-; AVX2-NEXT: # %bb.47:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_48:
-; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $8, %xmm2, %ecx
-; AVX2-NEXT: vpextrb $8, %xmm0, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB2_50
-; AVX2-NEXT: # %bb.49:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_50:
-; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $9, %xmm2, %ecx
-; AVX2-NEXT: vpextrb $9, %xmm0, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB2_52
-; AVX2-NEXT: # %bb.51:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_52:
-; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $10, %xmm2, %ecx
-; AVX2-NEXT: vpextrb $10, %xmm0, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB2_54
-; AVX2-NEXT: # %bb.53:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_54:
-; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $11, %xmm2, %ecx
-; AVX2-NEXT: vpextrb $11, %xmm0, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB2_56
-; AVX2-NEXT: # %bb.55:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_56:
-; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $12, %xmm2, %ecx
-; AVX2-NEXT: vpextrb $12, %xmm0, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB2_58
-; AVX2-NEXT: # %bb.57:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_58:
-; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $13, %xmm2, %ecx
-; AVX2-NEXT: vpextrb $13, %xmm0, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB2_60
-; AVX2-NEXT: # %bb.59:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_60:
-; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $14, %xmm2, %ecx
-; AVX2-NEXT: vpextrb $14, %xmm0, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB2_62
-; AVX2-NEXT: # %bb.61:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_62:
-; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $15, %xmm2, %ecx
-; AVX2-NEXT: vpextrb $15, %xmm0, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB2_64
-; AVX2-NEXT: # %bb.63:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_64:
-; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm0
-; AVX2-NEXT: vpextrb $1, %xmm0, %ecx
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpextrb $1, %xmm2, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %dil
-; AVX2-NEXT: jb .LBB2_66
-; AVX2-NEXT: # %bb.65:
-; AVX2-NEXT: movl %eax, %edi
-; AVX2-NEXT: .LBB2_66:
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: vpextrb $0, %xmm2, %ecx
-; AVX2-NEXT: addb %al, %cl
-; AVX2-NEXT: movb $-1, %al
-; AVX2-NEXT: jb .LBB2_68
-; AVX2-NEXT: # %bb.67:
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: .LBB2_68:
-; AVX2-NEXT: vpextrb $2, %xmm0, %esi
-; AVX2-NEXT: vpextrb $2, %xmm2, %ecx
-; AVX2-NEXT: addb %sil, %cl
-; AVX2-NEXT: movb $-1, %r15b
-; AVX2-NEXT: jb .LBB2_70
-; AVX2-NEXT: # %bb.69:
-; AVX2-NEXT: movl %ecx, %r15d
-; AVX2-NEXT: .LBB2_70:
-; AVX2-NEXT: vpextrb $3, %xmm0, %esi
-; AVX2-NEXT: vpextrb $3, %xmm2, %ecx
-; AVX2-NEXT: addb %sil, %cl
-; AVX2-NEXT: movb $-1, %sil
-; AVX2-NEXT: jb .LBB2_72
-; AVX2-NEXT: # %bb.71:
-; AVX2-NEXT: movl %ecx, %esi
-; AVX2-NEXT: .LBB2_72:
-; AVX2-NEXT: vpextrb $4, %xmm0, %ebp
-; AVX2-NEXT: vpextrb $4, %xmm2, %ecx
-; AVX2-NEXT: addb %bpl, %cl
-; AVX2-NEXT: movb $-1, %r14b
-; AVX2-NEXT: jb .LBB2_74
-; AVX2-NEXT: # %bb.73:
-; AVX2-NEXT: movl %ecx, %r14d
-; AVX2-NEXT: .LBB2_74:
-; AVX2-NEXT: vpextrb $5, %xmm0, %ebp
-; AVX2-NEXT: vpextrb $5, %xmm2, %ecx
-; AVX2-NEXT: addb %bpl, %cl
-; AVX2-NEXT: movb $-1, %r10b
-; AVX2-NEXT: jb .LBB2_76
-; AVX2-NEXT: # %bb.75:
-; AVX2-NEXT: movl %ecx, %r10d
-; AVX2-NEXT: .LBB2_76:
-; AVX2-NEXT: vpextrb $6, %xmm0, %ebp
-; AVX2-NEXT: vpextrb $6, %xmm2, %ecx
-; AVX2-NEXT: addb %bpl, %cl
-; AVX2-NEXT: movb $-1, %r9b
-; AVX2-NEXT: jb .LBB2_78
-; AVX2-NEXT: # %bb.77:
-; AVX2-NEXT: movl %ecx, %r9d
-; AVX2-NEXT: .LBB2_78:
-; AVX2-NEXT: vpextrb $7, %xmm0, %ebx
-; AVX2-NEXT: vpextrb $7, %xmm2, %ecx
-; AVX2-NEXT: addb %bl, %cl
-; AVX2-NEXT: movb $-1, %r13b
-; AVX2-NEXT: jb .LBB2_80
-; AVX2-NEXT: # %bb.79:
-; AVX2-NEXT: movl %ecx, %r13d
-; AVX2-NEXT: .LBB2_80:
-; AVX2-NEXT: vpextrb $8, %xmm0, %ebx
-; AVX2-NEXT: vpextrb $8, %xmm2, %ecx
-; AVX2-NEXT: addb %bl, %cl
-; AVX2-NEXT: movb $-1, %r12b
-; AVX2-NEXT: jb .LBB2_82
-; AVX2-NEXT: # %bb.81:
-; AVX2-NEXT: movl %ecx, %r12d
-; AVX2-NEXT: .LBB2_82:
-; AVX2-NEXT: vpextrb $9, %xmm0, %ebx
-; AVX2-NEXT: vpextrb $9, %xmm2, %ecx
-; AVX2-NEXT: addb %bl, %cl
-; AVX2-NEXT: movb $-1, %dl
-; AVX2-NEXT: jb .LBB2_84
-; AVX2-NEXT: # %bb.83:
-; AVX2-NEXT: movl %ecx, %edx
-; AVX2-NEXT: .LBB2_84:
-; AVX2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $10, %xmm0, %ebx
-; AVX2-NEXT: vpextrb $10, %xmm2, %ecx
-; AVX2-NEXT: addb %bl, %cl
-; AVX2-NEXT: movb $-1, %dl
-; AVX2-NEXT: jb .LBB2_86
-; AVX2-NEXT: # %bb.85:
-; AVX2-NEXT: movl %ecx, %edx
-; AVX2-NEXT: .LBB2_86:
-; AVX2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpextrb $11, %xmm0, %ebx
-; AVX2-NEXT: vpextrb $11, %xmm2, %ecx
-; AVX2-NEXT: addb %bl, %cl
-; AVX2-NEXT: movb $-1, %dl
-; AVX2-NEXT: jb .LBB2_88
-; AVX2-NEXT: # %bb.87:
-; AVX2-NEXT: movl %ecx, %edx
-; AVX2-NEXT: .LBB2_88:
-; AVX2-NEXT: vpextrb $12, %xmm0, %ebx
-; AVX2-NEXT: vpextrb $12, %xmm2, %ecx
-; AVX2-NEXT: addb %bl, %cl
-; AVX2-NEXT: movb $-1, %bl
-; AVX2-NEXT: jb .LBB2_90
-; AVX2-NEXT: # %bb.89:
-; AVX2-NEXT: movl %ecx, %ebx
-; AVX2-NEXT: .LBB2_90:
-; AVX2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: movzbl %r8b, %ecx
-; AVX2-NEXT: vpextrb $13, %xmm0, %ebx
-; AVX2-NEXT: vpextrb $13, %xmm2, %edx
-; AVX2-NEXT: addb %bl, %dl
-; AVX2-NEXT: movb $-1, %bl
-; AVX2-NEXT: jb .LBB2_92
-; AVX2-NEXT: # %bb.91:
-; AVX2-NEXT: movl %edx, %ebx
-; AVX2-NEXT: .LBB2_92:
-; AVX2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX2-NEXT: movzbl %r11b, %ebp
-; AVX2-NEXT: vmovd %ecx, %xmm4
-; AVX2-NEXT: vpextrb $14, %xmm0, %ebx
-; AVX2-NEXT: vpextrb $14, %xmm2, %ecx
-; AVX2-NEXT: addb %bl, %cl
-; AVX2-NEXT: movb $-1, %r8b
-; AVX2-NEXT: jb .LBB2_94
-; AVX2-NEXT: # %bb.93:
-; AVX2-NEXT: movl %ecx, %r8d
-; AVX2-NEXT: .LBB2_94:
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX2-NEXT: vmovd %edx, %xmm5
-; AVX2-NEXT: vpinsrb $1, %ebp, %xmm4, %xmm4
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX2-NEXT: movzbl %al, %r11d
-; AVX2-NEXT: vpextrb $15, %xmm0, %ebx
-; AVX2-NEXT: vpextrb $15, %xmm2, %eax
-; AVX2-NEXT: addb %bl, %al
-; AVX2-NEXT: movb $-1, %bl
-; AVX2-NEXT: jb .LBB2_96
-; AVX2-NEXT: # %bb.95:
-; AVX2-NEXT: movl %eax, %ebx
-; AVX2-NEXT: .LBB2_96:
-; AVX2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: vpinsrb $1, %ecx, %xmm5, %xmm0
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $2, %edx, %xmm4, %xmm2
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX2-NEXT: movzbl %dil, %eax
-; AVX2-NEXT: vmovd %r11d, %xmm4
-; AVX2-NEXT: vpextrb $1, %xmm3, %ebp
-; AVX2-NEXT: vpextrb $1, %xmm1, %edi
-; AVX2-NEXT: addb %bpl, %dil
-; AVX2-NEXT: movb $-1, %bl
-; AVX2-NEXT: jb .LBB2_98
-; AVX2-NEXT: # %bb.97:
-; AVX2-NEXT: movl %edi, %ebx
-; AVX2-NEXT: .LBB2_98:
-; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $3, %edx, %xmm2, %xmm2
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl %r15b, %ebp
-; AVX2-NEXT: movzbl %bl, %edi
-; AVX2-NEXT: vpextrb $0, %xmm3, %ebx
-; AVX2-NEXT: vpextrb $0, %xmm1, %eax
-; AVX2-NEXT: addb %bl, %al
-; AVX2-NEXT: movb $-1, %bl
-; AVX2-NEXT: jb .LBB2_100
-; AVX2-NEXT: # %bb.99:
-; AVX2-NEXT: movl %eax, %ebx
-; AVX2-NEXT: .LBB2_100:
-; AVX2-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $4, %edx, %xmm2, %xmm2
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $2, %ebp, %xmm4, %xmm4
-; AVX2-NEXT: movzbl %sil, %esi
-; AVX2-NEXT: movzbl %bl, %edx
-; AVX2-NEXT: vmovd %edx, %xmm5
-; AVX2-NEXT: vpinsrb $1, %edi, %xmm5, %xmm5
-; AVX2-NEXT: vpextrb $2, %xmm3, %edi
-; AVX2-NEXT: vpextrb $2, %xmm1, %edx
-; AVX2-NEXT: addb %dil, %dl
-; AVX2-NEXT: movb $-1, %bl
-; AVX2-NEXT: jb .LBB2_102
-; AVX2-NEXT: # %bb.101:
-; AVX2-NEXT: movl %edx, %ebx
-; AVX2-NEXT: .LBB2_102:
-; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $3, %esi, %xmm4, %xmm4
-; AVX2-NEXT: movzbl %r14b, %edx
-; AVX2-NEXT: movzbl %bl, %esi
-; AVX2-NEXT: vpinsrb $2, %esi, %xmm5, %xmm5
-; AVX2-NEXT: vpextrb $3, %xmm3, %edi
-; AVX2-NEXT: vpextrb $3, %xmm1, %esi
-; AVX2-NEXT: addb %dil, %sil
-; AVX2-NEXT: movb $-1, %bl
-; AVX2-NEXT: jb .LBB2_104
-; AVX2-NEXT: # %bb.103:
-; AVX2-NEXT: movl %esi, %ebx
-; AVX2-NEXT: .LBB2_104:
-; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $4, %edx, %xmm4, %xmm4
-; AVX2-NEXT: movzbl %r10b, %edx
-; AVX2-NEXT: movzbl %bl, %esi
-; AVX2-NEXT: vpinsrb $3, %esi, %xmm5, %xmm5
-; AVX2-NEXT: vpextrb $4, %xmm3, %edi
-; AVX2-NEXT: vpextrb $4, %xmm1, %esi
-; AVX2-NEXT: addb %dil, %sil
-; AVX2-NEXT: movb $-1, %bl
-; AVX2-NEXT: jb .LBB2_106
-; AVX2-NEXT: # %bb.105:
-; AVX2-NEXT: movl %esi, %ebx
-; AVX2-NEXT: .LBB2_106:
-; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $5, %edx, %xmm4, %xmm4
-; AVX2-NEXT: movzbl %r9b, %edx
-; AVX2-NEXT: movzbl %bl, %esi
-; AVX2-NEXT: vpinsrb $4, %esi, %xmm5, %xmm5
-; AVX2-NEXT: vpextrb $5, %xmm3, %edi
-; AVX2-NEXT: vpextrb $5, %xmm1, %esi
-; AVX2-NEXT: addb %dil, %sil
-; AVX2-NEXT: movb $-1, %bl
-; AVX2-NEXT: jb .LBB2_108
-; AVX2-NEXT: # %bb.107:
-; AVX2-NEXT: movl %esi, %ebx
-; AVX2-NEXT: .LBB2_108:
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4
-; AVX2-NEXT: movzbl %r13b, %edx
-; AVX2-NEXT: movzbl %bl, %esi
-; AVX2-NEXT: vpinsrb $5, %esi, %xmm5, %xmm5
-; AVX2-NEXT: vpextrb $6, %xmm3, %edi
-; AVX2-NEXT: vpextrb $6, %xmm1, %esi
-; AVX2-NEXT: addb %dil, %sil
-; AVX2-NEXT: movb $-1, %bl
-; AVX2-NEXT: jb .LBB2_110
-; AVX2-NEXT: # %bb.109:
-; AVX2-NEXT: movl %esi, %ebx
-; AVX2-NEXT: .LBB2_110:
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $7, %edx, %xmm4, %xmm4
-; AVX2-NEXT: movzbl %r12b, %edx
-; AVX2-NEXT: movzbl %bl, %esi
-; AVX2-NEXT: vpinsrb $6, %esi, %xmm5, %xmm5
-; AVX2-NEXT: vpextrb $7, %xmm3, %edi
-; AVX2-NEXT: vpextrb $7, %xmm1, %esi
-; AVX2-NEXT: addb %dil, %sil
-; AVX2-NEXT: movb $-1, %bl
-; AVX2-NEXT: jb .LBB2_112
-; AVX2-NEXT: # %bb.111:
-; AVX2-NEXT: movl %esi, %ebx
-; AVX2-NEXT: .LBB2_112:
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $8, %edx, %xmm4, %xmm4
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX2-NEXT: movzbl %bl, %esi
-; AVX2-NEXT: vpinsrb $7, %esi, %xmm5, %xmm5
-; AVX2-NEXT: vpextrb $8, %xmm3, %edi
-; AVX2-NEXT: vpextrb $8, %xmm1, %esi
-; AVX2-NEXT: addb %dil, %sil
-; AVX2-NEXT: movb $-1, %bl
-; AVX2-NEXT: jb .LBB2_114
-; AVX2-NEXT: # %bb.113:
-; AVX2-NEXT: movl %esi, %ebx
-; AVX2-NEXT: .LBB2_114:
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $9, %edx, %xmm4, %xmm4
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX2-NEXT: movzbl %bl, %esi
-; AVX2-NEXT: vpinsrb $8, %esi, %xmm5, %xmm5
-; AVX2-NEXT: vpextrb $9, %xmm3, %edi
-; AVX2-NEXT: vpextrb $9, %xmm1, %esi
-; AVX2-NEXT: addb %dil, %sil
-; AVX2-NEXT: movb $-1, %bl
-; AVX2-NEXT: jb .LBB2_116
-; AVX2-NEXT: # %bb.115:
-; AVX2-NEXT: movl %esi, %ebx
-; AVX2-NEXT: .LBB2_116:
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $10, %edx, %xmm4, %xmm4
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX2-NEXT: movzbl %bl, %esi
-; AVX2-NEXT: vpinsrb $9, %esi, %xmm5, %xmm5
-; AVX2-NEXT: vpextrb $10, %xmm3, %edi
-; AVX2-NEXT: vpextrb $10, %xmm1, %esi
-; AVX2-NEXT: addb %dil, %sil
-; AVX2-NEXT: movb $-1, %bl
-; AVX2-NEXT: jb .LBB2_118
-; AVX2-NEXT: # %bb.117:
-; AVX2-NEXT: movl %esi, %ebx
-; AVX2-NEXT: .LBB2_118:
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $11, %edx, %xmm4, %xmm4
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX2-NEXT: movzbl %bl, %esi
-; AVX2-NEXT: vpinsrb $10, %esi, %xmm5, %xmm5
-; AVX2-NEXT: vpextrb $11, %xmm3, %edi
-; AVX2-NEXT: vpextrb $11, %xmm1, %esi
-; AVX2-NEXT: addb %dil, %sil
-; AVX2-NEXT: movb $-1, %bl
-; AVX2-NEXT: jb .LBB2_120
-; AVX2-NEXT: # %bb.119:
-; AVX2-NEXT: movl %esi, %ebx
-; AVX2-NEXT: .LBB2_120:
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $12, %edx, %xmm4, %xmm6
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX2-NEXT: movzbl %bl, %esi
-; AVX2-NEXT: vpinsrb $11, %esi, %xmm5, %xmm5
-; AVX2-NEXT: vpextrb $12, %xmm3, %edi
-; AVX2-NEXT: vpextrb $12, %xmm1, %esi
-; AVX2-NEXT: addb %dil, %sil
-; AVX2-NEXT: movb $-1, %bl
-; AVX2-NEXT: jb .LBB2_122
-; AVX2-NEXT: # %bb.121:
-; AVX2-NEXT: movl %esi, %ebx
-; AVX2-NEXT: .LBB2_122:
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm4
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm0
-; AVX2-NEXT: vpinsrb $13, %edx, %xmm6, %xmm6
-; AVX2-NEXT: movzbl %r8b, %ecx
-; AVX2-NEXT: movzbl %bl, %edx
-; AVX2-NEXT: vpinsrb $12, %edx, %xmm5, %xmm5
-; AVX2-NEXT: vpextrb $13, %xmm3, %edx
-; AVX2-NEXT: vpextrb $13, %xmm1, %esi
-; AVX2-NEXT: addb %dl, %sil
-; AVX2-NEXT: movb $-1, %dl
-; AVX2-NEXT: jb .LBB2_124
-; AVX2-NEXT: # %bb.123:
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: .LBB2_124:
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm4, %xmm2
-; AVX2-NEXT: vpinsrb $14, %ecx, %xmm6, %xmm4
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT: movzbl %dl, %ecx
-; AVX2-NEXT: vpinsrb $13, %ecx, %xmm5, %xmm5
-; AVX2-NEXT: vpextrb $14, %xmm3, %edx
-; AVX2-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX2-NEXT: addb %dl, %cl
-; AVX2-NEXT: movb $-1, %dl
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: jb .LBB2_126
-; AVX2-NEXT: # %bb.125:
-; AVX2-NEXT: movl %ecx, %edx
-; AVX2-NEXT: .LBB2_126:
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm4, %xmm2
-; AVX2-NEXT: movzbl %dl, %eax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm5, %xmm4
-; AVX2-NEXT: vpextrb $15, %xmm3, %ecx
-; AVX2-NEXT: vpextrb $15, %xmm1, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: movb $-1, %cl
-; AVX2-NEXT: jb .LBB2_128
-; AVX2-NEXT: # %bb.127:
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: .LBB2_128:
-; AVX2-NEXT: movzbl %cl, %eax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm4, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddusb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpaddusb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: v64i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm2
-; AVX512-NEXT: vpextrb $1, %xmm2, %ecx
-; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm3
-; AVX512-NEXT: vpextrb $1, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_2
-; AVX512-NEXT: # %bb.1:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_2:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $0, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $0, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_4
-; AVX512-NEXT: # %bb.3:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_4:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $2, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $2, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_6
-; AVX512-NEXT: # %bb.5:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_6:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $3, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $3, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_8
-; AVX512-NEXT: # %bb.7:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_8:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $4, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $4, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_10
-; AVX512-NEXT: # %bb.9:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_10:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $5, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $5, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_12
-; AVX512-NEXT: # %bb.11:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_12:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $6, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $6, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_14
-; AVX512-NEXT: # %bb.13:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_14:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $7, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $7, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_16
-; AVX512-NEXT: # %bb.15:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_16:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $8, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $8, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_18
-; AVX512-NEXT: # %bb.17:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_18:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $9, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $9, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_20
-; AVX512-NEXT: # %bb.19:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_20:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $10, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $10, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_22
-; AVX512-NEXT: # %bb.21:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_22:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $11, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $11, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_24
-; AVX512-NEXT: # %bb.23:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_24:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $12, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $12, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_26
-; AVX512-NEXT: # %bb.25:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_26:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $13, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $13, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_28
-; AVX512-NEXT: # %bb.27:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_28:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $14, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $14, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_30
-; AVX512-NEXT: # %bb.29:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_30:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $15, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $15, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_32
-; AVX512-NEXT: # %bb.31:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_32:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2
-; AVX512-NEXT: vpextrb $1, %xmm2, %ecx
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; AVX512-NEXT: vpextrb $1, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %r11b
-; AVX512-NEXT: jb .LBB2_34
-; AVX512-NEXT: # %bb.33:
-; AVX512-NEXT: movl %eax, %r11d
-; AVX512-NEXT: .LBB2_34:
-; AVX512-NEXT: vpextrb $0, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $0, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %r8b
-; AVX512-NEXT: jb .LBB2_36
-; AVX512-NEXT: # %bb.35:
-; AVX512-NEXT: movl %eax, %r8d
-; AVX512-NEXT: .LBB2_36:
-; AVX512-NEXT: vpextrb $2, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $2, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_38
-; AVX512-NEXT: # %bb.37:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_38:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $3, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $3, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_40
-; AVX512-NEXT: # %bb.39:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_40:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $4, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $4, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_42
-; AVX512-NEXT: # %bb.41:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_42:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $5, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $5, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_44
-; AVX512-NEXT: # %bb.43:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_44:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $6, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $6, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_46
-; AVX512-NEXT: # %bb.45:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_46:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $7, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $7, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_48
-; AVX512-NEXT: # %bb.47:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_48:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $8, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $8, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_50
-; AVX512-NEXT: # %bb.49:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_50:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $9, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $9, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_52
-; AVX512-NEXT: # %bb.51:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_52:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $10, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $10, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_54
-; AVX512-NEXT: # %bb.53:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_54:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $11, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $11, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_56
-; AVX512-NEXT: # %bb.55:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_56:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $12, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $12, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_58
-; AVX512-NEXT: # %bb.57:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_58:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $13, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $13, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_60
-; AVX512-NEXT: # %bb.59:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_60:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $14, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $14, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_62
-; AVX512-NEXT: # %bb.61:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_62:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $15, %xmm2, %ecx
-; AVX512-NEXT: vpextrb $15, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_64
-; AVX512-NEXT: # %bb.63:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_64:
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT: vpextrb $1, %xmm2, %ecx
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512-NEXT: vpextrb $1, %xmm3, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %dil
-; AVX512-NEXT: jb .LBB2_66
-; AVX512-NEXT: # %bb.65:
-; AVX512-NEXT: movl %eax, %edi
-; AVX512-NEXT: .LBB2_66:
-; AVX512-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512-NEXT: vpextrb $0, %xmm3, %ecx
-; AVX512-NEXT: addb %al, %cl
-; AVX512-NEXT: movb $-1, %al
-; AVX512-NEXT: jb .LBB2_68
-; AVX512-NEXT: # %bb.67:
-; AVX512-NEXT: movl %ecx, %eax
-; AVX512-NEXT: .LBB2_68:
-; AVX512-NEXT: vpextrb $2, %xmm2, %esi
-; AVX512-NEXT: vpextrb $2, %xmm3, %ecx
-; AVX512-NEXT: addb %sil, %cl
-; AVX512-NEXT: movb $-1, %r15b
-; AVX512-NEXT: jb .LBB2_70
-; AVX512-NEXT: # %bb.69:
-; AVX512-NEXT: movl %ecx, %r15d
-; AVX512-NEXT: .LBB2_70:
-; AVX512-NEXT: vpextrb $3, %xmm2, %esi
-; AVX512-NEXT: vpextrb $3, %xmm3, %ecx
-; AVX512-NEXT: addb %sil, %cl
-; AVX512-NEXT: movb $-1, %sil
-; AVX512-NEXT: jb .LBB2_72
-; AVX512-NEXT: # %bb.71:
-; AVX512-NEXT: movl %ecx, %esi
-; AVX512-NEXT: .LBB2_72:
-; AVX512-NEXT: vpextrb $4, %xmm2, %ebp
-; AVX512-NEXT: vpextrb $4, %xmm3, %ecx
-; AVX512-NEXT: addb %bpl, %cl
-; AVX512-NEXT: movb $-1, %r14b
-; AVX512-NEXT: jb .LBB2_74
-; AVX512-NEXT: # %bb.73:
-; AVX512-NEXT: movl %ecx, %r14d
-; AVX512-NEXT: .LBB2_74:
-; AVX512-NEXT: vpextrb $5, %xmm2, %ebp
-; AVX512-NEXT: vpextrb $5, %xmm3, %ecx
-; AVX512-NEXT: addb %bpl, %cl
-; AVX512-NEXT: movb $-1, %r10b
-; AVX512-NEXT: jb .LBB2_76
-; AVX512-NEXT: # %bb.75:
-; AVX512-NEXT: movl %ecx, %r10d
-; AVX512-NEXT: .LBB2_76:
-; AVX512-NEXT: vpextrb $6, %xmm2, %ebp
-; AVX512-NEXT: vpextrb $6, %xmm3, %ecx
-; AVX512-NEXT: addb %bpl, %cl
-; AVX512-NEXT: movb $-1, %r9b
-; AVX512-NEXT: jb .LBB2_78
-; AVX512-NEXT: # %bb.77:
-; AVX512-NEXT: movl %ecx, %r9d
-; AVX512-NEXT: .LBB2_78:
-; AVX512-NEXT: vpextrb $7, %xmm2, %ebx
-; AVX512-NEXT: vpextrb $7, %xmm3, %ecx
-; AVX512-NEXT: addb %bl, %cl
-; AVX512-NEXT: movb $-1, %r13b
-; AVX512-NEXT: jb .LBB2_80
-; AVX512-NEXT: # %bb.79:
-; AVX512-NEXT: movl %ecx, %r13d
-; AVX512-NEXT: .LBB2_80:
-; AVX512-NEXT: vpextrb $8, %xmm2, %ebx
-; AVX512-NEXT: vpextrb $8, %xmm3, %ecx
-; AVX512-NEXT: addb %bl, %cl
-; AVX512-NEXT: movb $-1, %r12b
-; AVX512-NEXT: jb .LBB2_82
-; AVX512-NEXT: # %bb.81:
-; AVX512-NEXT: movl %ecx, %r12d
-; AVX512-NEXT: .LBB2_82:
-; AVX512-NEXT: vpextrb $9, %xmm2, %ebx
-; AVX512-NEXT: vpextrb $9, %xmm3, %ecx
-; AVX512-NEXT: addb %bl, %cl
-; AVX512-NEXT: movb $-1, %dl
-; AVX512-NEXT: jb .LBB2_84
-; AVX512-NEXT: # %bb.83:
-; AVX512-NEXT: movl %ecx, %edx
-; AVX512-NEXT: .LBB2_84:
-; AVX512-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $10, %xmm2, %ebx
-; AVX512-NEXT: vpextrb $10, %xmm3, %ecx
-; AVX512-NEXT: addb %bl, %cl
-; AVX512-NEXT: movb $-1, %dl
-; AVX512-NEXT: jb .LBB2_86
-; AVX512-NEXT: # %bb.85:
-; AVX512-NEXT: movl %ecx, %edx
-; AVX512-NEXT: .LBB2_86:
-; AVX512-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpextrb $11, %xmm2, %ebx
-; AVX512-NEXT: vpextrb $11, %xmm3, %ecx
-; AVX512-NEXT: addb %bl, %cl
-; AVX512-NEXT: movb $-1, %dl
-; AVX512-NEXT: jb .LBB2_88
-; AVX512-NEXT: # %bb.87:
-; AVX512-NEXT: movl %ecx, %edx
-; AVX512-NEXT: .LBB2_88:
-; AVX512-NEXT: vpextrb $12, %xmm2, %ebx
-; AVX512-NEXT: vpextrb $12, %xmm3, %ecx
-; AVX512-NEXT: addb %bl, %cl
-; AVX512-NEXT: movb $-1, %bl
-; AVX512-NEXT: jb .LBB2_90
-; AVX512-NEXT: # %bb.89:
-; AVX512-NEXT: movl %ecx, %ebx
-; AVX512-NEXT: .LBB2_90:
-; AVX512-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: movzbl %r8b, %ecx
-; AVX512-NEXT: vpextrb $13, %xmm2, %ebx
-; AVX512-NEXT: vpextrb $13, %xmm3, %edx
-; AVX512-NEXT: addb %bl, %dl
-; AVX512-NEXT: movb $-1, %bl
-; AVX512-NEXT: jb .LBB2_92
-; AVX512-NEXT: # %bb.91:
-; AVX512-NEXT: movl %edx, %ebx
-; AVX512-NEXT: .LBB2_92:
-; AVX512-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX512-NEXT: movzbl %r11b, %ebp
-; AVX512-NEXT: vmovd %ecx, %xmm4
-; AVX512-NEXT: vpextrb $14, %xmm2, %ebx
-; AVX512-NEXT: vpextrb $14, %xmm3, %ecx
-; AVX512-NEXT: addb %bl, %cl
-; AVX512-NEXT: movb $-1, %r8b
-; AVX512-NEXT: jb .LBB2_94
-; AVX512-NEXT: # %bb.93:
-; AVX512-NEXT: movl %ecx, %r8d
-; AVX512-NEXT: .LBB2_94:
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX512-NEXT: vmovd %edx, %xmm5
-; AVX512-NEXT: vpinsrb $1, %ebp, %xmm4, %xmm4
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX512-NEXT: movzbl %al, %r11d
-; AVX512-NEXT: vpextrb $15, %xmm2, %ebx
-; AVX512-NEXT: vpextrb $15, %xmm3, %eax
-; AVX512-NEXT: addb %bl, %al
-; AVX512-NEXT: movb $-1, %bl
-; AVX512-NEXT: jb .LBB2_96
-; AVX512-NEXT: # %bb.95:
-; AVX512-NEXT: movl %eax, %ebx
-; AVX512-NEXT: .LBB2_96:
-; AVX512-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: vpinsrb $1, %ecx, %xmm5, %xmm2
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $2, %edx, %xmm4, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX512-NEXT: movzbl %dil, %eax
-; AVX512-NEXT: vmovd %r11d, %xmm4
-; AVX512-NEXT: vpextrb $1, %xmm1, %ebp
-; AVX512-NEXT: vpextrb $1, %xmm0, %edi
-; AVX512-NEXT: addb %bpl, %dil
-; AVX512-NEXT: movb $-1, %bl
-; AVX512-NEXT: jb .LBB2_98
-; AVX512-NEXT: # %bb.97:
-; AVX512-NEXT: movl %edi, %ebx
-; AVX512-NEXT: .LBB2_98:
-; AVX512-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $3, %edx, %xmm3, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512-NEXT: movzbl %r15b, %ebp
-; AVX512-NEXT: movzbl %bl, %edi
-; AVX512-NEXT: vpextrb $0, %xmm1, %ebx
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: addb %bl, %al
-; AVX512-NEXT: movb $-1, %bl
-; AVX512-NEXT: jb .LBB2_100
-; AVX512-NEXT: # %bb.99:
-; AVX512-NEXT: movl %eax, %ebx
-; AVX512-NEXT: .LBB2_100:
-; AVX512-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $4, %edx, %xmm3, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $2, %ebp, %xmm4, %xmm4
-; AVX512-NEXT: movzbl %sil, %esi
-; AVX512-NEXT: movzbl %bl, %edx
-; AVX512-NEXT: vmovd %edx, %xmm5
-; AVX512-NEXT: vpinsrb $1, %edi, %xmm5, %xmm5
-; AVX512-NEXT: vpextrb $2, %xmm1, %edi
-; AVX512-NEXT: vpextrb $2, %xmm0, %edx
-; AVX512-NEXT: addb %dil, %dl
-; AVX512-NEXT: movb $-1, %bl
-; AVX512-NEXT: jb .LBB2_102
-; AVX512-NEXT: # %bb.101:
-; AVX512-NEXT: movl %edx, %ebx
-; AVX512-NEXT: .LBB2_102:
-; AVX512-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $5, %ecx, %xmm3, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $3, %esi, %xmm4, %xmm4
-; AVX512-NEXT: movzbl %r14b, %edx
-; AVX512-NEXT: movzbl %bl, %esi
-; AVX512-NEXT: vpinsrb $2, %esi, %xmm5, %xmm5
-; AVX512-NEXT: vpextrb $3, %xmm1, %edi
-; AVX512-NEXT: vpextrb $3, %xmm0, %esi
-; AVX512-NEXT: addb %dil, %sil
-; AVX512-NEXT: movb $-1, %bl
-; AVX512-NEXT: jb .LBB2_104
-; AVX512-NEXT: # %bb.103:
-; AVX512-NEXT: movl %esi, %ebx
-; AVX512-NEXT: .LBB2_104:
-; AVX512-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $6, %ecx, %xmm3, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $4, %edx, %xmm4, %xmm4
-; AVX512-NEXT: movzbl %r10b, %edx
-; AVX512-NEXT: movzbl %bl, %esi
-; AVX512-NEXT: vpinsrb $3, %esi, %xmm5, %xmm5
-; AVX512-NEXT: vpextrb $4, %xmm1, %edi
-; AVX512-NEXT: vpextrb $4, %xmm0, %esi
-; AVX512-NEXT: addb %dil, %sil
-; AVX512-NEXT: movb $-1, %bl
-; AVX512-NEXT: jb .LBB2_106
-; AVX512-NEXT: # %bb.105:
-; AVX512-NEXT: movl %esi, %ebx
-; AVX512-NEXT: .LBB2_106:
-; AVX512-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $7, %ecx, %xmm3, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $5, %edx, %xmm4, %xmm4
-; AVX512-NEXT: movzbl %r9b, %edx
-; AVX512-NEXT: movzbl %bl, %esi
-; AVX512-NEXT: vpinsrb $4, %esi, %xmm5, %xmm5
-; AVX512-NEXT: vpextrb $5, %xmm1, %edi
-; AVX512-NEXT: vpextrb $5, %xmm0, %esi
-; AVX512-NEXT: addb %dil, %sil
-; AVX512-NEXT: movb $-1, %bl
-; AVX512-NEXT: jb .LBB2_108
-; AVX512-NEXT: # %bb.107:
-; AVX512-NEXT: movl %esi, %ebx
-; AVX512-NEXT: .LBB2_108:
-; AVX512-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $8, %ecx, %xmm3, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4
-; AVX512-NEXT: movzbl %r13b, %edx
-; AVX512-NEXT: movzbl %bl, %esi
-; AVX512-NEXT: vpinsrb $5, %esi, %xmm5, %xmm5
-; AVX512-NEXT: vpextrb $6, %xmm1, %edi
-; AVX512-NEXT: vpextrb $6, %xmm0, %esi
-; AVX512-NEXT: addb %dil, %sil
-; AVX512-NEXT: movb $-1, %bl
-; AVX512-NEXT: jb .LBB2_110
-; AVX512-NEXT: # %bb.109:
-; AVX512-NEXT: movl %esi, %ebx
-; AVX512-NEXT: .LBB2_110:
-; AVX512-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $9, %ecx, %xmm3, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $7, %edx, %xmm4, %xmm4
-; AVX512-NEXT: movzbl %r12b, %edx
-; AVX512-NEXT: movzbl %bl, %esi
-; AVX512-NEXT: vpinsrb $6, %esi, %xmm5, %xmm5
-; AVX512-NEXT: vpextrb $7, %xmm1, %edi
-; AVX512-NEXT: vpextrb $7, %xmm0, %esi
-; AVX512-NEXT: addb %dil, %sil
-; AVX512-NEXT: movb $-1, %bl
-; AVX512-NEXT: jb .LBB2_112
-; AVX512-NEXT: # %bb.111:
-; AVX512-NEXT: movl %esi, %ebx
-; AVX512-NEXT: .LBB2_112:
-; AVX512-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $10, %ecx, %xmm3, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $8, %edx, %xmm4, %xmm4
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX512-NEXT: movzbl %bl, %esi
-; AVX512-NEXT: vpinsrb $7, %esi, %xmm5, %xmm5
-; AVX512-NEXT: vpextrb $8, %xmm1, %edi
-; AVX512-NEXT: vpextrb $8, %xmm0, %esi
-; AVX512-NEXT: addb %dil, %sil
-; AVX512-NEXT: movb $-1, %bl
-; AVX512-NEXT: jb .LBB2_114
-; AVX512-NEXT: # %bb.113:
-; AVX512-NEXT: movl %esi, %ebx
-; AVX512-NEXT: .LBB2_114:
-; AVX512-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $11, %ecx, %xmm3, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $9, %edx, %xmm4, %xmm4
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX512-NEXT: movzbl %bl, %esi
-; AVX512-NEXT: vpinsrb $8, %esi, %xmm5, %xmm5
-; AVX512-NEXT: vpextrb $9, %xmm1, %edi
-; AVX512-NEXT: vpextrb $9, %xmm0, %esi
-; AVX512-NEXT: addb %dil, %sil
-; AVX512-NEXT: movb $-1, %bl
-; AVX512-NEXT: jb .LBB2_116
-; AVX512-NEXT: # %bb.115:
-; AVX512-NEXT: movl %esi, %ebx
-; AVX512-NEXT: .LBB2_116:
-; AVX512-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $10, %edx, %xmm4, %xmm4
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX512-NEXT: movzbl %bl, %esi
-; AVX512-NEXT: vpinsrb $9, %esi, %xmm5, %xmm5
-; AVX512-NEXT: vpextrb $10, %xmm1, %edi
-; AVX512-NEXT: vpextrb $10, %xmm0, %esi
-; AVX512-NEXT: addb %dil, %sil
-; AVX512-NEXT: movb $-1, %bl
-; AVX512-NEXT: jb .LBB2_118
-; AVX512-NEXT: # %bb.117:
-; AVX512-NEXT: movl %esi, %ebx
-; AVX512-NEXT: .LBB2_118:
-; AVX512-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $13, %ecx, %xmm3, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $11, %edx, %xmm4, %xmm4
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX512-NEXT: movzbl %bl, %esi
-; AVX512-NEXT: vpinsrb $10, %esi, %xmm5, %xmm5
-; AVX512-NEXT: vpextrb $11, %xmm1, %edi
-; AVX512-NEXT: vpextrb $11, %xmm0, %esi
-; AVX512-NEXT: addb %dil, %sil
-; AVX512-NEXT: movb $-1, %bl
-; AVX512-NEXT: jb .LBB2_120
-; AVX512-NEXT: # %bb.119:
-; AVX512-NEXT: movl %esi, %ebx
-; AVX512-NEXT: .LBB2_120:
-; AVX512-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $14, %ecx, %xmm3, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $12, %edx, %xmm4, %xmm6
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX512-NEXT: movzbl %bl, %esi
-; AVX512-NEXT: vpinsrb $11, %esi, %xmm5, %xmm5
-; AVX512-NEXT: vpextrb $12, %xmm1, %edi
-; AVX512-NEXT: vpextrb $12, %xmm0, %esi
-; AVX512-NEXT: addb %dil, %sil
-; AVX512-NEXT: movb $-1, %bl
-; AVX512-NEXT: jb .LBB2_122
-; AVX512-NEXT: # %bb.121:
-; AVX512-NEXT: movl %esi, %ebx
-; AVX512-NEXT: .LBB2_122:
-; AVX512-NEXT: vpinsrb $14, %eax, %xmm2, %xmm4
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $15, %ecx, %xmm3, %xmm2
-; AVX512-NEXT: vpinsrb $13, %edx, %xmm6, %xmm6
-; AVX512-NEXT: movzbl %r8b, %ecx
-; AVX512-NEXT: movzbl %bl, %edx
-; AVX512-NEXT: vpinsrb $12, %edx, %xmm5, %xmm5
-; AVX512-NEXT: vpextrb $13, %xmm1, %edx
-; AVX512-NEXT: vpextrb $13, %xmm0, %esi
-; AVX512-NEXT: addb %dl, %sil
-; AVX512-NEXT: movb $-1, %dl
-; AVX512-NEXT: jb .LBB2_124
-; AVX512-NEXT: # %bb.123:
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: .LBB2_124:
-; AVX512-NEXT: vpinsrb $15, %eax, %xmm4, %xmm3
-; AVX512-NEXT: vpinsrb $14, %ecx, %xmm6, %xmm4
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: movzbl %dl, %ecx
-; AVX512-NEXT: vpinsrb $13, %ecx, %xmm5, %xmm5
-; AVX512-NEXT: vpextrb $14, %xmm1, %edx
-; AVX512-NEXT: vpextrb $14, %xmm0, %ecx
-; AVX512-NEXT: addb %dl, %cl
-; AVX512-NEXT: movb $-1, %dl
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: jb .LBB2_126
-; AVX512-NEXT: # %bb.125:
-; AVX512-NEXT: movl %ecx, %edx
-; AVX512-NEXT: .LBB2_126:
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512-NEXT: vpinsrb $15, %eax, %xmm4, %xmm3
-; AVX512-NEXT: movzbl %dl, %eax
-; AVX512-NEXT: vpinsrb $14, %eax, %xmm5, %xmm4
-; AVX512-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX512-NEXT: vpextrb $15, %xmm0, %eax
-; AVX512-NEXT: addb %cl, %al
-; AVX512-NEXT: movb $-1, %cl
-; AVX512-NEXT: jb .LBB2_128
-; AVX512-NEXT: # %bb.127:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_128:
-; AVX512-NEXT: movzbl %cl, %eax
-; AVX512-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vpaddusb %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
%z = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %x, <64 x i8> %y)
ret <64 x i8> %z
}

define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
-; SSE2-LABEL: v8i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pextrw $7, %xmm1, %eax
-; SSE2-NEXT: pextrw $7, %xmm0, %ecx
-; SSE2-NEXT: addw %ax, %cx
-; SSE2-NEXT: movl $65535, %eax # imm = 0xFFFF
-; SSE2-NEXT: cmovbl %eax, %ecx
-; SSE2-NEXT: movd %ecx, %xmm2
-; SSE2-NEXT: pextrw $6,
%xmm1, %ecx -; SSE2-NEXT: pextrw $6, %xmm0, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: pextrw $5, %xmm1, %ecx -; SSE2-NEXT: pextrw $5, %xmm0, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm4 -; SSE2-NEXT: pextrw $4, %xmm1, %ecx -; SSE2-NEXT: pextrw $4, %xmm0, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE2-NEXT: pextrw $3, %xmm1, %ecx -; SSE2-NEXT: pextrw $3, %xmm0, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm3 -; SSE2-NEXT: pextrw $2, %xmm1, %ecx -; SSE2-NEXT: pextrw $2, %xmm0, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE2-NEXT: pextrw $1, %xmm1, %ecx -; SSE2-NEXT: pextrw $1, %xmm0, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm3 -; SSE2-NEXT: movd %xmm1, %ecx -; SSE2-NEXT: movd %xmm0, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v8i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pextrw $7, %xmm1, %eax -; SSSE3-NEXT: pextrw $7, %xmm0, %ecx -; SSSE3-NEXT: addw %ax, %cx -; SSSE3-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSSE3-NEXT: cmovbl %eax, %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: pextrw $6, %xmm1, %ecx -; SSSE3-NEXT: pextrw $6, %xmm0, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSSE3-NEXT: pextrw $5, %xmm1, %ecx -; SSSE3-NEXT: pextrw $5, %xmm0, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm4 -; SSSE3-NEXT: pextrw $4, %xmm1, %ecx -; SSSE3-NEXT: pextrw $4, %xmm0, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSSE3-NEXT: pextrw $3, %xmm1, %ecx -; SSSE3-NEXT: pextrw $3, %xmm0, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm3 -; SSSE3-NEXT: pextrw $2, %xmm1, %ecx -; SSSE3-NEXT: pextrw $2, %xmm0, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSSE3-NEXT: pextrw $1, %xmm1, %ecx -; SSSE3-NEXT: pextrw $1, %xmm0, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm3 -; SSSE3-NEXT: movd %xmm1, %ecx -; SSSE3-NEXT: movd %xmm0, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; 
SSSE3-NEXT: movd %edx, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pextrw $1, %xmm1, %eax -; SSE41-NEXT: pextrw $1, %xmm0, %ecx -; SSE41-NEXT: addw %ax, %cx -; SSE41-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSE41-NEXT: cmovbl %eax, %ecx -; SSE41-NEXT: movd %xmm1, %edx -; SSE41-NEXT: movd %xmm0, %esi -; SSE41-NEXT: addw %dx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: movd %esi, %xmm2 -; SSE41-NEXT: pinsrw $1, %ecx, %xmm2 -; SSE41-NEXT: pextrw $2, %xmm1, %ecx -; SSE41-NEXT: pextrw $2, %xmm0, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $2, %edx, %xmm2 -; SSE41-NEXT: pextrw $3, %xmm1, %ecx -; SSE41-NEXT: pextrw $3, %xmm0, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $3, %edx, %xmm2 -; SSE41-NEXT: pextrw $4, %xmm1, %ecx -; SSE41-NEXT: pextrw $4, %xmm0, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $4, %edx, %xmm2 -; SSE41-NEXT: pextrw $5, %xmm1, %ecx -; SSE41-NEXT: pextrw $5, %xmm0, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $5, %edx, %xmm2 -; SSE41-NEXT: pextrw $6, %xmm1, %ecx -; SSE41-NEXT: pextrw $6, %xmm0, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $6, %edx, %xmm2 -; SSE41-NEXT: pextrw $7, %xmm1, %ecx -; SSE41-NEXT: pextrw $7, %xmm0, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $7, %edx, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: v8i16: +; SSE: # %bb.0: +; SSE-NEXT: paddusw %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vpextrw $1, %xmm1, %eax -; AVX-NEXT: vpextrw $1, %xmm0, %ecx -; AVX-NEXT: addw %ax, %cx -; AVX-NEXT: movl $65535, %eax # imm = 0xFFFF -; AVX-NEXT: cmovbl %eax, %ecx -; AVX-NEXT: vmovd %xmm1, %edx -; AVX-NEXT: vmovd %xmm0, %esi -; AVX-NEXT: addw %dx, %si -; AVX-NEXT: cmovbl %eax, %esi -; AVX-NEXT: vmovd %esi, %xmm2 -; AVX-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrw $2, %xmm1, %ecx -; AVX-NEXT: vpextrw $2, %xmm0, %edx -; AVX-NEXT: addw %cx, %dx -; AVX-NEXT: cmovbl %eax, %edx -; AVX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 -; AVX-NEXT: vpextrw $3, %xmm1, %ecx -; AVX-NEXT: vpextrw $3, %xmm0, %edx -; AVX-NEXT: addw %cx, %dx -; AVX-NEXT: cmovbl %eax, %edx -; AVX-NEXT: vpinsrw $3, %edx, %xmm2, %xmm2 -; AVX-NEXT: vpextrw $4, %xmm1, %ecx -; AVX-NEXT: vpextrw $4, %xmm0, %edx -; AVX-NEXT: addw %cx, %dx -; AVX-NEXT: cmovbl %eax, %edx -; AVX-NEXT: vpinsrw $4, %edx, %xmm2, %xmm2 -; AVX-NEXT: vpextrw $5, %xmm1, %ecx -; AVX-NEXT: vpextrw $5, %xmm0, %edx -; AVX-NEXT: addw %cx, %dx -; AVX-NEXT: cmovbl %eax, %edx -; AVX-NEXT: vpinsrw $5, %edx, %xmm2, %xmm2 -; AVX-NEXT: vpextrw $6, %xmm1, %ecx -; AVX-NEXT: vpextrw $6, %xmm0, %edx -; AVX-NEXT: addw %cx, %dx -; AVX-NEXT: cmovbl %eax, %edx -; AVX-NEXT: vpinsrw $6, %edx, %xmm2, %xmm2 -; AVX-NEXT: vpextrw $7, %xmm1, %ecx -; AVX-NEXT: vpextrw $7, %xmm0, %edx -; AVX-NEXT: addw %cx, %dx -; AVX-NEXT: cmovbl %eax, %edx -; AVX-NEXT: vpinsrw $7, %edx, %xmm2, %xmm0 +; AVX-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %z = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %x, <8 x i16> %y) ret <8 x i16> %z } define <16 x i16> 
@v16i16(<16 x i16> %x, <16 x i16> %y) nounwind { -; SSE2-LABEL: v16i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pextrw $7, %xmm2, %eax -; SSE2-NEXT: pextrw $7, %xmm0, %ecx -; SSE2-NEXT: addw %ax, %cx -; SSE2-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: cmovbl %eax, %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: pextrw $6, %xmm2, %ecx -; SSE2-NEXT: pextrw $6, %xmm0, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE2-NEXT: pextrw $5, %xmm2, %ecx -; SSE2-NEXT: pextrw $5, %xmm0, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm6 -; SSE2-NEXT: pextrw $4, %xmm2, %ecx -; SSE2-NEXT: pextrw $4, %xmm0, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE2-NEXT: pextrw $3, %xmm2, %ecx -; SSE2-NEXT: pextrw $3, %xmm0, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm5 -; SSE2-NEXT: pextrw $2, %xmm2, %ecx -; SSE2-NEXT: pextrw $2, %xmm0, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm6 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSE2-NEXT: pextrw $1, %xmm2, %ecx -; SSE2-NEXT: pextrw $1, %xmm0, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm5 -; SSE2-NEXT: movd %xmm2, %ecx -; SSE2-NEXT: movd %xmm0, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE2-NEXT: pextrw $7, %xmm3, %ecx -; SSE2-NEXT: pextrw $7, %xmm1, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm2 -; SSE2-NEXT: pextrw $6, %xmm3, %ecx -; SSE2-NEXT: pextrw $6, %xmm1, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE2-NEXT: pextrw $5, %xmm3, %ecx -; SSE2-NEXT: pextrw $5, %xmm1, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm5 -; SSE2-NEXT: pextrw $4, %xmm3, %ecx -; SSE2-NEXT: pextrw $4, %xmm1, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE2-NEXT: pextrw $3, %xmm3, %ecx -; SSE2-NEXT: pextrw $3, %xmm1, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm4 -; SSE2-NEXT: pextrw $2, %xmm3, %ecx -; SSE2-NEXT: pextrw $2, %xmm1, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE2-NEXT: pextrw $1, %xmm3, %ecx -; SSE2-NEXT: pextrw $1, %xmm1, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: 
cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm4 -; SSE2-NEXT: movd %xmm3, %ecx -; SSE2-NEXT: movd %xmm1, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v16i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pextrw $7, %xmm2, %eax -; SSSE3-NEXT: pextrw $7, %xmm0, %ecx -; SSSE3-NEXT: addw %ax, %cx -; SSSE3-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSSE3-NEXT: cmovbl %eax, %ecx -; SSSE3-NEXT: movd %ecx, %xmm4 -; SSSE3-NEXT: pextrw $6, %xmm2, %ecx -; SSSE3-NEXT: pextrw $6, %xmm0, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm5 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSSE3-NEXT: pextrw $5, %xmm2, %ecx -; SSSE3-NEXT: pextrw $5, %xmm0, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm6 -; SSSE3-NEXT: pextrw $4, %xmm2, %ecx -; SSSE3-NEXT: pextrw $4, %xmm0, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSSE3-NEXT: pextrw $3, %xmm2, %ecx -; SSSE3-NEXT: pextrw $3, %xmm0, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm5 -; SSSE3-NEXT: pextrw $2, %xmm2, %ecx -; SSSE3-NEXT: pextrw $2, %xmm0, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm6 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSSE3-NEXT: pextrw $1, %xmm2, %ecx -; SSSE3-NEXT: pextrw $1, %xmm0, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm5 -; SSSE3-NEXT: movd %xmm2, %ecx -; SSSE3-NEXT: movd %xmm0, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSSE3-NEXT: pextrw $7, %xmm3, %ecx -; SSSE3-NEXT: pextrw $7, %xmm1, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm2 -; SSSE3-NEXT: pextrw $6, %xmm3, %ecx -; SSSE3-NEXT: pextrw $6, %xmm1, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSSE3-NEXT: pextrw $5, %xmm3, %ecx -; SSSE3-NEXT: pextrw $5, %xmm1, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm5 -; SSSE3-NEXT: pextrw $4, %xmm3, %ecx -; SSSE3-NEXT: pextrw $4, %xmm1, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSSE3-NEXT: pextrw $3, %xmm3, %ecx -; SSSE3-NEXT: pextrw $3, %xmm1, 
%edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm4 -; SSSE3-NEXT: pextrw $2, %xmm3, %ecx -; SSSE3-NEXT: pextrw $2, %xmm1, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm5 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSSE3-NEXT: pextrw $1, %xmm3, %ecx -; SSSE3-NEXT: pextrw $1, %xmm1, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm4 -; SSSE3-NEXT: movd %xmm3, %ecx -; SSSE3-NEXT: movd %xmm1, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v16i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pextrw $1, %xmm2, %eax -; SSE41-NEXT: pextrw $1, %xmm0, %ecx -; SSE41-NEXT: addw %ax, %cx -; SSE41-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSE41-NEXT: cmovbl %eax, %ecx -; SSE41-NEXT: movd %xmm2, %edx -; SSE41-NEXT: movd %xmm0, %esi -; SSE41-NEXT: addw %dx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: movd %esi, %xmm0 -; SSE41-NEXT: pinsrw $1, %ecx, %xmm0 -; SSE41-NEXT: pextrw $2, %xmm2, %ecx -; SSE41-NEXT: pextrw $2, %xmm4, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $2, %edx, %xmm0 -; SSE41-NEXT: pextrw $3, %xmm2, %ecx -; SSE41-NEXT: pextrw $3, %xmm4, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $3, %edx, %xmm0 -; SSE41-NEXT: pextrw $4, %xmm2, %ecx -; SSE41-NEXT: pextrw $4, %xmm4, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $4, %edx, %xmm0 -; SSE41-NEXT: pextrw $5, %xmm2, %ecx -; SSE41-NEXT: pextrw $5, %xmm4, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $5, %edx, %xmm0 -; SSE41-NEXT: pextrw $6, %xmm2, %ecx -; SSE41-NEXT: pextrw $6, %xmm4, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $6, %edx, %xmm0 -; SSE41-NEXT: pextrw $7, %xmm2, %ecx -; SSE41-NEXT: pextrw $7, %xmm4, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $7, %edx, %xmm0 -; SSE41-NEXT: pextrw $1, %xmm3, %ecx -; SSE41-NEXT: pextrw $1, %xmm1, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: movd %xmm3, %ecx -; SSE41-NEXT: movd %xmm1, %esi -; SSE41-NEXT: addw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: movd %esi, %xmm2 -; SSE41-NEXT: pinsrw $1, %edx, %xmm2 -; SSE41-NEXT: pextrw $2, %xmm3, %ecx -; SSE41-NEXT: pextrw $2, %xmm1, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $2, %edx, %xmm2 -; SSE41-NEXT: pextrw $3, %xmm3, %ecx -; SSE41-NEXT: pextrw $3, %xmm1, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $3, %edx, %xmm2 -; SSE41-NEXT: pextrw $4, %xmm3, %ecx -; SSE41-NEXT: pextrw $4, %xmm1, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $4, %edx, %xmm2 -; SSE41-NEXT: pextrw $5, %xmm3, %ecx -; SSE41-NEXT: pextrw $5, %xmm1, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $5, %edx, %xmm2 -; SSE41-NEXT: pextrw $6, %xmm3, %ecx 
-; SSE41-NEXT: pextrw $6, %xmm1, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $6, %edx, %xmm2 -; SSE41-NEXT: pextrw $7, %xmm3, %ecx -; SSE41-NEXT: pextrw $7, %xmm1, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $7, %edx, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: retq +; SSE-LABEL: v16i16: +; SSE: # %bb.0: +; SSE-NEXT: paddusw %xmm2, %xmm0 +; SSE-NEXT: paddusw %xmm3, %xmm1 +; SSE-NEXT: retq ; ; AVX1-LABEL: v16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpextrw $1, %xmm2, %eax ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpextrw $1, %xmm3, %ecx -; AVX1-NEXT: addw %ax, %cx -; AVX1-NEXT: movl $65535, %eax # imm = 0xFFFF -; AVX1-NEXT: cmovbl %eax, %ecx -; AVX1-NEXT: vmovd %xmm2, %edx -; AVX1-NEXT: vmovd %xmm3, %esi -; AVX1-NEXT: addw %dx, %si -; AVX1-NEXT: cmovbl %eax, %esi -; AVX1-NEXT: vmovd %esi, %xmm4 -; AVX1-NEXT: vpinsrw $1, %ecx, %xmm4, %xmm4 -; AVX1-NEXT: vpextrw $2, %xmm2, %ecx -; AVX1-NEXT: vpextrw $2, %xmm3, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 -; AVX1-NEXT: vpextrw $3, %xmm2, %ecx -; AVX1-NEXT: vpextrw $3, %xmm3, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $3, %edx, %xmm4, %xmm4 -; AVX1-NEXT: vpextrw $4, %xmm2, %ecx -; AVX1-NEXT: vpextrw $4, %xmm3, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $4, %edx, %xmm4, %xmm4 -; AVX1-NEXT: vpextrw $5, %xmm2, %ecx -; AVX1-NEXT: vpextrw $5, %xmm3, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $5, %edx, %xmm4, %xmm4 -; AVX1-NEXT: vpextrw $6, %xmm2, %ecx -; AVX1-NEXT: vpextrw $6, %xmm3, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $6, %edx, %xmm4, %xmm4 -; AVX1-NEXT: vpextrw $7, %xmm2, %ecx -; AVX1-NEXT: vpextrw $7, %xmm3, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $7, %edx, %xmm4, %xmm2 -; AVX1-NEXT: vpextrw $1, %xmm1, %ecx -; AVX1-NEXT: vpextrw $1, %xmm0, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vmovd %xmm1, %ecx -; AVX1-NEXT: vmovd %xmm0, %esi -; AVX1-NEXT: addw %cx, %si -; AVX1-NEXT: cmovbl %eax, %esi -; AVX1-NEXT: vmovd %esi, %xmm3 -; AVX1-NEXT: vpinsrw $1, %edx, %xmm3, %xmm3 -; AVX1-NEXT: vpextrw $2, %xmm1, %ecx -; AVX1-NEXT: vpextrw $2, %xmm0, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 -; AVX1-NEXT: vpextrw $3, %xmm1, %ecx -; AVX1-NEXT: vpextrw $3, %xmm0, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $3, %edx, %xmm3, %xmm3 -; AVX1-NEXT: vpextrw $4, %xmm1, %ecx -; AVX1-NEXT: vpextrw $4, %xmm0, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $4, %edx, %xmm3, %xmm3 -; AVX1-NEXT: vpextrw $5, %xmm1, %ecx -; AVX1-NEXT: vpextrw $5, %xmm0, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $5, %edx, %xmm3, %xmm3 -; AVX1-NEXT: vpextrw $6, %xmm1, %ecx -; AVX1-NEXT: vpextrw $6, %xmm0, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $6, %edx, %xmm3, %xmm3 -; AVX1-NEXT: vpextrw $7, %xmm1, %ecx -; AVX1-NEXT: vpextrw $7, %xmm0, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $7, %edx, %xmm3, %xmm0 +; AVX1-NEXT: vpaddusw %xmm2, %xmm3, 
%xmm2 +; AVX1-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpextrw $1, %xmm2, %eax -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpextrw $1, %xmm3, %ecx -; AVX2-NEXT: addw %ax, %cx -; AVX2-NEXT: movl $65535, %eax # imm = 0xFFFF -; AVX2-NEXT: cmovbl %eax, %ecx -; AVX2-NEXT: vmovd %xmm2, %edx -; AVX2-NEXT: vmovd %xmm3, %esi -; AVX2-NEXT: addw %dx, %si -; AVX2-NEXT: cmovbl %eax, %esi -; AVX2-NEXT: vmovd %esi, %xmm4 -; AVX2-NEXT: vpinsrw $1, %ecx, %xmm4, %xmm4 -; AVX2-NEXT: vpextrw $2, %xmm2, %ecx -; AVX2-NEXT: vpextrw $2, %xmm3, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 -; AVX2-NEXT: vpextrw $3, %xmm2, %ecx -; AVX2-NEXT: vpextrw $3, %xmm3, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $3, %edx, %xmm4, %xmm4 -; AVX2-NEXT: vpextrw $4, %xmm2, %ecx -; AVX2-NEXT: vpextrw $4, %xmm3, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $4, %edx, %xmm4, %xmm4 -; AVX2-NEXT: vpextrw $5, %xmm2, %ecx -; AVX2-NEXT: vpextrw $5, %xmm3, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $5, %edx, %xmm4, %xmm4 -; AVX2-NEXT: vpextrw $6, %xmm2, %ecx -; AVX2-NEXT: vpextrw $6, %xmm3, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $6, %edx, %xmm4, %xmm4 -; AVX2-NEXT: vpextrw $7, %xmm2, %ecx -; AVX2-NEXT: vpextrw $7, %xmm3, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $7, %edx, %xmm4, %xmm2 -; AVX2-NEXT: vpextrw $1, %xmm1, %ecx -; AVX2-NEXT: vpextrw $1, %xmm0, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vmovd %xmm1, %ecx -; AVX2-NEXT: vmovd %xmm0, %esi -; AVX2-NEXT: addw %cx, %si -; AVX2-NEXT: cmovbl %eax, %esi -; AVX2-NEXT: vmovd %esi, %xmm3 -; AVX2-NEXT: vpinsrw $1, %edx, %xmm3, %xmm3 -; AVX2-NEXT: vpextrw $2, %xmm1, %ecx -; AVX2-NEXT: vpextrw $2, %xmm0, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 -; AVX2-NEXT: vpextrw $3, %xmm1, %ecx -; AVX2-NEXT: vpextrw $3, %xmm0, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $3, %edx, %xmm3, %xmm3 -; AVX2-NEXT: vpextrw $4, %xmm1, %ecx -; AVX2-NEXT: vpextrw $4, %xmm0, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $4, %edx, %xmm3, %xmm3 -; AVX2-NEXT: vpextrw $5, %xmm1, %ecx -; AVX2-NEXT: vpextrw $5, %xmm0, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $5, %edx, %xmm3, %xmm3 -; AVX2-NEXT: vpextrw $6, %xmm1, %ecx -; AVX2-NEXT: vpextrw $6, %xmm0, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $6, %edx, %xmm3, %xmm3 -; AVX2-NEXT: vpextrw $7, %xmm1, %ecx -; AVX2-NEXT: vpextrw $7, %xmm0, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $7, %edx, %xmm3, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v16i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512-NEXT: vpextrw $1, %xmm2, %eax -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512-NEXT: vpextrw $1, %xmm3, %ecx -; AVX512-NEXT: addw %ax, %cx -; AVX512-NEXT: movl $65535, %eax # imm = 0xFFFF -; 
AVX512-NEXT: cmovbl %eax, %ecx -; AVX512-NEXT: vmovd %xmm2, %edx -; AVX512-NEXT: vmovd %xmm3, %esi -; AVX512-NEXT: addw %dx, %si -; AVX512-NEXT: cmovbl %eax, %esi -; AVX512-NEXT: vmovd %esi, %xmm4 -; AVX512-NEXT: vpinsrw $1, %ecx, %xmm4, %xmm4 -; AVX512-NEXT: vpextrw $2, %xmm2, %ecx -; AVX512-NEXT: vpextrw $2, %xmm3, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 -; AVX512-NEXT: vpextrw $3, %xmm2, %ecx -; AVX512-NEXT: vpextrw $3, %xmm3, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $3, %edx, %xmm4, %xmm4 -; AVX512-NEXT: vpextrw $4, %xmm2, %ecx -; AVX512-NEXT: vpextrw $4, %xmm3, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $4, %edx, %xmm4, %xmm4 -; AVX512-NEXT: vpextrw $5, %xmm2, %ecx -; AVX512-NEXT: vpextrw $5, %xmm3, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $5, %edx, %xmm4, %xmm4 -; AVX512-NEXT: vpextrw $6, %xmm2, %ecx -; AVX512-NEXT: vpextrw $6, %xmm3, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $6, %edx, %xmm4, %xmm4 -; AVX512-NEXT: vpextrw $7, %xmm2, %ecx -; AVX512-NEXT: vpextrw $7, %xmm3, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $7, %edx, %xmm4, %xmm2 -; AVX512-NEXT: vpextrw $1, %xmm1, %ecx -; AVX512-NEXT: vpextrw $1, %xmm0, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vmovd %xmm1, %ecx -; AVX512-NEXT: vmovd %xmm0, %esi -; AVX512-NEXT: addw %cx, %si -; AVX512-NEXT: cmovbl %eax, %esi -; AVX512-NEXT: vmovd %esi, %xmm3 -; AVX512-NEXT: vpinsrw $1, %edx, %xmm3, %xmm3 -; AVX512-NEXT: vpextrw $2, %xmm1, %ecx -; AVX512-NEXT: vpextrw $2, %xmm0, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 -; AVX512-NEXT: vpextrw $3, %xmm1, %ecx -; AVX512-NEXT: vpextrw $3, %xmm0, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $3, %edx, %xmm3, %xmm3 -; AVX512-NEXT: vpextrw $4, %xmm1, %ecx -; AVX512-NEXT: vpextrw $4, %xmm0, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $4, %edx, %xmm3, %xmm3 -; AVX512-NEXT: vpextrw $5, %xmm1, %ecx -; AVX512-NEXT: vpextrw $5, %xmm0, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $5, %edx, %xmm3, %xmm3 -; AVX512-NEXT: vpextrw $6, %xmm1, %ecx -; AVX512-NEXT: vpextrw $6, %xmm0, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $6, %edx, %xmm3, %xmm3 -; AVX512-NEXT: vpextrw $7, %xmm1, %ecx -; AVX512-NEXT: vpextrw $7, %xmm0, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $7, %edx, %xmm3, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %z = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %x, <16 x i16> %y) ret <16 x i16> %z } define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { -; SSE2-LABEL: v32i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pextrw $7, %xmm4, %eax -; SSE2-NEXT: pextrw $7, %xmm0, %ecx -; SSE2-NEXT: addw %ax, %cx -; SSE2-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: cmovbl %eax, %ecx -; SSE2-NEXT: movd %ecx, %xmm8 -; SSE2-NEXT: pextrw $6, %xmm4, %ecx -; SSE2-NEXT: pextrw $6, %xmm0, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, 
%edx -; SSE2-NEXT: movd %edx, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE2-NEXT: pextrw $5, %xmm4, %ecx -; SSE2-NEXT: pextrw $5, %xmm0, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm10 -; SSE2-NEXT: pextrw $4, %xmm4, %ecx -; SSE2-NEXT: pextrw $4, %xmm0, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE2-NEXT: pextrw $3, %xmm4, %ecx -; SSE2-NEXT: pextrw $3, %xmm0, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm9 -; SSE2-NEXT: pextrw $2, %xmm4, %ecx -; SSE2-NEXT: pextrw $2, %xmm0, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm10 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; SSE2-NEXT: pextrw $1, %xmm4, %ecx -; SSE2-NEXT: pextrw $1, %xmm0, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm9 -; SSE2-NEXT: movd %xmm4, %ecx -; SSE2-NEXT: movd %xmm0, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm8[0] -; SSE2-NEXT: pextrw $7, %xmm5, %ecx -; SSE2-NEXT: pextrw $7, %xmm1, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm8 -; SSE2-NEXT: pextrw $6, %xmm5, %ecx -; SSE2-NEXT: pextrw $6, %xmm1, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; SSE2-NEXT: pextrw $5, %xmm5, %ecx -; SSE2-NEXT: pextrw $5, %xmm1, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm9 -; SSE2-NEXT: pextrw $4, %xmm5, %ecx -; SSE2-NEXT: pextrw $4, %xmm1, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE2-NEXT: pextrw $3, %xmm5, %ecx -; SSE2-NEXT: pextrw $3, %xmm1, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm9 -; SSE2-NEXT: pextrw $2, %xmm5, %ecx -; SSE2-NEXT: pextrw $2, %xmm1, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] -; SSE2-NEXT: pextrw $1, %xmm5, %ecx -; SSE2-NEXT: pextrw $1, %xmm1, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm9 -; SSE2-NEXT: movd %xmm5, %ecx -; SSE2-NEXT: movd %xmm1, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = 
xmm1[0],xmm8[0] -; SSE2-NEXT: pextrw $7, %xmm6, %ecx -; SSE2-NEXT: pextrw $7, %xmm2, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm4 -; SSE2-NEXT: pextrw $6, %xmm6, %ecx -; SSE2-NEXT: pextrw $6, %xmm2, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE2-NEXT: pextrw $5, %xmm6, %ecx -; SSE2-NEXT: pextrw $5, %xmm2, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm8 -; SSE2-NEXT: pextrw $4, %xmm6, %ecx -; SSE2-NEXT: pextrw $4, %xmm2, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE2-NEXT: pextrw $3, %xmm6, %ecx -; SSE2-NEXT: pextrw $3, %xmm2, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm8 -; SSE2-NEXT: pextrw $2, %xmm6, %ecx -; SSE2-NEXT: pextrw $2, %xmm2, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] -; SSE2-NEXT: pextrw $1, %xmm6, %ecx -; SSE2-NEXT: pextrw $1, %xmm2, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm8 -; SSE2-NEXT: movd %xmm6, %ecx -; SSE2-NEXT: movd %xmm2, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSE2-NEXT: pextrw $7, %xmm7, %ecx -; SSE2-NEXT: pextrw $7, %xmm3, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm4 -; SSE2-NEXT: pextrw $6, %xmm7, %ecx -; SSE2-NEXT: pextrw $6, %xmm3, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE2-NEXT: pextrw $5, %xmm7, %ecx -; SSE2-NEXT: pextrw $5, %xmm3, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm6 -; SSE2-NEXT: pextrw $4, %xmm7, %ecx -; SSE2-NEXT: pextrw $4, %xmm3, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE2-NEXT: pextrw $3, %xmm7, %ecx -; SSE2-NEXT: pextrw $3, %xmm3, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm5 -; SSE2-NEXT: pextrw $2, %xmm7, %ecx -; SSE2-NEXT: pextrw $2, %xmm3, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm6 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSE2-NEXT: pextrw $1, %xmm7, %ecx -; SSE2-NEXT: pextrw $1, %xmm3, %edx -; SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm5 -; SSE2-NEXT: movd %xmm7, %ecx -; SSE2-NEXT: movd %xmm3, %edx -; 
SSE2-NEXT: addw %cx, %dx -; SSE2-NEXT: cmovbl %eax, %edx -; SSE2-NEXT: movd %edx, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v32i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pextrw $7, %xmm4, %eax -; SSSE3-NEXT: pextrw $7, %xmm0, %ecx -; SSSE3-NEXT: addw %ax, %cx -; SSSE3-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSSE3-NEXT: cmovbl %eax, %ecx -; SSSE3-NEXT: movd %ecx, %xmm8 -; SSSE3-NEXT: pextrw $6, %xmm4, %ecx -; SSSE3-NEXT: pextrw $6, %xmm0, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm9 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSSE3-NEXT: pextrw $5, %xmm4, %ecx -; SSSE3-NEXT: pextrw $5, %xmm0, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm10 -; SSSE3-NEXT: pextrw $4, %xmm4, %ecx -; SSSE3-NEXT: pextrw $4, %xmm0, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm8 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSSE3-NEXT: pextrw $3, %xmm4, %ecx -; SSSE3-NEXT: pextrw $3, %xmm0, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm9 -; SSSE3-NEXT: pextrw $2, %xmm4, %ecx -; SSSE3-NEXT: pextrw $2, %xmm0, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm10 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; SSSE3-NEXT: pextrw $1, %xmm4, %ecx -; SSSE3-NEXT: pextrw $1, %xmm0, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm9 -; SSSE3-NEXT: movd %xmm4, %ecx -; SSSE3-NEXT: movd %xmm0, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm8[0] -; SSSE3-NEXT: pextrw $7, %xmm5, %ecx -; SSSE3-NEXT: pextrw $7, %xmm1, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm8 -; SSSE3-NEXT: pextrw $6, %xmm5, %ecx -; SSSE3-NEXT: pextrw $6, %xmm1, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; SSSE3-NEXT: pextrw $5, %xmm5, %ecx -; SSSE3-NEXT: pextrw $5, %xmm1, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm9 -; SSSE3-NEXT: pextrw $4, %xmm5, %ecx -; SSSE3-NEXT: pextrw $4, %xmm1, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm8 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSSE3-NEXT: pextrw $3, %xmm5, %ecx -; SSSE3-NEXT: pextrw $3, %xmm1, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm9 -; 
SSSE3-NEXT: pextrw $2, %xmm5, %ecx -; SSSE3-NEXT: pextrw $2, %xmm1, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] -; SSSE3-NEXT: pextrw $1, %xmm5, %ecx -; SSSE3-NEXT: pextrw $1, %xmm1, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm9 -; SSSE3-NEXT: movd %xmm5, %ecx -; SSSE3-NEXT: movd %xmm1, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm8[0] -; SSSE3-NEXT: pextrw $7, %xmm6, %ecx -; SSSE3-NEXT: pextrw $7, %xmm2, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm4 -; SSSE3-NEXT: pextrw $6, %xmm6, %ecx -; SSSE3-NEXT: pextrw $6, %xmm2, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm5 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSSE3-NEXT: pextrw $5, %xmm6, %ecx -; SSSE3-NEXT: pextrw $5, %xmm2, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm8 -; SSSE3-NEXT: pextrw $4, %xmm6, %ecx -; SSSE3-NEXT: pextrw $4, %xmm2, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSSE3-NEXT: pextrw $3, %xmm6, %ecx -; SSSE3-NEXT: pextrw $3, %xmm2, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm8 -; SSSE3-NEXT: pextrw $2, %xmm6, %ecx -; SSSE3-NEXT: pextrw $2, %xmm2, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm5 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] -; SSSE3-NEXT: pextrw $1, %xmm6, %ecx -; SSSE3-NEXT: pextrw $1, %xmm2, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm8 -; SSSE3-NEXT: movd %xmm6, %ecx -; SSSE3-NEXT: movd %xmm2, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSSE3-NEXT: pextrw $7, %xmm7, %ecx -; SSSE3-NEXT: pextrw $7, %xmm3, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm4 -; SSSE3-NEXT: pextrw $6, %xmm7, %ecx -; SSSE3-NEXT: pextrw $6, %xmm3, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm5 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSSE3-NEXT: pextrw $5, %xmm7, %ecx -; SSSE3-NEXT: pextrw $5, %xmm3, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm6 -; SSSE3-NEXT: pextrw $4, %xmm7, %ecx -; SSSE3-NEXT: pextrw $4, %xmm3, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl 
%eax, %edx -; SSSE3-NEXT: movd %edx, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSSE3-NEXT: pextrw $3, %xmm7, %ecx -; SSSE3-NEXT: pextrw $3, %xmm3, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm5 -; SSSE3-NEXT: pextrw $2, %xmm7, %ecx -; SSSE3-NEXT: pextrw $2, %xmm3, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm6 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSSE3-NEXT: pextrw $1, %xmm7, %ecx -; SSSE3-NEXT: pextrw $1, %xmm3, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm5 -; SSSE3-NEXT: movd %xmm7, %ecx -; SSSE3-NEXT: movd %xmm3, %edx -; SSSE3-NEXT: addw %cx, %dx -; SSSE3-NEXT: cmovbl %eax, %edx -; SSSE3-NEXT: movd %edx, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v32i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm8 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pextrw $1, %xmm4, %eax -; SSE41-NEXT: pextrw $1, %xmm0, %ecx -; SSE41-NEXT: addw %ax, %cx -; SSE41-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSE41-NEXT: cmovbl %eax, %ecx -; SSE41-NEXT: movd %xmm4, %edx -; SSE41-NEXT: movd %xmm0, %esi -; SSE41-NEXT: addw %dx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: movd %esi, %xmm0 -; SSE41-NEXT: pinsrw $1, %ecx, %xmm0 -; SSE41-NEXT: pextrw $2, %xmm4, %ecx -; SSE41-NEXT: pextrw $2, %xmm1, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $2, %edx, %xmm0 -; SSE41-NEXT: pextrw $3, %xmm4, %ecx -; SSE41-NEXT: pextrw $3, %xmm1, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $3, %edx, %xmm0 -; SSE41-NEXT: pextrw $4, %xmm4, %ecx -; SSE41-NEXT: pextrw $4, %xmm1, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $4, %edx, %xmm0 -; SSE41-NEXT: pextrw $5, %xmm4, %ecx -; SSE41-NEXT: pextrw $5, %xmm1, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $5, %edx, %xmm0 -; SSE41-NEXT: pextrw $6, %xmm4, %ecx -; SSE41-NEXT: pextrw $6, %xmm1, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $6, %edx, %xmm0 -; SSE41-NEXT: pextrw $7, %xmm4, %ecx -; SSE41-NEXT: pextrw $7, %xmm1, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $7, %edx, %xmm0 -; SSE41-NEXT: pextrw $1, %xmm5, %ecx -; SSE41-NEXT: pextrw $1, %xmm8, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: movd %xmm5, %ecx -; SSE41-NEXT: movd %xmm8, %esi -; SSE41-NEXT: addw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: movd %esi, %xmm1 -; SSE41-NEXT: pinsrw $1, %edx, %xmm1 -; SSE41-NEXT: pextrw $2, %xmm5, %ecx -; SSE41-NEXT: pextrw $2, %xmm8, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $2, %edx, %xmm1 -; SSE41-NEXT: pextrw $3, %xmm5, %ecx -; SSE41-NEXT: pextrw $3, %xmm8, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $3, %edx, %xmm1 -; SSE41-NEXT: pextrw $4, %xmm5, %ecx -; SSE41-NEXT: pextrw $4, 
%xmm8, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $4, %edx, %xmm1 -; SSE41-NEXT: pextrw $5, %xmm5, %ecx -; SSE41-NEXT: pextrw $5, %xmm8, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $5, %edx, %xmm1 -; SSE41-NEXT: pextrw $6, %xmm5, %ecx -; SSE41-NEXT: pextrw $6, %xmm8, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $6, %edx, %xmm1 -; SSE41-NEXT: pextrw $7, %xmm5, %ecx -; SSE41-NEXT: pextrw $7, %xmm8, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $7, %edx, %xmm1 -; SSE41-NEXT: pextrw $1, %xmm6, %ecx -; SSE41-NEXT: pextrw $1, %xmm2, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: movd %xmm6, %ecx -; SSE41-NEXT: movd %xmm2, %esi -; SSE41-NEXT: addw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: movd %esi, %xmm4 -; SSE41-NEXT: pinsrw $1, %edx, %xmm4 -; SSE41-NEXT: pextrw $2, %xmm6, %ecx -; SSE41-NEXT: pextrw $2, %xmm2, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $2, %edx, %xmm4 -; SSE41-NEXT: pextrw $3, %xmm6, %ecx -; SSE41-NEXT: pextrw $3, %xmm2, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $3, %edx, %xmm4 -; SSE41-NEXT: pextrw $4, %xmm6, %ecx -; SSE41-NEXT: pextrw $4, %xmm2, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $4, %edx, %xmm4 -; SSE41-NEXT: pextrw $5, %xmm6, %ecx -; SSE41-NEXT: pextrw $5, %xmm2, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $5, %edx, %xmm4 -; SSE41-NEXT: pextrw $6, %xmm6, %ecx -; SSE41-NEXT: pextrw $6, %xmm2, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $6, %edx, %xmm4 -; SSE41-NEXT: pextrw $7, %xmm6, %ecx -; SSE41-NEXT: pextrw $7, %xmm2, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $7, %edx, %xmm4 -; SSE41-NEXT: pextrw $1, %xmm7, %ecx -; SSE41-NEXT: pextrw $1, %xmm3, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: movd %xmm7, %ecx -; SSE41-NEXT: movd %xmm3, %esi -; SSE41-NEXT: addw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: movd %esi, %xmm5 -; SSE41-NEXT: pinsrw $1, %edx, %xmm5 -; SSE41-NEXT: pextrw $2, %xmm7, %ecx -; SSE41-NEXT: pextrw $2, %xmm3, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $2, %edx, %xmm5 -; SSE41-NEXT: pextrw $3, %xmm7, %ecx -; SSE41-NEXT: pextrw $3, %xmm3, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $3, %edx, %xmm5 -; SSE41-NEXT: pextrw $4, %xmm7, %ecx -; SSE41-NEXT: pextrw $4, %xmm3, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $4, %edx, %xmm5 -; SSE41-NEXT: pextrw $5, %xmm7, %ecx -; SSE41-NEXT: pextrw $5, %xmm3, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $5, %edx, %xmm5 -; SSE41-NEXT: pextrw $6, %xmm7, %ecx -; SSE41-NEXT: pextrw $6, %xmm3, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $6, %edx, %xmm5 -; SSE41-NEXT: pextrw $7, %xmm7, %ecx -; SSE41-NEXT: pextrw $7, %xmm3, %edx -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovbl %eax, %edx -; SSE41-NEXT: pinsrw $7, %edx, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm5, %xmm3 -; SSE41-NEXT: retq +; SSE-LABEL: v32i16: +; SSE: # %bb.0: +; 
SSE-NEXT: paddusw %xmm4, %xmm0 +; SSE-NEXT: paddusw %xmm5, %xmm1 +; SSE-NEXT: paddusw %xmm6, %xmm2 +; SSE-NEXT: paddusw %xmm7, %xmm3 +; SSE-NEXT: retq ; ; AVX1-LABEL: v32i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vpextrw $1, %xmm4, %eax ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpextrw $1, %xmm5, %ecx -; AVX1-NEXT: addw %ax, %cx -; AVX1-NEXT: movl $65535, %eax # imm = 0xFFFF -; AVX1-NEXT: cmovbl %eax, %ecx -; AVX1-NEXT: vmovd %xmm4, %edx -; AVX1-NEXT: vmovd %xmm5, %esi -; AVX1-NEXT: addw %dx, %si -; AVX1-NEXT: cmovbl %eax, %esi -; AVX1-NEXT: vmovd %esi, %xmm6 -; AVX1-NEXT: vpinsrw $1, %ecx, %xmm6, %xmm6 -; AVX1-NEXT: vpextrw $2, %xmm4, %ecx -; AVX1-NEXT: vpextrw $2, %xmm5, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $2, %edx, %xmm6, %xmm6 -; AVX1-NEXT: vpextrw $3, %xmm4, %ecx -; AVX1-NEXT: vpextrw $3, %xmm5, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $3, %edx, %xmm6, %xmm6 -; AVX1-NEXT: vpextrw $4, %xmm4, %ecx -; AVX1-NEXT: vpextrw $4, %xmm5, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $4, %edx, %xmm6, %xmm6 -; AVX1-NEXT: vpextrw $5, %xmm4, %ecx -; AVX1-NEXT: vpextrw $5, %xmm5, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $5, %edx, %xmm6, %xmm6 -; AVX1-NEXT: vpextrw $6, %xmm4, %ecx -; AVX1-NEXT: vpextrw $6, %xmm5, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $6, %edx, %xmm6, %xmm6 -; AVX1-NEXT: vpextrw $7, %xmm4, %ecx -; AVX1-NEXT: vpextrw $7, %xmm5, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $7, %edx, %xmm6, %xmm4 -; AVX1-NEXT: vpextrw $1, %xmm2, %ecx -; AVX1-NEXT: vpextrw $1, %xmm0, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vmovd %xmm2, %ecx -; AVX1-NEXT: vmovd %xmm0, %esi -; AVX1-NEXT: addw %cx, %si -; AVX1-NEXT: cmovbl %eax, %esi -; AVX1-NEXT: vmovd %esi, %xmm5 -; AVX1-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5 -; AVX1-NEXT: vpextrw $2, %xmm2, %ecx -; AVX1-NEXT: vpextrw $2, %xmm0, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 -; AVX1-NEXT: vpextrw $3, %xmm2, %ecx -; AVX1-NEXT: vpextrw $3, %xmm0, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $3, %edx, %xmm5, %xmm5 -; AVX1-NEXT: vpextrw $4, %xmm2, %ecx -; AVX1-NEXT: vpextrw $4, %xmm0, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $4, %edx, %xmm5, %xmm5 -; AVX1-NEXT: vpextrw $5, %xmm2, %ecx -; AVX1-NEXT: vpextrw $5, %xmm0, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $5, %edx, %xmm5, %xmm5 -; AVX1-NEXT: vpextrw $6, %xmm2, %ecx -; AVX1-NEXT: vpextrw $6, %xmm0, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $6, %edx, %xmm5, %xmm5 -; AVX1-NEXT: vpextrw $7, %xmm2, %ecx -; AVX1-NEXT: vpextrw $7, %xmm0, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $7, %edx, %xmm5, %xmm0 +; AVX1-NEXT: vpaddusw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpaddusw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 -; AVX1-NEXT: vpextrw $1, %xmm2, %ecx ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpextrw $1, %xmm4, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vmovd %xmm2, 
%ecx -; AVX1-NEXT: vmovd %xmm4, %esi -; AVX1-NEXT: addw %cx, %si -; AVX1-NEXT: cmovbl %eax, %esi -; AVX1-NEXT: vmovd %esi, %xmm5 -; AVX1-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5 -; AVX1-NEXT: vpextrw $2, %xmm2, %ecx -; AVX1-NEXT: vpextrw $2, %xmm4, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 -; AVX1-NEXT: vpextrw $3, %xmm2, %ecx -; AVX1-NEXT: vpextrw $3, %xmm4, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $3, %edx, %xmm5, %xmm5 -; AVX1-NEXT: vpextrw $4, %xmm2, %ecx -; AVX1-NEXT: vpextrw $4, %xmm4, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $4, %edx, %xmm5, %xmm5 -; AVX1-NEXT: vpextrw $5, %xmm2, %ecx -; AVX1-NEXT: vpextrw $5, %xmm4, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $5, %edx, %xmm5, %xmm5 -; AVX1-NEXT: vpextrw $6, %xmm2, %ecx -; AVX1-NEXT: vpextrw $6, %xmm4, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $6, %edx, %xmm5, %xmm5 -; AVX1-NEXT: vpextrw $7, %xmm2, %ecx -; AVX1-NEXT: vpextrw $7, %xmm4, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $7, %edx, %xmm5, %xmm2 -; AVX1-NEXT: vpextrw $1, %xmm3, %ecx -; AVX1-NEXT: vpextrw $1, %xmm1, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vmovd %xmm3, %ecx -; AVX1-NEXT: vmovd %xmm1, %esi -; AVX1-NEXT: addw %cx, %si -; AVX1-NEXT: cmovbl %eax, %esi -; AVX1-NEXT: vmovd %esi, %xmm4 -; AVX1-NEXT: vpinsrw $1, %edx, %xmm4, %xmm4 -; AVX1-NEXT: vpextrw $2, %xmm3, %ecx -; AVX1-NEXT: vpextrw $2, %xmm1, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 -; AVX1-NEXT: vpextrw $3, %xmm3, %ecx -; AVX1-NEXT: vpextrw $3, %xmm1, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $3, %edx, %xmm4, %xmm4 -; AVX1-NEXT: vpextrw $4, %xmm3, %ecx -; AVX1-NEXT: vpextrw $4, %xmm1, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $4, %edx, %xmm4, %xmm4 -; AVX1-NEXT: vpextrw $5, %xmm3, %ecx -; AVX1-NEXT: vpextrw $5, %xmm1, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $5, %edx, %xmm4, %xmm4 -; AVX1-NEXT: vpextrw $6, %xmm3, %ecx -; AVX1-NEXT: vpextrw $6, %xmm1, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $6, %edx, %xmm4, %xmm4 -; AVX1-NEXT: vpextrw $7, %xmm3, %ecx -; AVX1-NEXT: vpextrw $7, %xmm1, %edx -; AVX1-NEXT: addw %cx, %dx -; AVX1-NEXT: cmovbl %eax, %edx -; AVX1-NEXT: vpinsrw $7, %edx, %xmm4, %xmm1 +; AVX1-NEXT: vpaddusw %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpaddusw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; -; AVX2-LABEL: v32i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-NEXT: vpextrw $1, %xmm4, %eax -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-NEXT: vpextrw $1, %xmm5, %ecx -; AVX2-NEXT: addw %ax, %cx -; AVX2-NEXT: movl $65535, %eax # imm = 0xFFFF -; AVX2-NEXT: cmovbl %eax, %ecx -; AVX2-NEXT: vmovd %xmm4, %edx -; AVX2-NEXT: vmovd %xmm5, %esi -; AVX2-NEXT: addw %dx, %si -; AVX2-NEXT: cmovbl %eax, %esi -; AVX2-NEXT: vmovd %esi, %xmm6 -; AVX2-NEXT: vpinsrw $1, %ecx, %xmm6, %xmm6 -; AVX2-NEXT: vpextrw $2, %xmm4, %ecx -; AVX2-NEXT: vpextrw $2, %xmm5, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $2, %edx, %xmm6, %xmm6 -; AVX2-NEXT: vpextrw 
$3, %xmm4, %ecx -; AVX2-NEXT: vpextrw $3, %xmm5, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $3, %edx, %xmm6, %xmm6 -; AVX2-NEXT: vpextrw $4, %xmm4, %ecx -; AVX2-NEXT: vpextrw $4, %xmm5, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $4, %edx, %xmm6, %xmm6 -; AVX2-NEXT: vpextrw $5, %xmm4, %ecx -; AVX2-NEXT: vpextrw $5, %xmm5, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $5, %edx, %xmm6, %xmm6 -; AVX2-NEXT: vpextrw $6, %xmm4, %ecx -; AVX2-NEXT: vpextrw $6, %xmm5, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $6, %edx, %xmm6, %xmm6 -; AVX2-NEXT: vpextrw $7, %xmm4, %ecx -; AVX2-NEXT: vpextrw $7, %xmm5, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $7, %edx, %xmm6, %xmm4 -; AVX2-NEXT: vpextrw $1, %xmm2, %ecx -; AVX2-NEXT: vpextrw $1, %xmm0, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vmovd %xmm2, %ecx -; AVX2-NEXT: vmovd %xmm0, %esi -; AVX2-NEXT: addw %cx, %si -; AVX2-NEXT: cmovbl %eax, %esi -; AVX2-NEXT: vmovd %esi, %xmm5 -; AVX2-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5 -; AVX2-NEXT: vpextrw $2, %xmm2, %ecx -; AVX2-NEXT: vpextrw $2, %xmm0, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 -; AVX2-NEXT: vpextrw $3, %xmm2, %ecx -; AVX2-NEXT: vpextrw $3, %xmm0, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $3, %edx, %xmm5, %xmm5 -; AVX2-NEXT: vpextrw $4, %xmm2, %ecx -; AVX2-NEXT: vpextrw $4, %xmm0, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $4, %edx, %xmm5, %xmm5 -; AVX2-NEXT: vpextrw $5, %xmm2, %ecx -; AVX2-NEXT: vpextrw $5, %xmm0, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $5, %edx, %xmm5, %xmm5 -; AVX2-NEXT: vpextrw $6, %xmm2, %ecx -; AVX2-NEXT: vpextrw $6, %xmm0, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $6, %edx, %xmm5, %xmm5 -; AVX2-NEXT: vpextrw $7, %xmm2, %ecx -; AVX2-NEXT: vpextrw $7, %xmm0, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $7, %edx, %xmm5, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm2 -; AVX2-NEXT: vpextrw $1, %xmm2, %ecx -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-NEXT: vpextrw $1, %xmm4, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vmovd %xmm2, %ecx -; AVX2-NEXT: vmovd %xmm4, %esi -; AVX2-NEXT: addw %cx, %si -; AVX2-NEXT: cmovbl %eax, %esi -; AVX2-NEXT: vmovd %esi, %xmm5 -; AVX2-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5 -; AVX2-NEXT: vpextrw $2, %xmm2, %ecx -; AVX2-NEXT: vpextrw $2, %xmm4, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 -; AVX2-NEXT: vpextrw $3, %xmm2, %ecx -; AVX2-NEXT: vpextrw $3, %xmm4, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $3, %edx, %xmm5, %xmm5 -; AVX2-NEXT: vpextrw $4, %xmm2, %ecx -; AVX2-NEXT: vpextrw $4, %xmm4, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $4, %edx, %xmm5, %xmm5 -; AVX2-NEXT: vpextrw $5, %xmm2, %ecx -; AVX2-NEXT: vpextrw $5, %xmm4, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $5, %edx, %xmm5, %xmm5 -; AVX2-NEXT: vpextrw $6, %xmm2, %ecx -; 
AVX2-NEXT: vpextrw $6, %xmm4, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $6, %edx, %xmm5, %xmm5 -; AVX2-NEXT: vpextrw $7, %xmm2, %ecx -; AVX2-NEXT: vpextrw $7, %xmm4, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $7, %edx, %xmm5, %xmm2 -; AVX2-NEXT: vpextrw $1, %xmm3, %ecx -; AVX2-NEXT: vpextrw $1, %xmm1, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vmovd %xmm3, %ecx -; AVX2-NEXT: vmovd %xmm1, %esi -; AVX2-NEXT: addw %cx, %si -; AVX2-NEXT: cmovbl %eax, %esi -; AVX2-NEXT: vmovd %esi, %xmm4 -; AVX2-NEXT: vpinsrw $1, %edx, %xmm4, %xmm4 -; AVX2-NEXT: vpextrw $2, %xmm3, %ecx -; AVX2-NEXT: vpextrw $2, %xmm1, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 -; AVX2-NEXT: vpextrw $3, %xmm3, %ecx -; AVX2-NEXT: vpextrw $3, %xmm1, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $3, %edx, %xmm4, %xmm4 -; AVX2-NEXT: vpextrw $4, %xmm3, %ecx -; AVX2-NEXT: vpextrw $4, %xmm1, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $4, %edx, %xmm4, %xmm4 -; AVX2-NEXT: vpextrw $5, %xmm3, %ecx -; AVX2-NEXT: vpextrw $5, %xmm1, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $5, %edx, %xmm4, %xmm4 -; AVX2-NEXT: vpextrw $6, %xmm3, %ecx -; AVX2-NEXT: vpextrw $6, %xmm1, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $6, %edx, %xmm4, %xmm4 -; AVX2-NEXT: vpextrw $7, %xmm3, %ecx -; AVX2-NEXT: vpextrw $7, %xmm1, %edx -; AVX2-NEXT: addw %cx, %dx -; AVX2-NEXT: cmovbl %eax, %edx -; AVX2-NEXT: vpinsrw $7, %edx, %xmm4, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-NEXT: retq -; -; AVX512-LABEL: v32i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm2 -; AVX512-NEXT: vpextrw $1, %xmm2, %eax -; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm3 -; AVX512-NEXT: vpextrw $1, %xmm3, %ecx -; AVX512-NEXT: addw %ax, %cx -; AVX512-NEXT: movl $65535, %eax # imm = 0xFFFF -; AVX512-NEXT: cmovbl %eax, %ecx -; AVX512-NEXT: vmovd %xmm2, %edx -; AVX512-NEXT: vmovd %xmm3, %esi -; AVX512-NEXT: addw %dx, %si -; AVX512-NEXT: cmovbl %eax, %esi -; AVX512-NEXT: vmovd %esi, %xmm4 -; AVX512-NEXT: vpinsrw $1, %ecx, %xmm4, %xmm4 -; AVX512-NEXT: vpextrw $2, %xmm2, %ecx -; AVX512-NEXT: vpextrw $2, %xmm3, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 -; AVX512-NEXT: vpextrw $3, %xmm2, %ecx -; AVX512-NEXT: vpextrw $3, %xmm3, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $3, %edx, %xmm4, %xmm4 -; AVX512-NEXT: vpextrw $4, %xmm2, %ecx -; AVX512-NEXT: vpextrw $4, %xmm3, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $4, %edx, %xmm4, %xmm4 -; AVX512-NEXT: vpextrw $5, %xmm2, %ecx -; AVX512-NEXT: vpextrw $5, %xmm3, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $5, %edx, %xmm4, %xmm4 -; AVX512-NEXT: vpextrw $6, %xmm2, %ecx -; AVX512-NEXT: vpextrw $6, %xmm3, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $6, %edx, %xmm4, %xmm4 -; AVX512-NEXT: vpextrw $7, %xmm2, %ecx -; AVX512-NEXT: vpextrw $7, %xmm3, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $7, %edx, %xmm4, %xmm2 -; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm3 
-; AVX512-NEXT: vpextrw $1, %xmm3, %ecx -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm4 -; AVX512-NEXT: vpextrw $1, %xmm4, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vmovd %xmm3, %ecx -; AVX512-NEXT: vmovd %xmm4, %esi -; AVX512-NEXT: addw %cx, %si -; AVX512-NEXT: cmovbl %eax, %esi -; AVX512-NEXT: vmovd %esi, %xmm5 -; AVX512-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5 -; AVX512-NEXT: vpextrw $2, %xmm3, %ecx -; AVX512-NEXT: vpextrw $2, %xmm4, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 -; AVX512-NEXT: vpextrw $3, %xmm3, %ecx -; AVX512-NEXT: vpextrw $3, %xmm4, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $3, %edx, %xmm5, %xmm5 -; AVX512-NEXT: vpextrw $4, %xmm3, %ecx -; AVX512-NEXT: vpextrw $4, %xmm4, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $4, %edx, %xmm5, %xmm5 -; AVX512-NEXT: vpextrw $5, %xmm3, %ecx -; AVX512-NEXT: vpextrw $5, %xmm4, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $5, %edx, %xmm5, %xmm5 -; AVX512-NEXT: vpextrw $6, %xmm3, %ecx -; AVX512-NEXT: vpextrw $6, %xmm4, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $6, %edx, %xmm5, %xmm5 -; AVX512-NEXT: vpextrw $7, %xmm3, %ecx -; AVX512-NEXT: vpextrw $7, %xmm4, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $7, %edx, %xmm5, %xmm3 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512-NEXT: vpextrw $1, %xmm3, %ecx -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512-NEXT: vpextrw $1, %xmm4, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vmovd %xmm3, %ecx -; AVX512-NEXT: vmovd %xmm4, %esi -; AVX512-NEXT: addw %cx, %si -; AVX512-NEXT: cmovbl %eax, %esi -; AVX512-NEXT: vmovd %esi, %xmm5 -; AVX512-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5 -; AVX512-NEXT: vpextrw $2, %xmm3, %ecx -; AVX512-NEXT: vpextrw $2, %xmm4, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 -; AVX512-NEXT: vpextrw $3, %xmm3, %ecx -; AVX512-NEXT: vpextrw $3, %xmm4, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $3, %edx, %xmm5, %xmm5 -; AVX512-NEXT: vpextrw $4, %xmm3, %ecx -; AVX512-NEXT: vpextrw $4, %xmm4, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $4, %edx, %xmm5, %xmm5 -; AVX512-NEXT: vpextrw $5, %xmm3, %ecx -; AVX512-NEXT: vpextrw $5, %xmm4, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $5, %edx, %xmm5, %xmm5 -; AVX512-NEXT: vpextrw $6, %xmm3, %ecx -; AVX512-NEXT: vpextrw $6, %xmm4, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $6, %edx, %xmm5, %xmm5 -; AVX512-NEXT: vpextrw $7, %xmm3, %ecx -; AVX512-NEXT: vpextrw $7, %xmm4, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $7, %edx, %xmm5, %xmm3 -; AVX512-NEXT: vpextrw $1, %xmm1, %ecx -; AVX512-NEXT: vpextrw $1, %xmm0, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vmovd %xmm1, %ecx -; AVX512-NEXT: vmovd %xmm0, %esi -; AVX512-NEXT: addw %cx, %si -; AVX512-NEXT: cmovbl %eax, %esi -; AVX512-NEXT: vmovd %esi, %xmm4 -; AVX512-NEXT: vpinsrw $1, %edx, %xmm4, %xmm4 
-; AVX512-NEXT: vpextrw $2, %xmm1, %ecx -; AVX512-NEXT: vpextrw $2, %xmm0, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 -; AVX512-NEXT: vpextrw $3, %xmm1, %ecx -; AVX512-NEXT: vpextrw $3, %xmm0, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $3, %edx, %xmm4, %xmm4 -; AVX512-NEXT: vpextrw $4, %xmm1, %ecx -; AVX512-NEXT: vpextrw $4, %xmm0, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $4, %edx, %xmm4, %xmm4 -; AVX512-NEXT: vpextrw $5, %xmm1, %ecx -; AVX512-NEXT: vpextrw $5, %xmm0, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $5, %edx, %xmm4, %xmm4 -; AVX512-NEXT: vpextrw $6, %xmm1, %ecx -; AVX512-NEXT: vpextrw $6, %xmm0, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $6, %edx, %xmm4, %xmm4 -; AVX512-NEXT: vpextrw $7, %xmm1, %ecx -; AVX512-NEXT: vpextrw $7, %xmm0, %edx -; AVX512-NEXT: addw %cx, %dx -; AVX512-NEXT: cmovbl %eax, %edx -; AVX512-NEXT: vpinsrw $7, %edx, %xmm4, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-NEXT: retq - %z = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %x, <32 x i16> %y) - ret <32 x i16> %z -} - -; Too narrow vectors, legalized by widening. - -define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind { -; SSE2-LABEL: v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE2-NEXT: pextrw $7, %xmm1, %eax -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: pextrw $7, %xmm0, %ecx -; SSE2-NEXT: addw %ax, %cx -; SSE2-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: cmovbl %eax, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: pextrw $6, %xmm1, %ecx -; SSE2-NEXT: pextrw $6, %xmm0, %esi -; SSE2-NEXT: addw %cx, %si -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: pextrw $5, %xmm1, %ecx -; SSE2-NEXT: pextrw $5, %xmm0, %esi -; SSE2-NEXT: addw %cx, %si -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm4 -; SSE2-NEXT: pextrw $4, %xmm1, %ecx -; SSE2-NEXT: pextrw $4, %xmm0, %esi -; SSE2-NEXT: addw %cx, %si -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE2-NEXT: pextrw $3, %xmm1, %ecx -; SSE2-NEXT: pextrw $3, %xmm0, %esi -; SSE2-NEXT: addw %cx, %si -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm3 -; SSE2-NEXT: pextrw $2, %xmm1, %ecx -; SSE2-NEXT: pextrw $2, %xmm0, %esi -; SSE2-NEXT: addw %cx, %si -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE2-NEXT: pextrw $1, %xmm1, %ecx -; SSE2-NEXT: pextrw $1, %xmm0, %esi -; SSE2-NEXT: addw %cx, %si -; 
SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm3 -; SSE2-NEXT: movd %xmm1, %ecx -; SSE2-NEXT: movd %xmm0, %esi -; SSE2-NEXT: addw %cx, %si -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movq %xmm0, (%rdx) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v8i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSSE3-NEXT: pextrw $7, %xmm1, %eax -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSSE3-NEXT: pextrw $7, %xmm0, %ecx -; SSSE3-NEXT: addw %ax, %cx -; SSSE3-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSSE3-NEXT: cmovbl %eax, %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: pextrw $6, %xmm1, %ecx -; SSSE3-NEXT: pextrw $6, %xmm0, %esi -; SSSE3-NEXT: addw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSSE3-NEXT: pextrw $5, %xmm1, %ecx -; SSSE3-NEXT: pextrw $5, %xmm0, %esi -; SSSE3-NEXT: addw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm4 -; SSSE3-NEXT: pextrw $4, %xmm1, %ecx -; SSSE3-NEXT: pextrw $4, %xmm0, %esi -; SSSE3-NEXT: addw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSSE3-NEXT: pextrw $3, %xmm1, %ecx -; SSSE3-NEXT: pextrw $3, %xmm0, %esi -; SSSE3-NEXT: addw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm3 -; SSSE3-NEXT: pextrw $2, %xmm1, %ecx -; SSSE3-NEXT: pextrw $2, %xmm0, %esi -; SSSE3-NEXT: addw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSSE3-NEXT: pextrw $1, %xmm1, %ecx -; SSSE3-NEXT: pextrw $1, %xmm0, %esi -; SSSE3-NEXT: addw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm3 -; SSSE3-NEXT: movd %xmm1, %ecx -; SSSE3-NEXT: movd %xmm0, %esi -; SSSE3-NEXT: addw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSSE3-NEXT: psrlw $8, %xmm0 -; SSSE3-NEXT: packuswb %xmm0, %xmm0 -; SSSE3-NEXT: movq %xmm0, (%rdx) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE41-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE41-NEXT: pextrw $1, %xmm1, %eax -; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE41-NEXT: pextrw $1, %xmm0, %ecx -; SSE41-NEXT: addw %ax, %cx -; SSE41-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSE41-NEXT: cmovbl %eax, %ecx -; SSE41-NEXT: movd %xmm1, %esi -; SSE41-NEXT: movd %xmm0, %edi -; SSE41-NEXT: addw %si, %di -; SSE41-NEXT: cmovbl %eax, %edi -; SSE41-NEXT: movd %edi, %xmm2 -; SSE41-NEXT: pinsrw $1, %ecx, %xmm2 -; SSE41-NEXT: pextrw $2, %xmm1, %ecx -; SSE41-NEXT: pextrw $2, %xmm0, %esi -; SSE41-NEXT: addw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: pinsrw $2, %esi, %xmm2 -; SSE41-NEXT: pextrw $3, %xmm1, %ecx -; SSE41-NEXT: pextrw $3, %xmm0, %esi -; SSE41-NEXT: addw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: pinsrw $3, %esi, %xmm2 -; SSE41-NEXT: pextrw $4, %xmm1, %ecx -; SSE41-NEXT: pextrw $4, %xmm0, %esi -; SSE41-NEXT: addw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: pinsrw $4, %esi, %xmm2 -; SSE41-NEXT: pextrw $5, %xmm1, %ecx -; SSE41-NEXT: pextrw $5, %xmm0, %esi -; SSE41-NEXT: addw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: pinsrw $5, %esi, %xmm2 -; SSE41-NEXT: pextrw $6, %xmm1, %ecx -; SSE41-NEXT: pextrw $6, %xmm0, %esi -; SSE41-NEXT: addw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: pinsrw $6, %esi, %xmm2 -; SSE41-NEXT: pextrw $7, %xmm1, %ecx -; SSE41-NEXT: pextrw $7, %xmm0, %esi -; SSE41-NEXT: addw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: pinsrw $7, %esi, %xmm2 -; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: packuswb %xmm0, %xmm2 -; SSE41-NEXT: movq %xmm2, (%rdx) -; SSE41-NEXT: retq +; AVX2-LABEL: v32i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddusw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddusw %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: v32i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %z = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %x, <32 x i16> %y) + ret <32 x i16> %z +} + +; Too narrow vectors, legalized by widening. 
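The "legalized by widening" comment above is the behavioral change these checks exercise: instead of promoting each lane to a wider integer and expanding the saturation with the per-element add/cmov sequences this patch deletes, the sub-128-bit vector types are widened with undef lanes out to the legal 128-bit type, where the whole operation selects to a single paddusb/paddusw. A minimal IR sketch of the <8 x i8> case, with a hypothetical function name (illustrative only, not one of the generated checks):

declare <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8>, <8 x i8>)

; <8 x i8> is not a legal type on x86. With ISD::UADDSAT marked Custom for it
; ("Use widening instead of promotion" above), type legalization widens %x and
; %y to <16 x i8> with undef upper lanes, and the widened uadd.sat selects to
; one paddusb, matching the new v8i8 checks that follow.
define <8 x i8> @uadd_sat_widen_sketch(<8 x i8> %x, <8 x i8> %y) {
  %z = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> %x, <8 x i8> %y)
  ret <8 x i8> %z
}

The undef upper lanes are harmless here because unsigned saturating add operates lane-wise, so the junk results in those lanes are simply never stored.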
+ +define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind { +; SSE-LABEL: v8i8: +; SSE: # %bb.0: +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: paddusb %xmm0, %xmm1 +; SSE-NEXT: movq %xmm1, (%rdx) +; SSE-NEXT: retq ; ; AVX1-LABEL: v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX1-NEXT: vpextrw $1, %xmm0, %eax -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-NEXT: vpextrw $1, %xmm1, %ecx -; AVX1-NEXT: addw %ax, %cx -; AVX1-NEXT: movl $65535, %eax # imm = 0xFFFF -; AVX1-NEXT: cmovbl %eax, %ecx -; AVX1-NEXT: vmovd %xmm0, %esi -; AVX1-NEXT: vmovd %xmm1, %edi -; AVX1-NEXT: addw %si, %di -; AVX1-NEXT: cmovbl %eax, %edi -; AVX1-NEXT: vmovd %edi, %xmm2 -; AVX1-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $2, %xmm0, %ecx -; AVX1-NEXT: vpextrw $2, %xmm1, %esi -; AVX1-NEXT: addw %cx, %si -; AVX1-NEXT: cmovbl %eax, %esi -; AVX1-NEXT: vpinsrw $2, %esi, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $3, %xmm0, %ecx -; AVX1-NEXT: vpextrw $3, %xmm1, %esi -; AVX1-NEXT: addw %cx, %si -; AVX1-NEXT: cmovbl %eax, %esi -; AVX1-NEXT: vpinsrw $3, %esi, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $4, %xmm0, %ecx -; AVX1-NEXT: vpextrw $4, %xmm1, %esi -; AVX1-NEXT: addw %cx, %si -; AVX1-NEXT: cmovbl %eax, %esi -; AVX1-NEXT: vpinsrw $4, %esi, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $5, %xmm0, %ecx -; AVX1-NEXT: vpextrw $5, %xmm1, %esi -; AVX1-NEXT: addw %cx, %si -; AVX1-NEXT: cmovbl %eax, %esi -; AVX1-NEXT: vpinsrw $5, %esi, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $6, %xmm0, %ecx -; AVX1-NEXT: vpextrw $6, %xmm1, %esi -; AVX1-NEXT: addw %cx, %si -; AVX1-NEXT: cmovbl %eax, %esi -; AVX1-NEXT: vpinsrw $6, %esi, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $7, %xmm0, %ecx -; AVX1-NEXT: vpextrw $7, %xmm1, %esi -; AVX1-NEXT: addw %cx, %si -; AVX1-NEXT: cmovbl %eax, %esi -; AVX1-NEXT: vpinsrw $7, %esi, %xmm2, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, (%rdx) ; AVX1-NEXT: retq ; ; AVX2-LABEL: v8i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-NEXT: vpextrw $1, %xmm0, %eax -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-NEXT: vpextrw $1, %xmm1, %ecx -; AVX2-NEXT: addw %ax, %cx -; AVX2-NEXT: movl $65535, %eax # imm = 0xFFFF -; AVX2-NEXT: cmovbl %eax, %ecx -; AVX2-NEXT: vmovd %xmm0, %esi -; AVX2-NEXT: vmovd %xmm1, %edi -; AVX2-NEXT: addw %si, %di -; AVX2-NEXT: cmovbl %eax, %edi -; AVX2-NEXT: vmovd %edi, %xmm2 -; AVX2-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $2, %xmm0, %ecx -; AVX2-NEXT: vpextrw $2, %xmm1, %esi -; AVX2-NEXT: addw %cx, %si -; AVX2-NEXT: cmovbl %eax, %esi -; AVX2-NEXT: vpinsrw $2, 
%esi, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $3, %xmm0, %ecx -; AVX2-NEXT: vpextrw $3, %xmm1, %esi -; AVX2-NEXT: addw %cx, %si -; AVX2-NEXT: cmovbl %eax, %esi -; AVX2-NEXT: vpinsrw $3, %esi, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $4, %xmm0, %ecx -; AVX2-NEXT: vpextrw $4, %xmm1, %esi -; AVX2-NEXT: addw %cx, %si -; AVX2-NEXT: cmovbl %eax, %esi -; AVX2-NEXT: vpinsrw $4, %esi, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $5, %xmm0, %ecx -; AVX2-NEXT: vpextrw $5, %xmm1, %esi -; AVX2-NEXT: addw %cx, %si -; AVX2-NEXT: cmovbl %eax, %esi -; AVX2-NEXT: vpinsrw $5, %esi, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $6, %xmm0, %ecx -; AVX2-NEXT: vpextrw $6, %xmm1, %esi -; AVX2-NEXT: addw %cx, %si -; AVX2-NEXT: cmovbl %eax, %esi -; AVX2-NEXT: vpinsrw $6, %esi, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $7, %xmm0, %ecx -; AVX2-NEXT: vpextrw $7, %xmm1, %esi -; AVX2-NEXT: addw %cx, %si -; AVX2-NEXT: cmovbl %eax, %esi -; AVX2-NEXT: vpinsrw $7, %esi, %xmm2, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, (%rdx) ; AVX2-NEXT: retq ; ; AVX512-LABEL: v8i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512-NEXT: vpextrw $1, %xmm0, %eax -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512-NEXT: vpextrw $1, %xmm1, %ecx -; AVX512-NEXT: addw %ax, %cx -; AVX512-NEXT: movl $65535, %eax # imm = 0xFFFF -; AVX512-NEXT: cmovbl %eax, %ecx -; AVX512-NEXT: vmovd %xmm0, %esi -; AVX512-NEXT: vmovd %xmm1, %edi -; AVX512-NEXT: addw %si, %di -; AVX512-NEXT: cmovbl %eax, %edi -; AVX512-NEXT: vmovd %edi, %xmm2 -; AVX512-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 -; AVX512-NEXT: vpextrw $2, %xmm0, %ecx -; AVX512-NEXT: vpextrw $2, %xmm1, %esi -; AVX512-NEXT: addw %cx, %si -; AVX512-NEXT: cmovbl %eax, %esi -; AVX512-NEXT: vpinsrw $2, %esi, %xmm2, %xmm2 -; AVX512-NEXT: vpextrw $3, %xmm0, %ecx -; AVX512-NEXT: vpextrw $3, %xmm1, %esi -; AVX512-NEXT: addw %cx, %si -; AVX512-NEXT: cmovbl %eax, %esi -; AVX512-NEXT: vpinsrw $3, %esi, %xmm2, %xmm2 -; AVX512-NEXT: vpextrw $4, %xmm0, %ecx -; AVX512-NEXT: vpextrw $4, %xmm1, %esi -; AVX512-NEXT: addw %cx, %si -; AVX512-NEXT: cmovbl %eax, %esi -; AVX512-NEXT: vpinsrw $4, %esi, %xmm2, %xmm2 -; AVX512-NEXT: vpextrw $5, %xmm0, %ecx -; AVX512-NEXT: vpextrw $5, %xmm1, %esi -; AVX512-NEXT: addw %cx, %si -; AVX512-NEXT: cmovbl %eax, %esi -; AVX512-NEXT: vpinsrw $5, %esi, %xmm2, %xmm2 -; AVX512-NEXT: vpextrw $6, %xmm0, %ecx -; AVX512-NEXT: vpextrw $6, %xmm1, %esi -; AVX512-NEXT: addw %cx, %si -; AVX512-NEXT: cmovbl %eax, %esi -; AVX512-NEXT: vpinsrw $6, %esi, %xmm2, %xmm2 -; AVX512-NEXT: vpextrw $7, %xmm0, %ecx -; AVX512-NEXT: vpextrw $7, %xmm1, %esi -; AVX512-NEXT: addw %cx, %si -; AVX512-NEXT: cmovbl %eax, %esi -; AVX512-NEXT: vpinsrw $7, %esi, %xmm2, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512-NEXT: vpmovwb %xmm0, (%rdx) ; 
AVX512-NEXT: retq %x = load <8 x i8>, <8 x i8>* %px @@ -9478,183 +236,27 @@ } define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind { -; SSE2-LABEL: v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: pslld $24, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: pslld $24, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] -; SSE2-NEXT: movd %xmm2, %ecx -; SSE2-NEXT: addl %eax, %ecx -; SSE2-NEXT: movl $-1, %eax -; SSE2-NEXT: cmovbl %eax, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] -; SSE2-NEXT: movd %xmm3, %ecx -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm3, %esi -; SSE2-NEXT: addl %ecx, %esi -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE2-NEXT: movd %xmm1, %ecx -; SSE2-NEXT: movd %xmm0, %esi -; SSE2-NEXT: addl %ecx, %esi -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; SSE2-NEXT: movd %xmm1, %ecx -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-NEXT: movd %xmm0, %esi -; SSE2-NEXT: addl %ecx, %esi -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: psrld $24, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: movd %xmm2, (%rdx) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v4i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,1,255,255,255,2,255,255,255,3] -; SSSE3-NEXT: pshufb %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,1,2,3] -; SSSE3-NEXT: movd %xmm3, %eax -; SSSE3-NEXT: pshufb %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] -; SSSE3-NEXT: movd %xmm2, %ecx -; SSSE3-NEXT: addl %eax, %ecx -; SSSE3-NEXT: movl $-1, %eax -; SSSE3-NEXT: cmovbl %eax, %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] -; SSSE3-NEXT: movd %xmm3, %ecx -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; SSSE3-NEXT: movd %xmm3, %esi -; SSSE3-NEXT: addl %ecx, %esi -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm3 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSSE3-NEXT: movd %xmm1, %ecx -; SSSE3-NEXT: movd %xmm0, %esi -; SSSE3-NEXT: addl %ecx, %esi -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; SSSE3-NEXT: movd %xmm1, %ecx -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSSE3-NEXT: movd %xmm0, %esi -; SSSE3-NEXT: addl %ecx, %esi -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; 
SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: movd %xmm2, (%rdx) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: pslld $24, %xmm1 -; SSE41-NEXT: pextrd $1, %xmm1, %eax -; SSE41-NEXT: pslld $24, %xmm0 -; SSE41-NEXT: pextrd $1, %xmm0, %ecx -; SSE41-NEXT: addl %eax, %ecx -; SSE41-NEXT: movl $-1, %eax -; SSE41-NEXT: cmovbl %eax, %ecx -; SSE41-NEXT: movd %xmm1, %esi -; SSE41-NEXT: movd %xmm0, %edi -; SSE41-NEXT: addl %esi, %edi -; SSE41-NEXT: cmovbl %eax, %edi -; SSE41-NEXT: movd %edi, %xmm2 -; SSE41-NEXT: pinsrd $1, %ecx, %xmm2 -; SSE41-NEXT: pextrd $2, %xmm1, %ecx -; SSE41-NEXT: pextrd $2, %xmm0, %esi -; SSE41-NEXT: addl %ecx, %esi -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: pinsrd $2, %esi, %xmm2 -; SSE41-NEXT: pextrd $3, %xmm1, %ecx -; SSE41-NEXT: pextrd $3, %xmm0, %esi -; SSE41-NEXT: addl %ecx, %esi -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: pinsrd $3, %esi, %xmm2 -; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] -; SSE41-NEXT: movd %xmm2, (%rdx) -; SSE41-NEXT: retq +; SSE-LABEL: v4i8: +; SSE: # %bb.0: +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: paddusb %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, (%rdx) +; SSE-NEXT: retq ; ; AVX1-LABEL: v4i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpslld $24, %xmm1, %xmm1 -; AVX1-NEXT: vpextrd $1, %xmm1, %eax -; AVX1-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX1-NEXT: vpextrd $1, %xmm0, %ecx -; AVX1-NEXT: addl %eax, %ecx -; AVX1-NEXT: movl $-1, %eax -; AVX1-NEXT: cmovbl %eax, %ecx -; AVX1-NEXT: vmovd %xmm1, %esi -; AVX1-NEXT: vmovd %xmm0, %edi -; AVX1-NEXT: addl %esi, %edi -; AVX1-NEXT: cmovbl %eax, %edi -; AVX1-NEXT: vmovd %edi, %xmm2 -; AVX1-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrd $2, %xmm1, %ecx -; AVX1-NEXT: vpextrd $2, %xmm0, %esi -; AVX1-NEXT: addl %ecx, %esi -; AVX1-NEXT: cmovbl %eax, %esi -; AVX1-NEXT: vpinsrd $2, %esi, %xmm2, %xmm2 -; AVX1-NEXT: vpextrd $3, %xmm1, %ecx -; AVX1-NEXT: vpextrd $3, %xmm0, %esi -; AVX1-NEXT: addl %ecx, %esi -; AVX1-NEXT: cmovbl %eax, %esi -; AVX1-NEXT: vpinsrd $3, %esi, %xmm2, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, (%rdx) ; AVX1-NEXT: retq ; ; AVX2-LABEL: v4i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX2-NEXT: vpslld $24, %xmm1, %xmm1 -; AVX2-NEXT: vpextrd $1, %xmm1, %eax -; AVX2-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX2-NEXT: vpextrd $1, %xmm0, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: movl $-1, %eax -; 
AVX2-NEXT: cmovbl %eax, %ecx -; AVX2-NEXT: vmovd %xmm1, %esi -; AVX2-NEXT: vmovd %xmm0, %edi -; AVX2-NEXT: addl %esi, %edi -; AVX2-NEXT: cmovbl %eax, %edi -; AVX2-NEXT: vmovd %edi, %xmm2 -; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrd $2, %xmm1, %ecx -; AVX2-NEXT: vpextrd $2, %xmm0, %esi -; AVX2-NEXT: addl %ecx, %esi -; AVX2-NEXT: cmovbl %eax, %esi -; AVX2-NEXT: vpinsrd $2, %esi, %xmm2, %xmm2 -; AVX2-NEXT: vpextrd $3, %xmm1, %ecx -; AVX2-NEXT: vpextrd $3, %xmm0, %esi -; AVX2-NEXT: addl %ecx, %esi -; AVX2-NEXT: cmovbl %eax, %esi -; AVX2-NEXT: vpinsrd $3, %esi, %xmm2, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, (%rdx) ; AVX2-NEXT: retq ; @@ -9662,31 +264,8 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,1,255,255,255,2,255,255,255,3] -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpextrd $1, %xmm1, %eax -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpextrd $1, %xmm0, %ecx -; AVX512-NEXT: addl %eax, %ecx -; AVX512-NEXT: movl $-1, %eax -; AVX512-NEXT: cmovbl %eax, %ecx -; AVX512-NEXT: vmovd %xmm1, %esi -; AVX512-NEXT: vmovd %xmm0, %edi -; AVX512-NEXT: addl %esi, %edi -; AVX512-NEXT: cmovbl %eax, %edi -; AVX512-NEXT: vmovd %edi, %xmm2 -; AVX512-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 -; AVX512-NEXT: vpextrd $2, %xmm1, %ecx -; AVX512-NEXT: vpextrd $2, %xmm0, %esi -; AVX512-NEXT: addl %ecx, %esi -; AVX512-NEXT: cmovbl %eax, %esi -; AVX512-NEXT: vpinsrd $2, %esi, %xmm2, %xmm2 -; AVX512-NEXT: vpextrd $3, %xmm1, %ecx -; AVX512-NEXT: vpextrd $3, %xmm0, %esi -; AVX512-NEXT: addl %ecx, %esi -; AVX512-NEXT: cmovbl %eax, %esi -; AVX512-NEXT: vpinsrd $3, %esi, %xmm2, %xmm0 -; AVX512-NEXT: vpsrld $24, %xmm0, %xmm0 +; AVX512-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512-NEXT: vpmovdb %xmm0, (%rdx) ; AVX512-NEXT: retq %x = load <4 x i8>, <4 x i8>* %px @@ -9701,35 +280,10 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movzwl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; SSE2-NEXT: movzwl (%rsi), %eax ; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE2-NEXT: psllq $56, %xmm1 -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: psllq $56, %xmm0 -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: addq %rax, %rcx -; SSE2-NEXT: movq $-1, %rax -; SSE2-NEXT: cmovbq %rax, %rcx -; SSE2-NEXT: movq %rcx, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm1, %rcx -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rsi -; SSE2-NEXT: addq %rcx, %rsi -; SSE2-NEXT: cmovbq %rax, %rsi -; SSE2-NEXT: movq %rsi, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = 
xmm2[0],xmm0[0] -; SSE2-NEXT: psrlq $56, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: paddusb %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: movw %ax, (%rdx) ; SSE2-NEXT: retq ; @@ -9739,91 +293,38 @@ ; SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: movzwl (%rsi), %eax ; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,1] -; SSSE3-NEXT: pshufb %xmm2, %xmm1 -; SSSE3-NEXT: movq %xmm1, %rax -; SSSE3-NEXT: pshufb %xmm2, %xmm0 -; SSSE3-NEXT: movq %xmm0, %rcx -; SSSE3-NEXT: addq %rax, %rcx -; SSSE3-NEXT: movq $-1, %rax -; SSSE3-NEXT: cmovbq %rax, %rcx -; SSSE3-NEXT: movq %rcx, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSSE3-NEXT: movq %xmm1, %rcx -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSSE3-NEXT: movq %xmm0, %rsi -; SSSE3-NEXT: addq %rcx, %rsi -; SSSE3-NEXT: cmovbq %rax, %rsi -; SSSE3-NEXT: movq %rsi, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: paddusb %xmm0, %xmm1 +; SSSE3-NEXT: movd %xmm1, %eax ; SSSE3-NEXT: movw %ax, (%rdx) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v2i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: psllq $56, %xmm1 -; SSE41-NEXT: pextrq $1, %xmm1, %rax -; SSE41-NEXT: psllq $56, %xmm0 -; SSE41-NEXT: pextrq $1, %xmm0, %rcx -; SSE41-NEXT: addq %rax, %rcx -; SSE41-NEXT: movq $-1, %rax -; SSE41-NEXT: cmovbq %rax, %rcx -; SSE41-NEXT: movq %rcx, %xmm2 -; SSE41-NEXT: movq %xmm1, %rcx -; SSE41-NEXT: movq %xmm0, %rsi -; SSE41-NEXT: addq %rcx, %rsi -; SSE41-NEXT: cmovbq %rax, %rsi -; SSE41-NEXT: movq %rsi, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; SSE41-NEXT: pextrw $0, %xmm0, (%rdx) +; SSE41-NEXT: movzwl (%rdi), %eax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: movzwl (%rsi), %eax +; SSE41-NEXT: movd %eax, %xmm1 +; SSE41-NEXT: paddusb %xmm0, %xmm1 +; SSE41-NEXT: pextrw $0, %xmm1, (%rdx) ; SSE41-NEXT: retq ; ; AVX1-LABEL: v2i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsllq $56, %xmm1, %xmm1 -; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rcx -; AVX1-NEXT: addq %rax, %rcx -; AVX1-NEXT: movq $-1, %rax -; AVX1-NEXT: cmovbq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm2 -; AVX1-NEXT: vmovq %xmm1, %rcx -; AVX1-NEXT: vmovq %xmm0, %rsi -; AVX1-NEXT: addq %rcx, %rsi -; AVX1-NEXT: cmovbq %rax, %rsi -; AVX1-NEXT: vmovq %rsi, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: movzwl (%rdi), %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: movzwl (%rsi), %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpextrw $0, %xmm0, (%rdx) ; AVX1-NEXT: retq ; ; AVX2-LABEL: 
v2i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpsllq $56, %xmm1, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX2-NEXT: vpextrq $1, %xmm0, %rcx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: movq $-1, %rax -; AVX2-NEXT: cmovbq %rax, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm2 -; AVX2-NEXT: vmovq %xmm1, %rcx -; AVX2-NEXT: vmovq %xmm0, %rsi -; AVX2-NEXT: addq %rcx, %rsi -; AVX2-NEXT: cmovbq %rax, %rsi -; AVX2-NEXT: vmovq %rsi, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: movzwl (%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: movzwl (%rsi), %eax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpextrw $0, %xmm0, (%rdx) ; AVX2-NEXT: retq ; @@ -9833,22 +334,8 @@ ; AVX512-NEXT: vmovd %eax, %xmm0 ; AVX512-NEXT: movzwl (%rsi), %eax ; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,1] -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpextrq $1, %xmm1, %rax -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpextrq $1, %xmm0, %rcx -; AVX512-NEXT: addq %rax, %rcx -; AVX512-NEXT: movq $-1, %rax -; AVX512-NEXT: cmovbq %rax, %rcx -; AVX512-NEXT: vmovq %rcx, %xmm2 -; AVX512-NEXT: vmovq %xmm1, %rcx -; AVX512-NEXT: vmovq %xmm0, %rsi -; AVX512-NEXT: addq %rcx, %rsi -; AVX512-NEXT: cmovbq %rax, %rsi -; AVX512-NEXT: vmovq %rsi, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX512-NEXT: vpsrlq $56, %xmm0, %xmm0 +; AVX512-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpmovqb %xmm0, (%rdx) ; AVX512-NEXT: retq %x = load <2 x i8>, <2 x i8>* %px @@ -9859,157 +346,19 @@ } define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind { -; SSE2-LABEL: v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; SSE2-NEXT: movd %xmm1, %ecx -; SSE2-NEXT: addl %eax, %ecx -; SSE2-NEXT: movl $-1, %eax -; SSE2-NEXT: cmovbl %eax, %ecx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] -; SSE2-NEXT: movd %xmm2, %ecx -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm2, %esi -; SSE2-NEXT: addl %ecx, %esi -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: movd %xmm3, %ecx -; SSE2-NEXT: movd %xmm0, %esi -; SSE2-NEXT: addl %ecx, %esi -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; SSE2-NEXT: movd %xmm3, %ecx -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; 
SSE2-NEXT: movd %xmm0, %esi -; SSE2-NEXT: addl %ecx, %esi -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: movq %xmm0, (%rdx) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v4i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] -; SSSE3-NEXT: movd %xmm2, %eax -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; SSSE3-NEXT: movd %xmm1, %ecx -; SSSE3-NEXT: addl %eax, %ecx -; SSSE3-NEXT: movl $-1, %eax -; SSSE3-NEXT: cmovbl %eax, %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] -; SSSE3-NEXT: movd %xmm2, %ecx -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSSE3-NEXT: movd %xmm2, %esi -; SSSE3-NEXT: addl %ecx, %esi -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSSE3-NEXT: movd %xmm3, %ecx -; SSSE3-NEXT: movd %xmm0, %esi -; SSSE3-NEXT: addl %ecx, %esi -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; SSSE3-NEXT: movd %xmm3, %ecx -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSSE3-NEXT: movd %xmm0, %esi -; SSSE3-NEXT: addl %ecx, %esi -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,10,11,14,15,14,15],zero,zero -; SSSE3-NEXT: movq %xmm1, (%rdx) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v4i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE41-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE41-NEXT: pextrd $1, %xmm3, %eax -; SSE41-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE41-NEXT: pextrd $1, %xmm2, %ecx -; SSE41-NEXT: addl %eax, %ecx -; SSE41-NEXT: movl $-1, %eax -; SSE41-NEXT: cmovbl %eax, %ecx -; SSE41-NEXT: movd %xmm3, %esi -; SSE41-NEXT: movd %xmm2, %edi -; SSE41-NEXT: addl %esi, %edi -; SSE41-NEXT: cmovbl %eax, %edi -; SSE41-NEXT: movd %edi, %xmm0 -; SSE41-NEXT: pinsrd $1, %ecx, %xmm0 -; SSE41-NEXT: pextrd $2, %xmm3, %ecx -; SSE41-NEXT: pextrd $2, %xmm2, %esi -; SSE41-NEXT: addl %ecx, %esi -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: pinsrd $2, %esi, %xmm0 -; SSE41-NEXT: pextrd $3, %xmm3, %ecx -; SSE41-NEXT: pextrd $3, %xmm2, %esi -; SSE41-NEXT: addl %ecx, %esi -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: pinsrd $3, %esi, %xmm0 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: packusdw %xmm0, %xmm0 -; SSE41-NEXT: movq %xmm0, (%rdx) -; SSE41-NEXT: retq +; SSE-LABEL: v4i16: +; SSE: # %bb.0: 
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: paddusw %xmm0, %xmm1 +; SSE-NEXT: movq %xmm1, (%rdx) +; SSE-NEXT: retq ; ; AVX1-LABEL: v4i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-NEXT: vpextrd $1, %xmm1, %eax -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX1-NEXT: vpextrd $1, %xmm0, %ecx -; AVX1-NEXT: addl %eax, %ecx -; AVX1-NEXT: movl $-1, %eax -; AVX1-NEXT: cmovbl %eax, %ecx -; AVX1-NEXT: vmovd %xmm1, %esi -; AVX1-NEXT: vmovd %xmm0, %edi -; AVX1-NEXT: addl %esi, %edi -; AVX1-NEXT: cmovbl %eax, %edi -; AVX1-NEXT: vmovd %edi, %xmm2 -; AVX1-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrd $2, %xmm1, %ecx -; AVX1-NEXT: vpextrd $2, %xmm0, %esi -; AVX1-NEXT: addl %ecx, %esi -; AVX1-NEXT: cmovbl %eax, %esi -; AVX1-NEXT: vpinsrd $2, %esi, %xmm2, %xmm2 -; AVX1-NEXT: vpextrd $3, %xmm1, %ecx -; AVX1-NEXT: vpextrd $3, %xmm0, %esi -; AVX1-NEXT: addl %ecx, %esi -; AVX1-NEXT: cmovbl %eax, %esi -; AVX1-NEXT: vpinsrd $3, %esi, %xmm2, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, (%rdx) ; AVX1-NEXT: retq ; @@ -10017,32 +366,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-NEXT: vpextrd $1, %xmm1, %eax -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-NEXT: vpextrd $1, %xmm0, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: movl $-1, %eax -; AVX2-NEXT: cmovbl %eax, %ecx -; AVX2-NEXT: vmovd %xmm1, %esi -; AVX2-NEXT: vmovd %xmm0, %edi -; AVX2-NEXT: addl %esi, %edi -; AVX2-NEXT: cmovbl %eax, %edi -; AVX2-NEXT: vmovd %edi, %xmm2 -; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrd $2, %xmm1, %ecx -; AVX2-NEXT: vpextrd $2, %xmm0, %esi -; AVX2-NEXT: addl %ecx, %esi -; AVX2-NEXT: cmovbl %eax, %esi -; AVX2-NEXT: vpinsrd $2, %esi, %xmm2, %xmm2 -; AVX2-NEXT: vpextrd $3, %xmm1, %ecx -; AVX2-NEXT: vpextrd $3, %xmm0, %esi -; AVX2-NEXT: addl %ecx, %esi -; AVX2-NEXT: cmovbl %eax, %esi -; AVX2-NEXT: vpinsrd $3, %esi, %xmm2, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, (%rdx) ; AVX2-NEXT: retq ; @@ -10050,31 +374,8 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,1,255,255,2,3,255,255,4,5,255,255,6,7] -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpextrd $1, %xmm1, %eax -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpextrd $1, %xmm0, %ecx -; AVX512-NEXT: addl %eax, %ecx -; AVX512-NEXT: movl $-1, %eax -; AVX512-NEXT: cmovbl %eax, %ecx -; AVX512-NEXT: vmovd %xmm1, %esi -; AVX512-NEXT: vmovd %xmm0, %edi -; AVX512-NEXT: addl %esi, %edi -; AVX512-NEXT: cmovbl %eax, %edi -; AVX512-NEXT: vmovd %edi, %xmm2 -; AVX512-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 -; AVX512-NEXT: vpextrd $2, %xmm1, %ecx -; AVX512-NEXT: vpextrd $2, 
%xmm0, %esi -; AVX512-NEXT: addl %ecx, %esi -; AVX512-NEXT: cmovbl %eax, %esi -; AVX512-NEXT: vpinsrd $2, %esi, %xmm2, %xmm2 -; AVX512-NEXT: vpextrd $3, %xmm1, %ecx -; AVX512-NEXT: vpextrd $3, %xmm0, %esi -; AVX512-NEXT: addl %ecx, %esi -; AVX512-NEXT: cmovbl %eax, %esi -; AVX512-NEXT: vpinsrd $3, %esi, %xmm2, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512-NEXT: vpmovdw %xmm0, (%rdx) ; AVX512-NEXT: retq %x = load <4 x i16>, <4 x i16>* %px @@ -10085,124 +386,27 @@ } define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind { -; SSE2-LABEL: v2i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7] -; SSE2-NEXT: psllq $48, %xmm1 -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: psllq $48, %xmm0 -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: addq %rax, %rcx -; SSE2-NEXT: movq $-1, %rax -; SSE2-NEXT: cmovbq %rax, %rcx -; SSE2-NEXT: movq %rcx, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm1, %rcx -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rsi -; SSE2-NEXT: addq %rcx, %rsi -; SSE2-NEXT: cmovbq %rax, %rsi -; SSE2-NEXT: movq %rsi, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE2-NEXT: psrlq $48, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: movd %xmm0, (%rdx) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v2i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,0,1,255,255,255,255,255,255,2,3] -; SSSE3-NEXT: pshufb %xmm2, %xmm1 -; SSSE3-NEXT: movq %xmm1, %rax -; SSSE3-NEXT: pshufb %xmm2, %xmm0 -; SSSE3-NEXT: movq %xmm0, %rcx -; SSSE3-NEXT: addq %rax, %rcx -; SSSE3-NEXT: movq $-1, %rax -; SSSE3-NEXT: cmovbq %rax, %rcx -; SSSE3-NEXT: movq %rcx, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSSE3-NEXT: movq %xmm1, %rcx -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSSE3-NEXT: movq %xmm0, %rsi -; SSSE3-NEXT: addq %rcx, %rsi -; SSSE3-NEXT: cmovbq %rax, %rsi -; SSSE3-NEXT: movq %rsi, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[6,7,14,15,14,15],zero,zero,xmm2[14,15],zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: movd %xmm2, (%rdx) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; SSE41-NEXT: psllq $48, %xmm1 -; SSE41-NEXT: pextrq $1, %xmm1, %rax -; SSE41-NEXT: psllq $48, %xmm0 -; SSE41-NEXT: pextrq $1, %xmm0, %rcx -; SSE41-NEXT: addq %rax, %rcx -; SSE41-NEXT: movq $-1, %rax -; SSE41-NEXT: cmovbq %rax, %rcx -; SSE41-NEXT: movq %rcx, %xmm2 -; SSE41-NEXT: movq %xmm1, %rcx -; SSE41-NEXT: movq %xmm0, %rsi -; SSE41-NEXT: addq %rcx, %rsi -; SSE41-NEXT: cmovbq %rax, %rsi -; SSE41-NEXT: movq %rsi, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE41-NEXT: pshufb 
{{.*#+}} xmm0 = xmm0[6,7,14,15,14,15],zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movd %xmm0, (%rdx) -; SSE41-NEXT: retq +; SSE-LABEL: v2i16: +; SSE: # %bb.0: +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: paddusw %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, (%rdx) +; SSE-NEXT: retq ; ; AVX1-LABEL: v2i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 -; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rcx -; AVX1-NEXT: addq %rax, %rcx -; AVX1-NEXT: movq $-1, %rax -; AVX1-NEXT: cmovbq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm2 -; AVX1-NEXT: vmovq %xmm1, %rcx -; AVX1-NEXT: vmovq %xmm0, %rsi -; AVX1-NEXT: addq %rcx, %rsi -; AVX1-NEXT: cmovbq %rax, %rsi -; AVX1-NEXT: vmovq %rsi, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,14,15],zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, (%rdx) ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX2-NEXT: vpextrq $1, %xmm0, %rcx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: movq $-1, %rax -; AVX2-NEXT: cmovbq %rax, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm2 -; AVX2-NEXT: vmovq %xmm1, %rcx -; AVX2-NEXT: vmovq %xmm0, %rsi -; AVX2-NEXT: addq %rcx, %rsi -; AVX2-NEXT: cmovbq %rax, %rsi -; AVX2-NEXT: vmovq %rsi, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,14,15],zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, (%rdx) ; AVX2-NEXT: retq ; @@ -10210,22 +414,8 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,0,1,255,255,255,255,255,255,2,3] -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpextrq $1, %xmm1, %rax -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpextrq $1, %xmm0, %rcx -; AVX512-NEXT: addq %rax, %rcx -; AVX512-NEXT: movq $-1, %rax -; AVX512-NEXT: cmovbq %rax, %rcx -; AVX512-NEXT: vmovq %rcx, %xmm2 -; AVX512-NEXT: vmovq %xmm1, %rcx -; AVX512-NEXT: vmovq %xmm0, %rsi -; AVX512-NEXT: addq %rcx, %rsi -; AVX512-NEXT: cmovbq %rax, %rsi -; AVX512-NEXT: vmovq %rsi, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512-NEXT: vpmovqw %xmm0, (%rdx) ; AVX512-NEXT: retq %x = load <2 x i16>, <2 x i16>* %px @@ -10236,1014 +426,59 @@ } define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> 
%y) nounwind { -; SSE2-LABEL: v12i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %r15 -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r13 -; SSE2-NEXT: pushq %r12 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %dil -; SSE2-NEXT: jb .LBB11_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: .LBB11_2: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb $-1, %al -; SSE2-NEXT: jb .LBB11_4 -; SSE2-NEXT: # %bb.3: -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: .LBB11_4: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB11_6 -; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB11_6: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r10b -; SSE2-NEXT: jb .LBB11_8 -; SSE2-NEXT: # %bb.7: -; SSE2-NEXT: movl %edx, %r10d -; SSE2-NEXT: .LBB11_8: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r11b -; SSE2-NEXT: jb .LBB11_10 -; SSE2-NEXT: # %bb.9: -; SSE2-NEXT: movl %edx, %r11d -; SSE2-NEXT: .LBB11_10: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r12b -; SSE2-NEXT: jb .LBB11_12 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: movl %edx, %r12d -; SSE2-NEXT: .LBB11_12: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r13b -; SSE2-NEXT: jb .LBB11_14 -; SSE2-NEXT: # %bb.13: -; SSE2-NEXT: movl %edx, %r13d -; SSE2-NEXT: .LBB11_14: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r8b -; SSE2-NEXT: jb .LBB11_16 -; SSE2-NEXT: # %bb.15: -; SSE2-NEXT: movl %edx, %r8d -; SSE2-NEXT: .LBB11_16: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r14b -; SSE2-NEXT: jb .LBB11_18 -; SSE2-NEXT: # %bb.17: -; SSE2-NEXT: movl %edx, %r14d -; SSE2-NEXT: .LBB11_18: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r15b -; SSE2-NEXT: jb .LBB11_20 -; SSE2-NEXT: # %bb.19: -; SSE2-NEXT: movl %edx, %r15d -; SSE2-NEXT: .LBB11_20: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r9b -; SSE2-NEXT: jb .LBB11_22 -; SSE2-NEXT: # %bb.21: -; SSE2-NEXT: movl %edx, %r9d -; SSE2-NEXT: .LBB11_22: -; SSE2-NEXT: movzbl %dil, %edi -; SSE2-NEXT: movzbl %al, %esi -; SSE2-NEXT: movzbl %cl, %ebp -; SSE2-NEXT: movzbl %r10b, %edx -; SSE2-NEXT: movzbl %r11b, %ebx -; SSE2-NEXT: movzbl %r12b, %r10d -; SSE2-NEXT: movzbl %r13b, %r11d -; SSE2-NEXT: movzbl %r8b, %r8d -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB11_24 -; SSE2-NEXT: # %bb.23: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB11_24: -; SSE2-NEXT: movd %edi, %xmm2 -; SSE2-NEXT: movd %esi, %xmm3 -; SSE2-NEXT: movd %ebp, %xmm5 -; SSE2-NEXT: movd %edx, %xmm0 -; SSE2-NEXT: movd %ebx, %xmm6 -; SSE2-NEXT: movd %r10d, %xmm4 -; SSE2-NEXT: movd %r11d, %xmm7 -; SSE2-NEXT: movd %r8d, %xmm1 -; SSE2-NEXT: movzbl %r14b, %esi -; SSE2-NEXT: movzbl %r15b, %edx -; SSE2-NEXT: movzbl %r9b, %eax -; SSE2-NEXT: movzbl %cl, 
%edi -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb $-1, %bl -; SSE2-NEXT: jb .LBB11_26 -; SSE2-NEXT: # %bb.25: -; SSE2-NEXT: movl %ecx, %ebx -; SSE2-NEXT: .LBB11_26: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE2-NEXT: movd %esi, %xmm6 -; SSE2-NEXT: movd %edx, %xmm5 -; SSE2-NEXT: movd %eax, %xmm7 -; SSE2-NEXT: movd %edi, %xmm2 -; SSE2-NEXT: movzbl %bl, %eax -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB11_28 -; SSE2-NEXT: # %bb.27: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB11_28: -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB11_30 -; SSE2-NEXT: # %bb.29: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB11_30: -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: movzbl %cl, %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB11_32 -; SSE2-NEXT: # %bb.31: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB11_32: -; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v12i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; 
SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %dil -; SSSE3-NEXT: jb .LBB11_2 -; SSSE3-NEXT: # %bb.1: -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB11_2: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb $-1, %al -; SSSE3-NEXT: jb .LBB11_4 -; SSSE3-NEXT: # %bb.3: -; SSSE3-NEXT: movl %ecx, %eax -; SSSE3-NEXT: .LBB11_4: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB11_6 -; SSSE3-NEXT: # %bb.5: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB11_6: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r10b -; SSSE3-NEXT: jb .LBB11_8 -; SSSE3-NEXT: # %bb.7: -; SSSE3-NEXT: movl %edx, %r10d -; SSSE3-NEXT: .LBB11_8: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r11b -; SSSE3-NEXT: jb .LBB11_10 -; SSSE3-NEXT: # %bb.9: -; SSSE3-NEXT: movl %edx, %r11d -; SSSE3-NEXT: .LBB11_10: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r12b -; SSSE3-NEXT: jb .LBB11_12 -; SSSE3-NEXT: # %bb.11: -; SSSE3-NEXT: movl %edx, %r12d -; SSSE3-NEXT: .LBB11_12: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r13b -; SSSE3-NEXT: jb .LBB11_14 -; SSSE3-NEXT: # %bb.13: -; SSSE3-NEXT: movl %edx, %r13d -; SSSE3-NEXT: .LBB11_14: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r8b -; SSSE3-NEXT: jb .LBB11_16 -; SSSE3-NEXT: # %bb.15: -; SSSE3-NEXT: movl %edx, %r8d -; SSSE3-NEXT: .LBB11_16: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r14b -; SSSE3-NEXT: jb .LBB11_18 -; SSSE3-NEXT: # %bb.17: -; SSSE3-NEXT: movl %edx, %r14d -; SSSE3-NEXT: .LBB11_18: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r15b -; SSSE3-NEXT: jb .LBB11_20 -; SSSE3-NEXT: # %bb.19: -; SSSE3-NEXT: movl %edx, %r15d -; SSSE3-NEXT: .LBB11_20: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r9b -; SSSE3-NEXT: jb .LBB11_22 -; SSSE3-NEXT: # %bb.21: -; SSSE3-NEXT: movl %edx, %r9d -; SSSE3-NEXT: .LBB11_22: -; SSSE3-NEXT: movzbl %dil, %edi -; SSSE3-NEXT: movzbl %al, %esi -; SSSE3-NEXT: movzbl %cl, %ebp -; SSSE3-NEXT: movzbl %r10b, %edx -; SSSE3-NEXT: movzbl %r11b, %ebx -; SSSE3-NEXT: movzbl %r12b, %r10d -; SSSE3-NEXT: movzbl %r13b, %r11d -; SSSE3-NEXT: movzbl %r8b, %r8d -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB11_24 -; SSSE3-NEXT: # %bb.23: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB11_24: -; SSSE3-NEXT: movd %edi, %xmm2 -; SSSE3-NEXT: movd %esi, %xmm3 -; SSSE3-NEXT: movd %ebp, %xmm5 -; SSSE3-NEXT: movd %edx, %xmm0 -; SSSE3-NEXT: movd %ebx, %xmm6 -; SSSE3-NEXT: movd %r10d, %xmm4 -; SSSE3-NEXT: movd %r11d, %xmm7 -; SSSE3-NEXT: movd %r8d, %xmm1 -; SSSE3-NEXT: movzbl %r14b, %esi -; SSSE3-NEXT: movzbl %r15b, %edx -; SSSE3-NEXT: movzbl %r9b, %eax -; SSSE3-NEXT: movzbl %cl, %edi -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: addb 
-{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb $-1, %bl -; SSSE3-NEXT: jb .LBB11_26 -; SSSE3-NEXT: # %bb.25: -; SSSE3-NEXT: movl %ecx, %ebx -; SSSE3-NEXT: .LBB11_26: -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSSE3-NEXT: movd %esi, %xmm6 -; SSSE3-NEXT: movd %edx, %xmm5 -; SSSE3-NEXT: movd %eax, %xmm7 -; SSSE3-NEXT: movd %edi, %xmm2 -; SSSE3-NEXT: movzbl %bl, %eax -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB11_28 -; SSSE3-NEXT: # %bb.27: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB11_28: -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: movzbl %cl, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB11_30 -; SSSE3-NEXT: # %bb.29: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB11_30: -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSSE3-NEXT: movzbl %cl, %ecx -; SSSE3-NEXT: movd %ecx, %xmm4 -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB11_32 -; SSSE3-NEXT: # %bb.31: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB11_32: -; SSSE3-NEXT: movzbl %cl, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v12i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pextrb $1, %xmm1, %eax -; SSE41-NEXT: pextrb $1, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %sil -; 
SSE41-NEXT: movb $-1, %dl -; SSE41-NEXT: jb .LBB11_2 -; SSE41-NEXT: # %bb.1: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB11_2: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: pextrb $0, %xmm0, %edx -; SSE41-NEXT: addb %al, %dl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB11_4 -; SSE41-NEXT: # %bb.3: -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: .LBB11_4: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: movd %eax, %xmm2 -; SSE41-NEXT: pinsrb $1, %ecx, %xmm2 -; SSE41-NEXT: pextrb $2, %xmm1, %eax -; SSE41-NEXT: pextrb $2, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB11_6 -; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB11_6: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $2, %eax, %xmm2 -; SSE41-NEXT: pextrb $3, %xmm1, %eax -; SSE41-NEXT: pextrb $3, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB11_8 -; SSE41-NEXT: # %bb.7: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB11_8: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $3, %eax, %xmm2 -; SSE41-NEXT: pextrb $4, %xmm1, %eax -; SSE41-NEXT: pextrb $4, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB11_10 -; SSE41-NEXT: # %bb.9: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB11_10: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $4, %eax, %xmm2 -; SSE41-NEXT: pextrb $5, %xmm1, %eax -; SSE41-NEXT: pextrb $5, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB11_12 -; SSE41-NEXT: # %bb.11: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB11_12: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $5, %eax, %xmm2 -; SSE41-NEXT: pextrb $6, %xmm1, %eax -; SSE41-NEXT: pextrb $6, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB11_14 -; SSE41-NEXT: # %bb.13: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB11_14: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $6, %eax, %xmm2 -; SSE41-NEXT: pextrb $7, %xmm1, %eax -; SSE41-NEXT: pextrb $7, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB11_16 -; SSE41-NEXT: # %bb.15: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB11_16: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $7, %eax, %xmm2 -; SSE41-NEXT: pextrb $8, %xmm1, %eax -; SSE41-NEXT: pextrb $8, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB11_18 -; SSE41-NEXT: # %bb.17: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB11_18: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $8, %eax, %xmm2 -; SSE41-NEXT: pextrb $9, %xmm1, %eax -; SSE41-NEXT: pextrb $9, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB11_20 -; SSE41-NEXT: # %bb.19: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB11_20: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $9, %eax, %xmm2 -; SSE41-NEXT: pextrb $10, %xmm1, %eax -; SSE41-NEXT: pextrb $10, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB11_22 -; SSE41-NEXT: # %bb.21: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB11_22: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $10, %eax, %xmm2 -; SSE41-NEXT: pextrb $11, %xmm1, %eax -; SSE41-NEXT: pextrb $11, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB11_24 -; SSE41-NEXT: # %bb.23: -; SSE41-NEXT: movl %ecx, %eax -; 
SSE41-NEXT: .LBB11_24: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $11, %eax, %xmm2 -; SSE41-NEXT: pextrb $12, %xmm1, %eax -; SSE41-NEXT: pextrb $12, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB11_26 -; SSE41-NEXT: # %bb.25: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB11_26: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $12, %eax, %xmm2 -; SSE41-NEXT: pextrb $13, %xmm1, %eax -; SSE41-NEXT: pextrb $13, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB11_28 -; SSE41-NEXT: # %bb.27: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB11_28: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $13, %eax, %xmm2 -; SSE41-NEXT: pextrb $14, %xmm1, %eax -; SSE41-NEXT: pextrb $14, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB11_30 -; SSE41-NEXT: # %bb.29: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB11_30: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $14, %eax, %xmm2 -; SSE41-NEXT: pextrb $15, %xmm1, %eax -; SSE41-NEXT: pextrb $15, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: jb .LBB11_32 -; SSE41-NEXT: # %bb.31: -; SSE41-NEXT: movl %ecx, %esi -; SSE41-NEXT: .LBB11_32: -; SSE41-NEXT: movzbl %sil, %eax -; SSE41-NEXT: pinsrb $15, %eax, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: v12i8: +; SSE: # %bb.0: +; SSE-NEXT: paddusb %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: v12i8: ; AVX: # %bb.0: -; AVX-NEXT: vpextrb $1, %xmm1, %eax -; AVX-NEXT: vpextrb $1, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %sil -; AVX-NEXT: movb $-1, %dl -; AVX-NEXT: jb .LBB11_2 -; AVX-NEXT: # %bb.1: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB11_2: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpextrb $0, %xmm1, %eax -; AVX-NEXT: vpextrb $0, %xmm0, %edx -; AVX-NEXT: addb %al, %dl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB11_4 -; AVX-NEXT: # %bb.3: -; AVX-NEXT: movl %edx, %eax -; AVX-NEXT: .LBB11_4: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vmovd %eax, %xmm2 -; AVX-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $2, %xmm1, %eax -; AVX-NEXT: vpextrb $2, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB11_6 -; AVX-NEXT: # %bb.5: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB11_6: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $3, %xmm1, %eax -; AVX-NEXT: vpextrb $3, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB11_8 -; AVX-NEXT: # %bb.7: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB11_8: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $4, %xmm1, %eax -; AVX-NEXT: vpextrb $4, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB11_10 -; AVX-NEXT: # %bb.9: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB11_10: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $5, %xmm1, %eax -; AVX-NEXT: vpextrb $5, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB11_12 -; AVX-NEXT: # %bb.11: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB11_12: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $6, %xmm1, %eax -; AVX-NEXT: vpextrb $6, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB11_14 -; AVX-NEXT: # %bb.13: -; AVX-NEXT: movl %ecx, %eax -; 
AVX-NEXT: .LBB11_14: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $7, %xmm1, %eax -; AVX-NEXT: vpextrb $7, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB11_16 -; AVX-NEXT: # %bb.15: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB11_16: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $8, %xmm1, %eax -; AVX-NEXT: vpextrb $8, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB11_18 -; AVX-NEXT: # %bb.17: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB11_18: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $9, %xmm1, %eax -; AVX-NEXT: vpextrb $9, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB11_20 -; AVX-NEXT: # %bb.19: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB11_20: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $10, %xmm1, %eax -; AVX-NEXT: vpextrb $10, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB11_22 -; AVX-NEXT: # %bb.21: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB11_22: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $11, %xmm1, %eax -; AVX-NEXT: vpextrb $11, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB11_24 -; AVX-NEXT: # %bb.23: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB11_24: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $12, %xmm1, %eax -; AVX-NEXT: vpextrb $12, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB11_26 -; AVX-NEXT: # %bb.25: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB11_26: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $13, %xmm1, %eax -; AVX-NEXT: vpextrb $13, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB11_28 -; AVX-NEXT: # %bb.27: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB11_28: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $14, %xmm1, %eax -; AVX-NEXT: vpextrb $14, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: movb $-1, %al -; AVX-NEXT: jb .LBB11_30 -; AVX-NEXT: # %bb.29: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB11_30: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $15, %xmm1, %eax -; AVX-NEXT: vpextrb $15, %xmm0, %ecx -; AVX-NEXT: addb %al, %cl -; AVX-NEXT: jb .LBB11_32 -; AVX-NEXT: # %bb.31: -; AVX-NEXT: movl %ecx, %esi -; AVX-NEXT: .LBB11_32: -; AVX-NEXT: movzbl %sil, %eax -; AVX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %z = call <12 x i8> @llvm.uadd.sat.v12i8(<12 x i8> %x, <12 x i8> %y) ret <12 x i8> %z } define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind { -; SSE2-LABEL: v12i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm2 -; SSE2-NEXT: movdqa 16(%rdi), %xmm0 -; SSE2-NEXT: movdqa (%rsi), %xmm3 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pextrw $7, %xmm3, %eax -; SSE2-NEXT: pextrw $7, %xmm2, %ecx -; SSE2-NEXT: addw %ax, %cx -; SSE2-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: cmovbl %eax, %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: pextrw $6, %xmm3, %ecx -; SSE2-NEXT: pextrw $6, %xmm2, %esi -; SSE2-NEXT: addw %cx, %si -; 
SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE2-NEXT: pextrw $5, %xmm3, %ecx -; SSE2-NEXT: pextrw $5, %xmm2, %esi -; SSE2-NEXT: addw %cx, %si -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm6 -; SSE2-NEXT: pextrw $4, %xmm3, %ecx -; SSE2-NEXT: pextrw $4, %xmm2, %esi -; SSE2-NEXT: addw %cx, %si -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE2-NEXT: pextrw $3, %xmm3, %ecx -; SSE2-NEXT: pextrw $3, %xmm2, %esi -; SSE2-NEXT: addw %cx, %si -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm5 -; SSE2-NEXT: pextrw $2, %xmm3, %ecx -; SSE2-NEXT: pextrw $2, %xmm2, %esi -; SSE2-NEXT: addw %cx, %si -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm6 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSE2-NEXT: pextrw $1, %xmm3, %ecx -; SSE2-NEXT: pextrw $1, %xmm2, %esi -; SSE2-NEXT: addw %cx, %si -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm5 -; SSE2-NEXT: movd %xmm3, %ecx -; SSE2-NEXT: movd %xmm2, %esi -; SSE2-NEXT: addw %cx, %si -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSE2-NEXT: pextrw $1, %xmm1, %ecx -; SSE2-NEXT: pextrw $1, %xmm0, %esi -; SSE2-NEXT: addw %cx, %si -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %xmm1, %ecx -; SSE2-NEXT: movd %xmm0, %edi -; SSE2-NEXT: addw %cx, %di -; SSE2-NEXT: cmovbl %eax, %edi -; SSE2-NEXT: movd %edi, %xmm3 -; SSE2-NEXT: pinsrw $1, %esi, %xmm3 -; SSE2-NEXT: pextrw $2, %xmm1, %ecx -; SSE2-NEXT: pextrw $2, %xmm0, %esi -; SSE2-NEXT: addw %cx, %si -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: pinsrw $2, %esi, %xmm3 -; SSE2-NEXT: pextrw $3, %xmm1, %ecx -; SSE2-NEXT: pextrw $3, %xmm0, %esi -; SSE2-NEXT: addw %cx, %si -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: pinsrw $3, %esi, %xmm3 -; SSE2-NEXT: movq %xmm3, 16(%rdx) -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: retq +; SSE-LABEL: v12i16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: paddusw (%rsi), %xmm0 +; SSE-NEXT: paddusw 16(%rsi), %xmm1 +; SSE-NEXT: movq %xmm1, 16(%rdx) +; SSE-NEXT: movdqa %xmm0, (%rdx) +; SSE-NEXT: retq ; -; SSSE3-LABEL: v12i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa (%rdi), %xmm2 -; SSSE3-NEXT: movdqa 16(%rdi), %xmm0 -; SSSE3-NEXT: movdqa (%rsi), %xmm3 -; SSSE3-NEXT: movdqa 16(%rsi), %xmm1 -; SSSE3-NEXT: pextrw $7, %xmm3, %eax -; SSSE3-NEXT: pextrw $7, %xmm2, %ecx -; SSSE3-NEXT: addw %ax, %cx -; SSSE3-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSSE3-NEXT: cmovbl %eax, %ecx -; SSSE3-NEXT: movd %ecx, %xmm4 -; SSSE3-NEXT: pextrw $6, %xmm3, %ecx -; SSSE3-NEXT: pextrw $6, %xmm2, %esi -; SSSE3-NEXT: addw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm5 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSSE3-NEXT: pextrw $5, %xmm3, %ecx -; SSSE3-NEXT: pextrw $5, %xmm2, %esi -; SSSE3-NEXT: addw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm6 -; SSSE3-NEXT: pextrw 
$4, %xmm3, %ecx -; SSSE3-NEXT: pextrw $4, %xmm2, %esi -; SSSE3-NEXT: addw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSSE3-NEXT: pextrw $3, %xmm3, %ecx -; SSSE3-NEXT: pextrw $3, %xmm2, %esi -; SSSE3-NEXT: addw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm5 -; SSSE3-NEXT: pextrw $2, %xmm3, %ecx -; SSSE3-NEXT: pextrw $2, %xmm2, %esi -; SSSE3-NEXT: addw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm6 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSSE3-NEXT: pextrw $1, %xmm3, %ecx -; SSSE3-NEXT: pextrw $1, %xmm2, %esi -; SSSE3-NEXT: addw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm5 -; SSSE3-NEXT: movd %xmm3, %ecx -; SSSE3-NEXT: movd %xmm2, %esi -; SSSE3-NEXT: addw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSSE3-NEXT: pextrw $1, %xmm1, %ecx -; SSSE3-NEXT: pextrw $1, %xmm0, %esi -; SSSE3-NEXT: addw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %xmm1, %ecx -; SSSE3-NEXT: movd %xmm0, %edi -; SSSE3-NEXT: addw %cx, %di -; SSSE3-NEXT: cmovbl %eax, %edi -; SSSE3-NEXT: movd %edi, %xmm3 -; SSSE3-NEXT: pinsrw $1, %esi, %xmm3 -; SSSE3-NEXT: pextrw $2, %xmm1, %ecx -; SSSE3-NEXT: pextrw $2, %xmm0, %esi -; SSSE3-NEXT: addw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: pinsrw $2, %esi, %xmm3 -; SSSE3-NEXT: pextrw $3, %xmm1, %ecx -; SSSE3-NEXT: pextrw $3, %xmm0, %esi -; SSSE3-NEXT: addw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: pinsrw $3, %esi, %xmm3 -; SSSE3-NEXT: movq %xmm3, 16(%rdx) -; SSSE3-NEXT: movdqa %xmm2, (%rdx) -; SSSE3-NEXT: retq +; AVX1-LABEL: v12i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vpaddusw (%rsi), %xmm0, %xmm0 +; AVX1-NEXT: vpaddusw 16(%rsi), %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm1, 16(%rdx) +; AVX1-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-NEXT: retq ; -; SSE41-LABEL: v12i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm3 -; SSE41-NEXT: movdqa 16(%rdi), %xmm0 -; SSE41-NEXT: movdqa (%rsi), %xmm4 -; SSE41-NEXT: movdqa 16(%rsi), %xmm1 -; SSE41-NEXT: pextrw $1, %xmm4, %eax -; SSE41-NEXT: pextrw $1, %xmm3, %ecx -; SSE41-NEXT: addw %ax, %cx -; SSE41-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSE41-NEXT: cmovbl %eax, %ecx -; SSE41-NEXT: movd %xmm4, %esi -; SSE41-NEXT: movd %xmm3, %edi -; SSE41-NEXT: addw %si, %di -; SSE41-NEXT: cmovbl %eax, %edi -; SSE41-NEXT: movd %edi, %xmm2 -; SSE41-NEXT: pinsrw $1, %ecx, %xmm2 -; SSE41-NEXT: pextrw $2, %xmm4, %ecx -; SSE41-NEXT: pextrw $2, %xmm3, %esi -; SSE41-NEXT: addw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: pinsrw $2, %esi, %xmm2 -; SSE41-NEXT: pextrw $3, %xmm4, %ecx -; SSE41-NEXT: pextrw $3, %xmm3, %esi -; SSE41-NEXT: addw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: pinsrw $3, %esi, %xmm2 -; SSE41-NEXT: pextrw $4, %xmm4, %ecx -; SSE41-NEXT: pextrw $4, %xmm3, %esi -; SSE41-NEXT: addw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: pinsrw $4, %esi, %xmm2 -; SSE41-NEXT: pextrw $5, %xmm4, %ecx -; 
SSE41-NEXT: pextrw $5, %xmm3, %esi -; SSE41-NEXT: addw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: pinsrw $5, %esi, %xmm2 -; SSE41-NEXT: pextrw $6, %xmm4, %ecx -; SSE41-NEXT: pextrw $6, %xmm3, %esi -; SSE41-NEXT: addw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: pinsrw $6, %esi, %xmm2 -; SSE41-NEXT: pextrw $7, %xmm4, %ecx -; SSE41-NEXT: pextrw $7, %xmm3, %esi -; SSE41-NEXT: addw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: pinsrw $7, %esi, %xmm2 -; SSE41-NEXT: pextrw $1, %xmm1, %ecx -; SSE41-NEXT: pextrw $1, %xmm0, %esi -; SSE41-NEXT: addw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: movd %xmm1, %ecx -; SSE41-NEXT: movd %xmm0, %edi -; SSE41-NEXT: addw %cx, %di -; SSE41-NEXT: cmovbl %eax, %edi -; SSE41-NEXT: movd %edi, %xmm3 -; SSE41-NEXT: pinsrw $1, %esi, %xmm3 -; SSE41-NEXT: pextrw $2, %xmm1, %ecx -; SSE41-NEXT: pextrw $2, %xmm0, %esi -; SSE41-NEXT: addw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: pinsrw $2, %esi, %xmm3 -; SSE41-NEXT: pextrw $3, %xmm1, %ecx -; SSE41-NEXT: pextrw $3, %xmm0, %esi -; SSE41-NEXT: addw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: pinsrw $3, %esi, %xmm3 -; SSE41-NEXT: movq %xmm3, 16(%rdx) -; SSE41-NEXT: movdqa %xmm2, (%rdx) -; SSE41-NEXT: retq +; AVX2-LABEL: v12i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddusw (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovq %xmm1, 16(%rdx) +; AVX2-NEXT: vmovdqa %xmm0, (%rdx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; -; AVX-LABEL: v12i16: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: pushq %rbx -; AVX-NEXT: vmovdqa (%rsi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rsi), %xmm2 -; AVX-NEXT: vpextrw $1, %xmm2, %eax -; AVX-NEXT: vmovdqa (%rdi), %xmm1 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX-NEXT: vpextrw $1, %xmm3, %ecx -; AVX-NEXT: addw %ax, %cx -; AVX-NEXT: movl $65535, %r8d # imm = 0xFFFF -; AVX-NEXT: cmovbl %r8d, %ecx -; AVX-NEXT: vmovd %xmm2, %eax -; AVX-NEXT: vmovd %xmm3, %esi -; AVX-NEXT: addw %ax, %si -; AVX-NEXT: cmovbl %r8d, %esi -; AVX-NEXT: vmovd %esi, %xmm4 -; AVX-NEXT: vpinsrw $1, %ecx, %xmm4, %xmm4 -; AVX-NEXT: vpextrw $2, %xmm2, %eax -; AVX-NEXT: vpextrw $2, %xmm3, %ecx -; AVX-NEXT: addw %ax, %cx -; AVX-NEXT: cmovbl %r8d, %ecx -; AVX-NEXT: vpinsrw $2, %ecx, %xmm4, %xmm4 -; AVX-NEXT: vpextrw $3, %xmm2, %eax -; AVX-NEXT: vpextrw $3, %xmm3, %ecx -; AVX-NEXT: addw %ax, %cx -; AVX-NEXT: cmovbl %r8d, %ecx -; AVX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm4 -; AVX-NEXT: vpextrw $4, %xmm2, %eax -; AVX-NEXT: vpextrw $4, %xmm3, %ecx -; AVX-NEXT: addw %ax, %cx -; AVX-NEXT: cmovbl %r8d, %ecx -; AVX-NEXT: vpinsrw $4, %ecx, %xmm4, %xmm4 -; AVX-NEXT: vpextrw $5, %xmm2, %eax -; AVX-NEXT: vpextrw $5, %xmm3, %ecx -; AVX-NEXT: addw %ax, %cx -; AVX-NEXT: cmovbl %r8d, %ecx -; AVX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 -; AVX-NEXT: vpextrw $6, %xmm2, %eax -; AVX-NEXT: vpextrw $6, %xmm3, %ecx -; AVX-NEXT: addw %ax, %cx -; AVX-NEXT: cmovbl %r8d, %ecx -; AVX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4 -; AVX-NEXT: vpextrw $7, %xmm2, %eax -; AVX-NEXT: vpextrw $7, %xmm3, %ecx -; AVX-NEXT: addw %ax, %cx -; AVX-NEXT: cmovbl %r8d, %ecx -; AVX-NEXT: vpinsrw $7, %ecx, %xmm4, %xmm2 -; AVX-NEXT: vpextrw $7, %xmm0, %eax -; AVX-NEXT: vpextrw $7, %xmm1, %r9d -; AVX-NEXT: addw %ax, %r9w -; AVX-NEXT: cmovbl %r8d, %r9d -; AVX-NEXT: vpextrw $6, %xmm0, %eax -; AVX-NEXT: vpextrw $6, %xmm1, %r10d -; AVX-NEXT: addw %ax, %r10w -; AVX-NEXT: cmovbl %r8d, %r10d -; AVX-NEXT: vpextrw $5, %xmm0, %eax -; AVX-NEXT: vpextrw $5, %xmm1, %edi -; 
AVX-NEXT: addw %ax, %di -; AVX-NEXT: cmovbl %r8d, %edi -; AVX-NEXT: vpextrw $4, %xmm0, %ecx -; AVX-NEXT: vpextrw $4, %xmm1, %eax -; AVX-NEXT: addw %cx, %ax -; AVX-NEXT: cmovbl %r8d, %eax -; AVX-NEXT: vpextrw $3, %xmm0, %esi -; AVX-NEXT: vpextrw $3, %xmm1, %ecx -; AVX-NEXT: addw %si, %cx -; AVX-NEXT: cmovbl %r8d, %ecx -; AVX-NEXT: vpextrw $2, %xmm0, %r11d -; AVX-NEXT: vpextrw $2, %xmm1, %esi -; AVX-NEXT: addw %r11w, %si -; AVX-NEXT: cmovbl %r8d, %esi -; AVX-NEXT: vpextrw $1, %xmm0, %r11d -; AVX-NEXT: vpextrw $1, %xmm1, %ebx -; AVX-NEXT: addw %r11w, %bx -; AVX-NEXT: cmovbl %r8d, %ebx -; AVX-NEXT: vmovd %xmm0, %r11d -; AVX-NEXT: vmovd %xmm1, %ebp -; AVX-NEXT: addw %r11w, %bp -; AVX-NEXT: cmovbl %r8d, %ebp -; AVX-NEXT: vmovq %xmm2, 16(%rdx) -; AVX-NEXT: vmovd %ebp, %xmm0 -; AVX-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $2, %esi, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $6, %r10d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: popq %rbx -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; AVX512-LABEL: v12i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vpaddusw (%rsi), %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmovq %xmm1, 16(%rdx) +; AVX512-NEXT: vmovdqa %xmm0, (%rdx) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %x = load <12 x i16>, <12 x i16>* %px %y = load <12 x i16>, <12 x i16>* %py %z = call <12 x i16> @llvm.uadd.sat.v12i16(<12 x i16> %x, <12 x i16> %y) @@ -11312,713 +547,26 @@ ; Promotion define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { -; SSE2-LABEL: v16i4: -; SSE2: # %bb.0: -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %r15 -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r13 -; SSE2-NEXT: pushq %r12 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: psllw $4, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: psllw $4, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %dil -; SSE2-NEXT: jb .LBB15_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: .LBB15_2: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb $-1, %al -; SSE2-NEXT: jb .LBB15_4 -; SSE2-NEXT: # %bb.3: -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: .LBB15_4: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB15_6 -; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB15_6: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r10b -; SSE2-NEXT: jb .LBB15_8 -; SSE2-NEXT: # %bb.7: -; SSE2-NEXT: movl %edx, %r10d -; SSE2-NEXT: .LBB15_8: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r11b -; SSE2-NEXT: jb .LBB15_10 -; SSE2-NEXT: # %bb.9: -; SSE2-NEXT: movl %edx, %r11d -; SSE2-NEXT: .LBB15_10: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r12b -; SSE2-NEXT: jb .LBB15_12 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: movl %edx, %r12d -; SSE2-NEXT: .LBB15_12: -; 
SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r13b -; SSE2-NEXT: jb .LBB15_14 -; SSE2-NEXT: # %bb.13: -; SSE2-NEXT: movl %edx, %r13d -; SSE2-NEXT: .LBB15_14: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r8b -; SSE2-NEXT: jb .LBB15_16 -; SSE2-NEXT: # %bb.15: -; SSE2-NEXT: movl %edx, %r8d -; SSE2-NEXT: .LBB15_16: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r14b -; SSE2-NEXT: jb .LBB15_18 -; SSE2-NEXT: # %bb.17: -; SSE2-NEXT: movl %edx, %r14d -; SSE2-NEXT: .LBB15_18: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r15b -; SSE2-NEXT: jb .LBB15_20 -; SSE2-NEXT: # %bb.19: -; SSE2-NEXT: movl %edx, %r15d -; SSE2-NEXT: .LBB15_20: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r9b -; SSE2-NEXT: jb .LBB15_22 -; SSE2-NEXT: # %bb.21: -; SSE2-NEXT: movl %edx, %r9d -; SSE2-NEXT: .LBB15_22: -; SSE2-NEXT: movzbl %dil, %edi -; SSE2-NEXT: movzbl %al, %esi -; SSE2-NEXT: movzbl %cl, %ebp -; SSE2-NEXT: movzbl %r10b, %edx -; SSE2-NEXT: movzbl %r11b, %ebx -; SSE2-NEXT: movzbl %r12b, %r10d -; SSE2-NEXT: movzbl %r13b, %r11d -; SSE2-NEXT: movzbl %r8b, %r8d -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB15_24 -; SSE2-NEXT: # %bb.23: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB15_24: -; SSE2-NEXT: movd %edi, %xmm2 -; SSE2-NEXT: movd %esi, %xmm3 -; SSE2-NEXT: movd %ebp, %xmm5 -; SSE2-NEXT: movd %edx, %xmm0 -; SSE2-NEXT: movd %ebx, %xmm6 -; SSE2-NEXT: movd %r10d, %xmm4 -; SSE2-NEXT: movd %r11d, %xmm7 -; SSE2-NEXT: movd %r8d, %xmm1 -; SSE2-NEXT: movzbl %r14b, %esi -; SSE2-NEXT: movzbl %r15b, %edx -; SSE2-NEXT: movzbl %r9b, %eax -; SSE2-NEXT: movzbl %cl, %edi -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb $-1, %bl -; SSE2-NEXT: jb .LBB15_26 -; SSE2-NEXT: # %bb.25: -; SSE2-NEXT: movl %ecx, %ebx -; SSE2-NEXT: .LBB15_26: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE2-NEXT: movd %esi, %xmm6 -; SSE2-NEXT: movd %edx, %xmm5 -; SSE2-NEXT: movd %eax, %xmm7 -; SSE2-NEXT: movd %edi, %xmm2 -; SSE2-NEXT: movzbl %bl, %eax -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB15_28 -; SSE2-NEXT: # %bb.27: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB15_28: -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = 
xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB15_30 -; SSE2-NEXT: # %bb.29: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB15_30: -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: movzbl %cl, %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB15_32 -; SSE2-NEXT: # %bb.31: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB15_32: -; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v16i4: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: psllw $4, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: psllw $4, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %dil -; SSSE3-NEXT: jb .LBB15_2 -; SSSE3-NEXT: # %bb.1: -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB15_2: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb $-1, %al -; SSSE3-NEXT: jb .LBB15_4 -; SSSE3-NEXT: # %bb.3: -; SSSE3-NEXT: movl %ecx, %eax -; SSSE3-NEXT: .LBB15_4: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB15_6 -; SSSE3-NEXT: # %bb.5: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB15_6: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r10b -; SSSE3-NEXT: jb .LBB15_8 -; SSSE3-NEXT: # %bb.7: -; SSSE3-NEXT: movl %edx, %r10d -; SSSE3-NEXT: .LBB15_8: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r11b -; SSSE3-NEXT: jb .LBB15_10 -; SSSE3-NEXT: # %bb.9: -; SSSE3-NEXT: movl %edx, %r11d -; 
SSSE3-NEXT: .LBB15_10: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r12b -; SSSE3-NEXT: jb .LBB15_12 -; SSSE3-NEXT: # %bb.11: -; SSSE3-NEXT: movl %edx, %r12d -; SSSE3-NEXT: .LBB15_12: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r13b -; SSSE3-NEXT: jb .LBB15_14 -; SSSE3-NEXT: # %bb.13: -; SSSE3-NEXT: movl %edx, %r13d -; SSSE3-NEXT: .LBB15_14: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r8b -; SSSE3-NEXT: jb .LBB15_16 -; SSSE3-NEXT: # %bb.15: -; SSSE3-NEXT: movl %edx, %r8d -; SSSE3-NEXT: .LBB15_16: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r14b -; SSSE3-NEXT: jb .LBB15_18 -; SSSE3-NEXT: # %bb.17: -; SSSE3-NEXT: movl %edx, %r14d -; SSSE3-NEXT: .LBB15_18: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r15b -; SSSE3-NEXT: jb .LBB15_20 -; SSSE3-NEXT: # %bb.19: -; SSSE3-NEXT: movl %edx, %r15d -; SSSE3-NEXT: .LBB15_20: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r9b -; SSSE3-NEXT: jb .LBB15_22 -; SSSE3-NEXT: # %bb.21: -; SSSE3-NEXT: movl %edx, %r9d -; SSSE3-NEXT: .LBB15_22: -; SSSE3-NEXT: movzbl %dil, %edi -; SSSE3-NEXT: movzbl %al, %esi -; SSSE3-NEXT: movzbl %cl, %ebp -; SSSE3-NEXT: movzbl %r10b, %edx -; SSSE3-NEXT: movzbl %r11b, %ebx -; SSSE3-NEXT: movzbl %r12b, %r10d -; SSSE3-NEXT: movzbl %r13b, %r11d -; SSSE3-NEXT: movzbl %r8b, %r8d -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB15_24 -; SSSE3-NEXT: # %bb.23: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB15_24: -; SSSE3-NEXT: movd %edi, %xmm2 -; SSSE3-NEXT: movd %esi, %xmm3 -; SSSE3-NEXT: movd %ebp, %xmm5 -; SSSE3-NEXT: movd %edx, %xmm0 -; SSSE3-NEXT: movd %ebx, %xmm6 -; SSSE3-NEXT: movd %r10d, %xmm4 -; SSSE3-NEXT: movd %r11d, %xmm7 -; SSSE3-NEXT: movd %r8d, %xmm1 -; SSSE3-NEXT: movzbl %r14b, %esi -; SSSE3-NEXT: movzbl %r15b, %edx -; SSSE3-NEXT: movzbl %r9b, %eax -; SSSE3-NEXT: movzbl %cl, %edi -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb $-1, %bl -; SSSE3-NEXT: jb .LBB15_26 -; SSSE3-NEXT: # %bb.25: -; SSSE3-NEXT: movl %ecx, %ebx -; SSSE3-NEXT: .LBB15_26: -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSSE3-NEXT: movd %esi, %xmm6 -; SSSE3-NEXT: movd %edx, %xmm5 -; SSSE3-NEXT: movd %eax, %xmm7 -; SSSE3-NEXT: movd %edi, %xmm2 -; SSSE3-NEXT: movzbl %bl, %eax -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB15_28 -; SSSE3-NEXT: # %bb.27: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: 
.LBB15_28: -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: movzbl %cl, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB15_30 -; SSSE3-NEXT: # %bb.29: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB15_30: -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSSE3-NEXT: movzbl %cl, %ecx -; SSSE3-NEXT: movd %ecx, %xmm4 -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB15_32 -; SSSE3-NEXT: # %bb.31: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB15_32: -; SSSE3-NEXT: movzbl %cl, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v16i4: -; SSE41: # %bb.0: -; SSE41-NEXT: psllw $4, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pextrb $1, %xmm1, %eax -; SSE41-NEXT: psllw $4, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pextrb $1, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %sil -; SSE41-NEXT: movb $-1, %dl -; SSE41-NEXT: jb .LBB15_2 -; SSE41-NEXT: # %bb.1: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB15_2: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: pextrb $0, %xmm0, %edx -; SSE41-NEXT: addb %al, %dl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB15_4 -; SSE41-NEXT: # %bb.3: -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: .LBB15_4: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: movd %eax, %xmm2 -; SSE41-NEXT: pinsrb $1, %ecx, %xmm2 -; SSE41-NEXT: pextrb $2, %xmm1, %eax -; SSE41-NEXT: pextrb $2, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB15_6 -; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB15_6: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $2, %eax, %xmm2 -; SSE41-NEXT: pextrb $3, %xmm1, %eax -; SSE41-NEXT: pextrb $3, %xmm0, %ecx -; SSE41-NEXT: 
addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB15_8 -; SSE41-NEXT: # %bb.7: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB15_8: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $3, %eax, %xmm2 -; SSE41-NEXT: pextrb $4, %xmm1, %eax -; SSE41-NEXT: pextrb $4, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB15_10 -; SSE41-NEXT: # %bb.9: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB15_10: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $4, %eax, %xmm2 -; SSE41-NEXT: pextrb $5, %xmm1, %eax -; SSE41-NEXT: pextrb $5, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB15_12 -; SSE41-NEXT: # %bb.11: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB15_12: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $5, %eax, %xmm2 -; SSE41-NEXT: pextrb $6, %xmm1, %eax -; SSE41-NEXT: pextrb $6, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB15_14 -; SSE41-NEXT: # %bb.13: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB15_14: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $6, %eax, %xmm2 -; SSE41-NEXT: pextrb $7, %xmm1, %eax -; SSE41-NEXT: pextrb $7, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB15_16 -; SSE41-NEXT: # %bb.15: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB15_16: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $7, %eax, %xmm2 -; SSE41-NEXT: pextrb $8, %xmm1, %eax -; SSE41-NEXT: pextrb $8, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB15_18 -; SSE41-NEXT: # %bb.17: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB15_18: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $8, %eax, %xmm2 -; SSE41-NEXT: pextrb $9, %xmm1, %eax -; SSE41-NEXT: pextrb $9, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB15_20 -; SSE41-NEXT: # %bb.19: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB15_20: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $9, %eax, %xmm2 -; SSE41-NEXT: pextrb $10, %xmm1, %eax -; SSE41-NEXT: pextrb $10, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB15_22 -; SSE41-NEXT: # %bb.21: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB15_22: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $10, %eax, %xmm2 -; SSE41-NEXT: pextrb $11, %xmm1, %eax -; SSE41-NEXT: pextrb $11, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB15_24 -; SSE41-NEXT: # %bb.23: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB15_24: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $11, %eax, %xmm2 -; SSE41-NEXT: pextrb $12, %xmm1, %eax -; SSE41-NEXT: pextrb $12, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB15_26 -; SSE41-NEXT: # %bb.25: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB15_26: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $12, %eax, %xmm2 -; SSE41-NEXT: pextrb $13, %xmm1, %eax -; SSE41-NEXT: pextrb $13, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB15_28 -; SSE41-NEXT: # %bb.27: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB15_28: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $13, %eax, %xmm2 -; SSE41-NEXT: pextrb $14, %xmm1, %eax -; SSE41-NEXT: pextrb $14, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB15_30 -; SSE41-NEXT: # 
%bb.29:
-; SSE41-NEXT: movl %ecx, %eax
-; SSE41-NEXT: .LBB15_30:
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $14, %eax, %xmm2
-; SSE41-NEXT: pextrb $15, %xmm1, %eax
-; SSE41-NEXT: pextrb $15, %xmm0, %ecx
-; SSE41-NEXT: addb %al, %cl
-; SSE41-NEXT: jb .LBB15_32
-; SSE41-NEXT: # %bb.31:
-; SSE41-NEXT: movl %ecx, %esi
-; SSE41-NEXT: .LBB15_32:
-; SSE41-NEXT: movzbl %sil, %eax
-; SSE41-NEXT: pinsrb $15, %eax, %xmm2
-; SSE41-NEXT: psrlw $4, %xmm2
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: v16i4:
+; SSE: # %bb.0:
+; SSE-NEXT: psllw $4, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: psllw $4, %xmm0
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: paddusb %xmm1, %xmm0
+; SSE-NEXT: psrlw $4, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: v16i4:
; AVX: # %bb.0:
; AVX-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $1, %xmm1, %eax
; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $1, %xmm0, %ecx
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movb $-1, %sil
-; AVX-NEXT: movb $-1, %dl
-; AVX-NEXT: jb .LBB15_2
-; AVX-NEXT: # %bb.1:
-; AVX-NEXT: movl %ecx, %edx
-; AVX-NEXT: .LBB15_2:
-; AVX-NEXT: movzbl %dl, %ecx
-; AVX-NEXT: vpextrb $0, %xmm1, %eax
-; AVX-NEXT: vpextrb $0, %xmm0, %edx
-; AVX-NEXT: addb %al, %dl
-; AVX-NEXT: movb $-1, %al
-; AVX-NEXT: jb .LBB15_4
-; AVX-NEXT: # %bb.3:
-; AVX-NEXT: movl %edx, %eax
-; AVX-NEXT: .LBB15_4:
-; AVX-NEXT: movzbl %al, %eax
-; AVX-NEXT: vmovd %eax, %xmm2
-; AVX-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2
-; AVX-NEXT: vpextrb $2, %xmm1, %eax
-; AVX-NEXT: vpextrb $2, %xmm0, %ecx
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movb $-1, %al
-; AVX-NEXT: jb .LBB15_6
-; AVX-NEXT: # %bb.5:
-; AVX-NEXT: movl %ecx, %eax
-; AVX-NEXT: .LBB15_6:
-; AVX-NEXT: movzbl %al, %eax
-; AVX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX-NEXT: vpextrb $3, %xmm1, %eax
-; AVX-NEXT: vpextrb $3, %xmm0, %ecx
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movb $-1, %al
-; AVX-NEXT: jb .LBB15_8
-; AVX-NEXT: # %bb.7:
-; AVX-NEXT: movl %ecx, %eax
-; AVX-NEXT: .LBB15_8:
-; AVX-NEXT: movzbl %al, %eax
-; AVX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX-NEXT: vpextrb $4, %xmm1, %eax
-; AVX-NEXT: vpextrb $4, %xmm0, %ecx
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movb $-1, %al
-; AVX-NEXT: jb .LBB15_10
-; AVX-NEXT: # %bb.9:
-; AVX-NEXT: movl %ecx, %eax
-; AVX-NEXT: .LBB15_10:
-; AVX-NEXT: movzbl %al, %eax
-; AVX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX-NEXT: vpextrb $5, %xmm1, %eax
-; AVX-NEXT: vpextrb $5, %xmm0, %ecx
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movb $-1, %al
-; AVX-NEXT: jb .LBB15_12
-; AVX-NEXT: # %bb.11:
-; AVX-NEXT: movl %ecx, %eax
-; AVX-NEXT: .LBB15_12:
-; AVX-NEXT: movzbl %al, %eax
-; AVX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX-NEXT: vpextrb $6, %xmm1, %eax
-; AVX-NEXT: vpextrb $6, %xmm0, %ecx
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movb $-1, %al
-; AVX-NEXT: jb .LBB15_14
-; AVX-NEXT: # %bb.13:
-; AVX-NEXT: movl %ecx, %eax
-; AVX-NEXT: .LBB15_14:
-; AVX-NEXT: movzbl %al, %eax
-; AVX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX-NEXT: vpextrb $7, %xmm1, %eax
-; AVX-NEXT: vpextrb $7, %xmm0, %ecx
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movb $-1, %al
-; AVX-NEXT: jb .LBB15_16
-; AVX-NEXT: # %bb.15:
-; AVX-NEXT: movl %ecx, %eax
-; AVX-NEXT: .LBB15_16:
-; AVX-NEXT: movzbl %al, %eax
-; AVX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX-NEXT: vpextrb $8, %xmm1, %eax
-; AVX-NEXT: vpextrb $8, %xmm0, %ecx
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movb $-1, %al
-; AVX-NEXT: jb .LBB15_18
-; AVX-NEXT: # %bb.17:
-; AVX-NEXT: movl %ecx, %eax
-; AVX-NEXT: .LBB15_18:
-; AVX-NEXT: movzbl %al, %eax
-; AVX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX-NEXT: vpextrb $9, %xmm1, %eax
-; AVX-NEXT: vpextrb $9, %xmm0, %ecx
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movb $-1, %al
-; AVX-NEXT: jb .LBB15_20
-; AVX-NEXT: # %bb.19:
-; AVX-NEXT: movl %ecx, %eax
-; AVX-NEXT: .LBB15_20:
-; AVX-NEXT: movzbl %al, %eax
-; AVX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX-NEXT: vpextrb $10, %xmm1, %eax
-; AVX-NEXT: vpextrb $10, %xmm0, %ecx
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movb $-1, %al
-; AVX-NEXT: jb .LBB15_22
-; AVX-NEXT: # %bb.21:
-; AVX-NEXT: movl %ecx, %eax
-; AVX-NEXT: .LBB15_22:
-; AVX-NEXT: movzbl %al, %eax
-; AVX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX-NEXT: vpextrb $11, %xmm1, %eax
-; AVX-NEXT: vpextrb $11, %xmm0, %ecx
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movb $-1, %al
-; AVX-NEXT: jb .LBB15_24
-; AVX-NEXT: # %bb.23:
-; AVX-NEXT: movl %ecx, %eax
-; AVX-NEXT: .LBB15_24:
-; AVX-NEXT: movzbl %al, %eax
-; AVX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX-NEXT: vpextrb $12, %xmm1, %eax
-; AVX-NEXT: vpextrb $12, %xmm0, %ecx
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movb $-1, %al
-; AVX-NEXT: jb .LBB15_26
-; AVX-NEXT: # %bb.25:
-; AVX-NEXT: movl %ecx, %eax
-; AVX-NEXT: .LBB15_26:
-; AVX-NEXT: movzbl %al, %eax
-; AVX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX-NEXT: vpextrb $13, %xmm1, %eax
-; AVX-NEXT: vpextrb $13, %xmm0, %ecx
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movb $-1, %al
-; AVX-NEXT: jb .LBB15_28
-; AVX-NEXT: # %bb.27:
-; AVX-NEXT: movl %ecx, %eax
-; AVX-NEXT: .LBB15_28:
-; AVX-NEXT: movzbl %al, %eax
-; AVX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX-NEXT: vpextrb $14, %xmm1, %eax
-; AVX-NEXT: vpextrb $14, %xmm0, %ecx
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movb $-1, %al
-; AVX-NEXT: jb .LBB15_30
-; AVX-NEXT: # %bb.29:
-; AVX-NEXT: movl %ecx, %eax
-; AVX-NEXT: .LBB15_30:
-; AVX-NEXT: movzbl %al, %eax
-; AVX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX-NEXT: vpextrb $15, %xmm1, %eax
-; AVX-NEXT: vpextrb $15, %xmm0, %ecx
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: jb .LBB15_32
-; AVX-NEXT: # %bb.31:
-; AVX-NEXT: movl %ecx, %esi
-; AVX-NEXT: .LBB15_32:
-; AVX-NEXT: movzbl %sil, %eax
-; AVX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0
+; AVX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
@@ -12027,713 +575,26 @@
}
define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind {
-; SSE2-LABEL: v16i1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pushq %rbp
-; SSE2-NEXT: pushq %r15
-; SSE2-NEXT: pushq %r14
-; SSE2-NEXT: pushq %r13
-; SSE2-NEXT: pushq %r12
-; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: psllw $7, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: psllw $7, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: movb $-1, %dil
-; SSE2-NEXT: jb .LBB16_2
-; SSE2-NEXT: # %bb.1:
-; SSE2-NEXT: movl %eax, %edi
-; SSE2-NEXT: .LBB16_2:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: addb
-{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb $-1, %al -; SSE2-NEXT: jb .LBB16_4 -; SSE2-NEXT: # %bb.3: -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: .LBB16_4: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB16_6 -; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB16_6: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r10b -; SSE2-NEXT: jb .LBB16_8 -; SSE2-NEXT: # %bb.7: -; SSE2-NEXT: movl %edx, %r10d -; SSE2-NEXT: .LBB16_8: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r11b -; SSE2-NEXT: jb .LBB16_10 -; SSE2-NEXT: # %bb.9: -; SSE2-NEXT: movl %edx, %r11d -; SSE2-NEXT: .LBB16_10: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r12b -; SSE2-NEXT: jb .LBB16_12 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: movl %edx, %r12d -; SSE2-NEXT: .LBB16_12: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r13b -; SSE2-NEXT: jb .LBB16_14 -; SSE2-NEXT: # %bb.13: -; SSE2-NEXT: movl %edx, %r13d -; SSE2-NEXT: .LBB16_14: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r8b -; SSE2-NEXT: jb .LBB16_16 -; SSE2-NEXT: # %bb.15: -; SSE2-NEXT: movl %edx, %r8d -; SSE2-NEXT: .LBB16_16: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r14b -; SSE2-NEXT: jb .LBB16_18 -; SSE2-NEXT: # %bb.17: -; SSE2-NEXT: movl %edx, %r14d -; SSE2-NEXT: .LBB16_18: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r15b -; SSE2-NEXT: jb .LBB16_20 -; SSE2-NEXT: # %bb.19: -; SSE2-NEXT: movl %edx, %r15d -; SSE2-NEXT: .LBB16_20: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %r9b -; SSE2-NEXT: jb .LBB16_22 -; SSE2-NEXT: # %bb.21: -; SSE2-NEXT: movl %edx, %r9d -; SSE2-NEXT: .LBB16_22: -; SSE2-NEXT: movzbl %dil, %edi -; SSE2-NEXT: movzbl %al, %esi -; SSE2-NEXT: movzbl %cl, %ebp -; SSE2-NEXT: movzbl %r10b, %edx -; SSE2-NEXT: movzbl %r11b, %ebx -; SSE2-NEXT: movzbl %r12b, %r10d -; SSE2-NEXT: movzbl %r13b, %r11d -; SSE2-NEXT: movzbl %r8b, %r8d -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB16_24 -; SSE2-NEXT: # %bb.23: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB16_24: -; SSE2-NEXT: movd %edi, %xmm2 -; SSE2-NEXT: movd %esi, %xmm3 -; SSE2-NEXT: movd %ebp, %xmm5 -; SSE2-NEXT: movd %edx, %xmm0 -; SSE2-NEXT: movd %ebx, %xmm6 -; SSE2-NEXT: movd %r10d, %xmm4 -; SSE2-NEXT: movd %r11d, %xmm7 -; SSE2-NEXT: movd %r8d, %xmm1 -; SSE2-NEXT: movzbl %r14b, %esi -; SSE2-NEXT: movzbl %r15b, %edx -; SSE2-NEXT: movzbl %r9b, %eax -; SSE2-NEXT: movzbl %cl, %edi -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb $-1, %bl -; SSE2-NEXT: jb .LBB16_26 -; SSE2-NEXT: # %bb.25: -; SSE2-NEXT: movl %ecx, %ebx -; SSE2-NEXT: .LBB16_26: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE2-NEXT: 
punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE2-NEXT: movd %esi, %xmm6 -; SSE2-NEXT: movd %edx, %xmm5 -; SSE2-NEXT: movd %eax, %xmm7 -; SSE2-NEXT: movd %edi, %xmm2 -; SSE2-NEXT: movzbl %bl, %eax -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB16_28 -; SSE2-NEXT: # %bb.27: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB16_28: -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB16_30 -; SSE2-NEXT: # %bb.29: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB16_30: -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: movzbl %cl, %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb $-1, %cl -; SSE2-NEXT: jb .LBB16_32 -; SSE2-NEXT: # %bb.31: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB16_32: -; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: psrlw $7, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v16i1: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: psllw $7, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: psllw $7, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %dil -; SSSE3-NEXT: jb .LBB16_2 -; SSSE3-NEXT: 
# %bb.1: -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB16_2: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb $-1, %al -; SSSE3-NEXT: jb .LBB16_4 -; SSSE3-NEXT: # %bb.3: -; SSSE3-NEXT: movl %ecx, %eax -; SSSE3-NEXT: .LBB16_4: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB16_6 -; SSSE3-NEXT: # %bb.5: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB16_6: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r10b -; SSSE3-NEXT: jb .LBB16_8 -; SSSE3-NEXT: # %bb.7: -; SSSE3-NEXT: movl %edx, %r10d -; SSSE3-NEXT: .LBB16_8: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r11b -; SSSE3-NEXT: jb .LBB16_10 -; SSSE3-NEXT: # %bb.9: -; SSSE3-NEXT: movl %edx, %r11d -; SSSE3-NEXT: .LBB16_10: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r12b -; SSSE3-NEXT: jb .LBB16_12 -; SSSE3-NEXT: # %bb.11: -; SSSE3-NEXT: movl %edx, %r12d -; SSSE3-NEXT: .LBB16_12: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r13b -; SSSE3-NEXT: jb .LBB16_14 -; SSSE3-NEXT: # %bb.13: -; SSSE3-NEXT: movl %edx, %r13d -; SSSE3-NEXT: .LBB16_14: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r8b -; SSSE3-NEXT: jb .LBB16_16 -; SSSE3-NEXT: # %bb.15: -; SSSE3-NEXT: movl %edx, %r8d -; SSSE3-NEXT: .LBB16_16: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r14b -; SSSE3-NEXT: jb .LBB16_18 -; SSSE3-NEXT: # %bb.17: -; SSSE3-NEXT: movl %edx, %r14d -; SSSE3-NEXT: .LBB16_18: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r15b -; SSSE3-NEXT: jb .LBB16_20 -; SSSE3-NEXT: # %bb.19: -; SSSE3-NEXT: movl %edx, %r15d -; SSSE3-NEXT: .LBB16_20: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %r9b -; SSSE3-NEXT: jb .LBB16_22 -; SSSE3-NEXT: # %bb.21: -; SSSE3-NEXT: movl %edx, %r9d -; SSSE3-NEXT: .LBB16_22: -; SSSE3-NEXT: movzbl %dil, %edi -; SSSE3-NEXT: movzbl %al, %esi -; SSSE3-NEXT: movzbl %cl, %ebp -; SSSE3-NEXT: movzbl %r10b, %edx -; SSSE3-NEXT: movzbl %r11b, %ebx -; SSSE3-NEXT: movzbl %r12b, %r10d -; SSSE3-NEXT: movzbl %r13b, %r11d -; SSSE3-NEXT: movzbl %r8b, %r8d -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB16_24 -; SSSE3-NEXT: # %bb.23: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB16_24: -; SSSE3-NEXT: movd %edi, %xmm2 -; SSSE3-NEXT: movd %esi, %xmm3 -; SSSE3-NEXT: movd %ebp, %xmm5 -; SSSE3-NEXT: movd %edx, %xmm0 -; SSSE3-NEXT: movd %ebx, %xmm6 -; SSSE3-NEXT: movd %r10d, %xmm4 -; SSSE3-NEXT: movd %r11d, %xmm7 -; SSSE3-NEXT: movd %r8d, %xmm1 -; SSSE3-NEXT: movzbl %r14b, %esi -; SSSE3-NEXT: movzbl %r15b, %edx -; SSSE3-NEXT: movzbl %r9b, %eax -; SSSE3-NEXT: movzbl %cl, %edi -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb $-1, %bl -; SSSE3-NEXT: jb .LBB16_26 -; SSSE3-NEXT: # %bb.25: -; SSSE3-NEXT: movl %ecx, %ebx -; SSSE3-NEXT: .LBB16_26: -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSSE3-NEXT: movd %esi, %xmm6 -; SSSE3-NEXT: movd %edx, %xmm5 -; SSSE3-NEXT: movd %eax, %xmm7 -; SSSE3-NEXT: movd %edi, %xmm2 -; SSSE3-NEXT: movzbl %bl, %eax -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB16_28 -; SSSE3-NEXT: # %bb.27: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB16_28: -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: movzbl %cl, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB16_30 -; SSSE3-NEXT: # %bb.29: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB16_30: -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSSE3-NEXT: movzbl %cl, %ecx -; SSSE3-NEXT: movd %ecx, %xmm4 -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb $-1, %cl -; SSSE3-NEXT: jb .LBB16_32 -; SSSE3-NEXT: # %bb.31: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB16_32: -; SSSE3-NEXT: movzbl %cl, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: psrlw $7, %xmm0 -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v16i1: -; SSE41: # %bb.0: -; SSE41-NEXT: psllw $7, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pextrb $1, %xmm1, %eax -; SSE41-NEXT: psllw $7, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; 
SSE41-NEXT: pextrb $1, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %sil -; SSE41-NEXT: movb $-1, %dl -; SSE41-NEXT: jb .LBB16_2 -; SSE41-NEXT: # %bb.1: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB16_2: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: pextrb $0, %xmm0, %edx -; SSE41-NEXT: addb %al, %dl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB16_4 -; SSE41-NEXT: # %bb.3: -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: .LBB16_4: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: movd %eax, %xmm2 -; SSE41-NEXT: pinsrb $1, %ecx, %xmm2 -; SSE41-NEXT: pextrb $2, %xmm1, %eax -; SSE41-NEXT: pextrb $2, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB16_6 -; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB16_6: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $2, %eax, %xmm2 -; SSE41-NEXT: pextrb $3, %xmm1, %eax -; SSE41-NEXT: pextrb $3, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB16_8 -; SSE41-NEXT: # %bb.7: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB16_8: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $3, %eax, %xmm2 -; SSE41-NEXT: pextrb $4, %xmm1, %eax -; SSE41-NEXT: pextrb $4, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB16_10 -; SSE41-NEXT: # %bb.9: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB16_10: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $4, %eax, %xmm2 -; SSE41-NEXT: pextrb $5, %xmm1, %eax -; SSE41-NEXT: pextrb $5, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB16_12 -; SSE41-NEXT: # %bb.11: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB16_12: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $5, %eax, %xmm2 -; SSE41-NEXT: pextrb $6, %xmm1, %eax -; SSE41-NEXT: pextrb $6, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB16_14 -; SSE41-NEXT: # %bb.13: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB16_14: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $6, %eax, %xmm2 -; SSE41-NEXT: pextrb $7, %xmm1, %eax -; SSE41-NEXT: pextrb $7, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB16_16 -; SSE41-NEXT: # %bb.15: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB16_16: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $7, %eax, %xmm2 -; SSE41-NEXT: pextrb $8, %xmm1, %eax -; SSE41-NEXT: pextrb $8, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB16_18 -; SSE41-NEXT: # %bb.17: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB16_18: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $8, %eax, %xmm2 -; SSE41-NEXT: pextrb $9, %xmm1, %eax -; SSE41-NEXT: pextrb $9, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB16_20 -; SSE41-NEXT: # %bb.19: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB16_20: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $9, %eax, %xmm2 -; SSE41-NEXT: pextrb $10, %xmm1, %eax -; SSE41-NEXT: pextrb $10, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB16_22 -; SSE41-NEXT: # %bb.21: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB16_22: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $10, %eax, %xmm2 -; SSE41-NEXT: pextrb $11, %xmm1, %eax -; SSE41-NEXT: pextrb $11, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb 
$-1, %al -; SSE41-NEXT: jb .LBB16_24 -; SSE41-NEXT: # %bb.23: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB16_24: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $11, %eax, %xmm2 -; SSE41-NEXT: pextrb $12, %xmm1, %eax -; SSE41-NEXT: pextrb $12, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB16_26 -; SSE41-NEXT: # %bb.25: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB16_26: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $12, %eax, %xmm2 -; SSE41-NEXT: pextrb $13, %xmm1, %eax -; SSE41-NEXT: pextrb $13, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB16_28 -; SSE41-NEXT: # %bb.27: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB16_28: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $13, %eax, %xmm2 -; SSE41-NEXT: pextrb $14, %xmm1, %eax -; SSE41-NEXT: pextrb $14, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: movb $-1, %al -; SSE41-NEXT: jb .LBB16_30 -; SSE41-NEXT: # %bb.29: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB16_30: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $14, %eax, %xmm2 -; SSE41-NEXT: pextrb $15, %xmm1, %eax -; SSE41-NEXT: pextrb $15, %xmm0, %ecx -; SSE41-NEXT: addb %al, %cl -; SSE41-NEXT: jb .LBB16_32 -; SSE41-NEXT: # %bb.31: -; SSE41-NEXT: movl %ecx, %esi -; SSE41-NEXT: .LBB16_32: -; SSE41-NEXT: movzbl %sil, %eax -; SSE41-NEXT: pinsrb $15, %eax, %xmm2 -; SSE41-NEXT: psrlw $7, %xmm2 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: v16i1: +; SSE: # %bb.0: +; SSE-NEXT: psllw $7, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: paddusb %xmm1, %xmm0 +; SSE-NEXT: psrlw $7, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: v16i1: ; AVX1: # %bb.0: ; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $1, %xmm1, %eax ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $1, %xmm0, %ecx -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: movb $-1, %sil -; AVX1-NEXT: movb $-1, %dl -; AVX1-NEXT: jb .LBB16_2 -; AVX1-NEXT: # %bb.1: -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: .LBB16_2: -; AVX1-NEXT: movzbl %dl, %ecx -; AVX1-NEXT: vpextrb $0, %xmm1, %eax -; AVX1-NEXT: vpextrb $0, %xmm0, %edx -; AVX1-NEXT: addb %al, %dl -; AVX1-NEXT: movb $-1, %al -; AVX1-NEXT: jb .LBB16_4 -; AVX1-NEXT: # %bb.3: -; AVX1-NEXT: movl %edx, %eax -; AVX1-NEXT: .LBB16_4: -; AVX1-NEXT: movzbl %al, %eax -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $2, %xmm1, %eax -; AVX1-NEXT: vpextrb $2, %xmm0, %ecx -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: movb $-1, %al -; AVX1-NEXT: jb .LBB16_6 -; AVX1-NEXT: # %bb.5: -; AVX1-NEXT: movl %ecx, %eax -; AVX1-NEXT: .LBB16_6: -; AVX1-NEXT: movzbl %al, %eax -; AVX1-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $3, %xmm1, %eax -; AVX1-NEXT: vpextrb $3, %xmm0, %ecx -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: movb $-1, %al -; AVX1-NEXT: jb .LBB16_8 -; AVX1-NEXT: # %bb.7: -; AVX1-NEXT: movl %ecx, %eax -; AVX1-NEXT: .LBB16_8: -; AVX1-NEXT: movzbl %al, %eax -; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $4, %xmm1, %eax -; AVX1-NEXT: vpextrb $4, 
%xmm0, %ecx -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: movb $-1, %al -; AVX1-NEXT: jb .LBB16_10 -; AVX1-NEXT: # %bb.9: -; AVX1-NEXT: movl %ecx, %eax -; AVX1-NEXT: .LBB16_10: -; AVX1-NEXT: movzbl %al, %eax -; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $5, %xmm1, %eax -; AVX1-NEXT: vpextrb $5, %xmm0, %ecx -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: movb $-1, %al -; AVX1-NEXT: jb .LBB16_12 -; AVX1-NEXT: # %bb.11: -; AVX1-NEXT: movl %ecx, %eax -; AVX1-NEXT: .LBB16_12: -; AVX1-NEXT: movzbl %al, %eax -; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $6, %xmm1, %eax -; AVX1-NEXT: vpextrb $6, %xmm0, %ecx -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: movb $-1, %al -; AVX1-NEXT: jb .LBB16_14 -; AVX1-NEXT: # %bb.13: -; AVX1-NEXT: movl %ecx, %eax -; AVX1-NEXT: .LBB16_14: -; AVX1-NEXT: movzbl %al, %eax -; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $7, %xmm1, %eax -; AVX1-NEXT: vpextrb $7, %xmm0, %ecx -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: movb $-1, %al -; AVX1-NEXT: jb .LBB16_16 -; AVX1-NEXT: # %bb.15: -; AVX1-NEXT: movl %ecx, %eax -; AVX1-NEXT: .LBB16_16: -; AVX1-NEXT: movzbl %al, %eax -; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $8, %xmm1, %eax -; AVX1-NEXT: vpextrb $8, %xmm0, %ecx -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: movb $-1, %al -; AVX1-NEXT: jb .LBB16_18 -; AVX1-NEXT: # %bb.17: -; AVX1-NEXT: movl %ecx, %eax -; AVX1-NEXT: .LBB16_18: -; AVX1-NEXT: movzbl %al, %eax -; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $9, %xmm1, %eax -; AVX1-NEXT: vpextrb $9, %xmm0, %ecx -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: movb $-1, %al -; AVX1-NEXT: jb .LBB16_20 -; AVX1-NEXT: # %bb.19: -; AVX1-NEXT: movl %ecx, %eax -; AVX1-NEXT: .LBB16_20: -; AVX1-NEXT: movzbl %al, %eax -; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $10, %xmm1, %eax -; AVX1-NEXT: vpextrb $10, %xmm0, %ecx -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: movb $-1, %al -; AVX1-NEXT: jb .LBB16_22 -; AVX1-NEXT: # %bb.21: -; AVX1-NEXT: movl %ecx, %eax -; AVX1-NEXT: .LBB16_22: -; AVX1-NEXT: movzbl %al, %eax -; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $11, %xmm1, %eax -; AVX1-NEXT: vpextrb $11, %xmm0, %ecx -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: movb $-1, %al -; AVX1-NEXT: jb .LBB16_24 -; AVX1-NEXT: # %bb.23: -; AVX1-NEXT: movl %ecx, %eax -; AVX1-NEXT: .LBB16_24: -; AVX1-NEXT: movzbl %al, %eax -; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: vpextrb $12, %xmm0, %ecx -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: movb $-1, %al -; AVX1-NEXT: jb .LBB16_26 -; AVX1-NEXT: # %bb.25: -; AVX1-NEXT: movl %ecx, %eax -; AVX1-NEXT: .LBB16_26: -; AVX1-NEXT: movzbl %al, %eax -; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $13, %xmm1, %eax -; AVX1-NEXT: vpextrb $13, %xmm0, %ecx -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: movb $-1, %al -; AVX1-NEXT: jb .LBB16_28 -; AVX1-NEXT: # %bb.27: -; AVX1-NEXT: movl %ecx, %eax -; AVX1-NEXT: .LBB16_28: -; AVX1-NEXT: movzbl %al, %eax -; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $14, %xmm1, %eax -; AVX1-NEXT: vpextrb $14, %xmm0, %ecx -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: movb $-1, %al -; AVX1-NEXT: jb .LBB16_30 -; AVX1-NEXT: # %bb.29: -; AVX1-NEXT: movl %ecx, %eax -; AVX1-NEXT: .LBB16_30: -; AVX1-NEXT: movzbl %al, %eax -; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $15, %xmm1, %eax -; AVX1-NEXT: vpextrb $15, %xmm0, %ecx -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: jb .LBB16_32 -; AVX1-NEXT: # 
%bb.31: -; AVX1-NEXT: movl %ecx, %esi -; AVX1-NEXT: .LBB16_32: -; AVX1-NEXT: movzbl %sil, %eax -; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX1-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -12743,483 +604,21 @@ ; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $1, %xmm1, %eax ; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $1, %xmm0, %ecx -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: movb $-1, %sil -; AVX2-NEXT: movb $-1, %dl -; AVX2-NEXT: jb .LBB16_2 -; AVX2-NEXT: # %bb.1: -; AVX2-NEXT: movl %ecx, %edx -; AVX2-NEXT: .LBB16_2: -; AVX2-NEXT: movzbl %dl, %ecx -; AVX2-NEXT: vpextrb $0, %xmm1, %eax -; AVX2-NEXT: vpextrb $0, %xmm0, %edx -; AVX2-NEXT: addb %al, %dl -; AVX2-NEXT: movb $-1, %al -; AVX2-NEXT: jb .LBB16_4 -; AVX2-NEXT: # %bb.3: -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: .LBB16_4: -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: vmovd %eax, %xmm2 -; AVX2-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $2, %xmm1, %eax -; AVX2-NEXT: vpextrb $2, %xmm0, %ecx -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: movb $-1, %al -; AVX2-NEXT: jb .LBB16_6 -; AVX2-NEXT: # %bb.5: -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: .LBB16_6: -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $3, %xmm1, %eax -; AVX2-NEXT: vpextrb $3, %xmm0, %ecx -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: movb $-1, %al -; AVX2-NEXT: jb .LBB16_8 -; AVX2-NEXT: # %bb.7: -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: .LBB16_8: -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $4, %xmm1, %eax -; AVX2-NEXT: vpextrb $4, %xmm0, %ecx -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: movb $-1, %al -; AVX2-NEXT: jb .LBB16_10 -; AVX2-NEXT: # %bb.9: -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: .LBB16_10: -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $5, %xmm1, %eax -; AVX2-NEXT: vpextrb $5, %xmm0, %ecx -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: movb $-1, %al -; AVX2-NEXT: jb .LBB16_12 -; AVX2-NEXT: # %bb.11: -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: .LBB16_12: -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $6, %xmm1, %eax -; AVX2-NEXT: vpextrb $6, %xmm0, %ecx -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: movb $-1, %al -; AVX2-NEXT: jb .LBB16_14 -; AVX2-NEXT: # %bb.13: -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: .LBB16_14: -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $7, %xmm1, %eax -; AVX2-NEXT: vpextrb $7, %xmm0, %ecx -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: movb $-1, %al -; AVX2-NEXT: jb .LBB16_16 -; AVX2-NEXT: # %bb.15: -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: .LBB16_16: -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: vpextrb $8, %xmm0, %ecx -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: movb $-1, %al -; AVX2-NEXT: jb .LBB16_18 -; AVX2-NEXT: # %bb.17: -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: .LBB16_18: -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $9, %xmm1, %eax -; AVX2-NEXT: vpextrb $9, %xmm0, %ecx -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: movb $-1, %al -; AVX2-NEXT: jb .LBB16_20 
-; AVX2-NEXT: # %bb.19: -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: .LBB16_20: -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $10, %xmm1, %eax -; AVX2-NEXT: vpextrb $10, %xmm0, %ecx -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: movb $-1, %al -; AVX2-NEXT: jb .LBB16_22 -; AVX2-NEXT: # %bb.21: -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: .LBB16_22: -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $11, %xmm1, %eax -; AVX2-NEXT: vpextrb $11, %xmm0, %ecx -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: movb $-1, %al -; AVX2-NEXT: jb .LBB16_24 -; AVX2-NEXT: # %bb.23: -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: .LBB16_24: -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: vpextrb $12, %xmm0, %ecx -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: movb $-1, %al -; AVX2-NEXT: jb .LBB16_26 -; AVX2-NEXT: # %bb.25: -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: .LBB16_26: -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $13, %xmm1, %eax -; AVX2-NEXT: vpextrb $13, %xmm0, %ecx -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: movb $-1, %al -; AVX2-NEXT: jb .LBB16_28 -; AVX2-NEXT: # %bb.27: -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: .LBB16_28: -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $14, %xmm1, %eax -; AVX2-NEXT: vpextrb $14, %xmm0, %ecx -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: movb $-1, %al -; AVX2-NEXT: jb .LBB16_30 -; AVX2-NEXT: # %bb.29: -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: .LBB16_30: -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $15, %xmm1, %eax -; AVX2-NEXT: vpextrb $15, %xmm0, %ecx -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: jb .LBB16_32 -; AVX2-NEXT: # %bb.31: -; AVX2-NEXT: movl %ecx, %esi -; AVX2-NEXT: .LBB16_32: -; AVX2-NEXT: movzbl %sil, %eax -; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v16i1: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX512-NEXT: vpmovb2m %xmm1, %k0 ; AVX512-NEXT: vpsllw $7, %xmm0, %xmm0 -; AVX512-NEXT: vpmovb2m %xmm0, %k0 -; AVX512-NEXT: vpsllw $7, %xmm1, %xmm0 ; AVX512-NEXT: vpmovb2m %xmm0, %k1 -; AVX512-NEXT: kshiftrw $4, %k0, %k2 -; AVX512-NEXT: kshiftrw $4, %k1, %k3 -; AVX512-NEXT: kshiftrw $3, %k0, %k4 -; AVX512-NEXT: kmovd %k4, %r15d -; AVX512-NEXT: kshiftrw $3, %k1, %k4 -; AVX512-NEXT: kmovd %k4, %r9d -; AVX512-NEXT: kshiftrw $2, %k0, %k4 -; AVX512-NEXT: kmovd %k4, %eax -; AVX512-NEXT: kshiftrw $2, %k1, %k4 -; AVX512-NEXT: kmovd %k4, %ebp -; AVX512-NEXT: kmovd %k0, %ecx -; AVX512-NEXT: kmovd %k1, %esi -; AVX512-NEXT: kshiftrw $1, %k0, %k4 -; AVX512-NEXT: kmovd %k4, %edi -; AVX512-NEXT: kshiftrw $1, %k1, %k4 -; AVX512-NEXT: kmovd %k4, %edx -; AVX512-NEXT: shlb $7, %dl -; AVX512-NEXT: shlb $7, %dil -; AVX512-NEXT: addb %dl, %dil -; AVX512-NEXT: movb $-1, %r8b -; AVX512-NEXT: movb $-1, %bl -; AVX512-NEXT: jb .LBB16_2 -; AVX512-NEXT: # %bb.1: -; AVX512-NEXT: movl %edi, %ebx -; AVX512-NEXT: .LBB16_2: -; AVX512-NEXT: kshiftrw $5, %k0, %k4 -; AVX512-NEXT: kshiftrw $5, %k1, %k5 -; AVX512-NEXT: kmovd %k2, %edi -; AVX512-NEXT: kmovd %k3, %r11d -; AVX512-NEXT: shrb $7, %bl -; 
AVX512-NEXT: kmovd %ebx, %k6 -; AVX512-NEXT: shlb $7, %sil -; AVX512-NEXT: shlb $7, %cl -; AVX512-NEXT: addb %sil, %cl -; AVX512-NEXT: movb $-1, %dl -; AVX512-NEXT: jb .LBB16_4 -; AVX512-NEXT: # %bb.3: -; AVX512-NEXT: movl %ecx, %edx -; AVX512-NEXT: .LBB16_4: -; AVX512-NEXT: kshiftrw $6, %k0, %k2 -; AVX512-NEXT: kshiftrw $6, %k1, %k3 -; AVX512-NEXT: kmovd %k4, %esi -; AVX512-NEXT: kmovd %k5, %r14d -; AVX512-NEXT: shrb $7, %dl -; AVX512-NEXT: kmovd %edx, %k4 -; AVX512-NEXT: kshiftrw $1, %k4, %k5 -; AVX512-NEXT: kxorw %k6, %k5, %k5 -; AVX512-NEXT: kshiftlw $15, %k5, %k5 -; AVX512-NEXT: kshiftrw $14, %k5, %k5 -; AVX512-NEXT: kxorw %k5, %k4, %k6 -; AVX512-NEXT: kshiftrw $2, %k6, %k7 -; AVX512-NEXT: shlb $7, %bpl -; AVX512-NEXT: shlb $7, %al -; AVX512-NEXT: addb %bpl, %al -; AVX512-NEXT: movb $-1, %cl -; AVX512-NEXT: jb .LBB16_6 -; AVX512-NEXT: # %bb.5: -; AVX512-NEXT: movl %eax, %ecx -; AVX512-NEXT: .LBB16_6: -; AVX512-NEXT: kshiftrw $7, %k0, %k4 -; AVX512-NEXT: kshiftrw $7, %k1, %k5 -; AVX512-NEXT: kmovd %k2, %eax -; AVX512-NEXT: kmovd %k3, %r10d -; AVX512-NEXT: shrb $7, %cl -; AVX512-NEXT: kmovd %ecx, %k2 -; AVX512-NEXT: kxorw %k2, %k7, %k2 -; AVX512-NEXT: kshiftlw $15, %k2, %k2 -; AVX512-NEXT: kshiftrw $13, %k2, %k2 -; AVX512-NEXT: kxorw %k2, %k6, %k6 -; AVX512-NEXT: kshiftrw $3, %k6, %k7 -; AVX512-NEXT: shlb $7, %r9b -; AVX512-NEXT: shlb $7, %r15b -; AVX512-NEXT: addb %r9b, %r15b -; AVX512-NEXT: movb $-1, %dl -; AVX512-NEXT: jb .LBB16_8 -; AVX512-NEXT: # %bb.7: -; AVX512-NEXT: movl %r15d, %edx -; AVX512-NEXT: .LBB16_8: -; AVX512-NEXT: kshiftrw $8, %k0, %k2 -; AVX512-NEXT: kshiftrw $8, %k1, %k3 -; AVX512-NEXT: kmovd %k4, %ecx -; AVX512-NEXT: kmovd %k5, %r9d -; AVX512-NEXT: shrb $7, %dl -; AVX512-NEXT: kmovd %edx, %k4 -; AVX512-NEXT: kxorw %k4, %k7, %k4 -; AVX512-NEXT: kshiftlw $15, %k4, %k4 -; AVX512-NEXT: kshiftrw $12, %k4, %k4 -; AVX512-NEXT: kxorw %k4, %k6, %k6 -; AVX512-NEXT: kshiftrw $4, %k6, %k7 -; AVX512-NEXT: shlb $7, %r11b -; AVX512-NEXT: shlb $7, %dil -; AVX512-NEXT: addb %r11b, %dil -; AVX512-NEXT: movb $-1, %dl -; AVX512-NEXT: jb .LBB16_10 -; AVX512-NEXT: # %bb.9: -; AVX512-NEXT: movl %edi, %edx -; AVX512-NEXT: .LBB16_10: -; AVX512-NEXT: kshiftrw $9, %k0, %k4 -; AVX512-NEXT: kshiftrw $9, %k1, %k5 -; AVX512-NEXT: kmovd %k2, %edi -; AVX512-NEXT: kmovd %k3, %ebx -; AVX512-NEXT: shrb $7, %dl -; AVX512-NEXT: kmovd %edx, %k2 -; AVX512-NEXT: kxorw %k2, %k7, %k2 -; AVX512-NEXT: kshiftlw $15, %k2, %k2 -; AVX512-NEXT: kshiftrw $11, %k2, %k2 -; AVX512-NEXT: kxorw %k2, %k6, %k6 -; AVX512-NEXT: kshiftrw $5, %k6, %k7 -; AVX512-NEXT: shlb $7, %r14b -; AVX512-NEXT: shlb $7, %sil -; AVX512-NEXT: addb %r14b, %sil -; AVX512-NEXT: movb $-1, %dl -; AVX512-NEXT: jb .LBB16_12 -; AVX512-NEXT: # %bb.11: -; AVX512-NEXT: movl %esi, %edx -; AVX512-NEXT: .LBB16_12: -; AVX512-NEXT: kshiftrw $10, %k0, %k2 -; AVX512-NEXT: kshiftrw $10, %k1, %k3 -; AVX512-NEXT: kmovd %k4, %esi -; AVX512-NEXT: kmovd %k5, %r11d -; AVX512-NEXT: shrb $7, %dl -; AVX512-NEXT: kmovd %edx, %k4 -; AVX512-NEXT: kxorw %k4, %k7, %k4 -; AVX512-NEXT: kshiftlw $15, %k4, %k4 -; AVX512-NEXT: kshiftrw $10, %k4, %k4 -; AVX512-NEXT: kxorw %k4, %k6, %k6 -; AVX512-NEXT: kshiftrw $6, %k6, %k7 -; AVX512-NEXT: shlb $7, %r10b -; AVX512-NEXT: shlb $7, %al -; AVX512-NEXT: addb %r10b, %al -; AVX512-NEXT: movb $-1, %bpl -; AVX512-NEXT: jb .LBB16_14 -; AVX512-NEXT: # %bb.13: -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: .LBB16_14: -; AVX512-NEXT: kshiftrw $11, %k0, %k4 -; AVX512-NEXT: kshiftrw $11, %k1, %k5 -; AVX512-NEXT: kmovd %k2, %r15d -; AVX512-NEXT: 
kmovd %k3, %r10d -; AVX512-NEXT: shrb $7, %bpl -; AVX512-NEXT: kmovd %ebp, %k2 -; AVX512-NEXT: kxorw %k2, %k7, %k2 -; AVX512-NEXT: kshiftlw $15, %k2, %k2 -; AVX512-NEXT: kshiftrw $9, %k2, %k2 -; AVX512-NEXT: kxorw %k2, %k6, %k6 -; AVX512-NEXT: kshiftrw $7, %k6, %k7 -; AVX512-NEXT: shlb $7, %r9b -; AVX512-NEXT: shlb $7, %cl -; AVX512-NEXT: addb %r9b, %cl -; AVX512-NEXT: movb $-1, %al -; AVX512-NEXT: jb .LBB16_16 -; AVX512-NEXT: # %bb.15: -; AVX512-NEXT: movl %ecx, %eax -; AVX512-NEXT: .LBB16_16: -; AVX512-NEXT: kshiftrw $12, %k0, %k2 -; AVX512-NEXT: kshiftrw $12, %k1, %k3 -; AVX512-NEXT: kmovd %k4, %ecx -; AVX512-NEXT: kmovd %k5, %r9d -; AVX512-NEXT: shrb $7, %al -; AVX512-NEXT: kmovd %eax, %k4 -; AVX512-NEXT: kxorw %k4, %k7, %k4 -; AVX512-NEXT: kshiftlw $15, %k4, %k4 -; AVX512-NEXT: kshiftrw $8, %k4, %k4 -; AVX512-NEXT: kxorw %k4, %k6, %k6 -; AVX512-NEXT: kshiftrw $8, %k6, %k7 -; AVX512-NEXT: shlb $7, %bl -; AVX512-NEXT: shlb $7, %dil -; AVX512-NEXT: addb %bl, %dil -; AVX512-NEXT: movb $-1, %bl -; AVX512-NEXT: jb .LBB16_18 -; AVX512-NEXT: # %bb.17: -; AVX512-NEXT: movl %edi, %ebx -; AVX512-NEXT: .LBB16_18: -; AVX512-NEXT: kshiftrw $13, %k0, %k4 -; AVX512-NEXT: kshiftrw $13, %k1, %k5 -; AVX512-NEXT: kmovd %k2, %eax -; AVX512-NEXT: kmovd %k3, %r14d -; AVX512-NEXT: shrb $7, %bl -; AVX512-NEXT: kmovd %ebx, %k2 -; AVX512-NEXT: kxorw %k2, %k7, %k2 -; AVX512-NEXT: kshiftlw $15, %k2, %k2 -; AVX512-NEXT: kshiftrw $7, %k2, %k2 -; AVX512-NEXT: kxorw %k2, %k6, %k6 -; AVX512-NEXT: kshiftrw $9, %k6, %k7 -; AVX512-NEXT: shlb $7, %r11b -; AVX512-NEXT: shlb $7, %sil -; AVX512-NEXT: addb %r11b, %sil -; AVX512-NEXT: movb $-1, %dil -; AVX512-NEXT: jb .LBB16_20 -; AVX512-NEXT: # %bb.19: -; AVX512-NEXT: movl %esi, %edi -; AVX512-NEXT: .LBB16_20: -; AVX512-NEXT: kshiftrw $14, %k0, %k2 -; AVX512-NEXT: kshiftrw $14, %k1, %k3 -; AVX512-NEXT: kmovd %k4, %esi -; AVX512-NEXT: kmovd %k5, %r11d -; AVX512-NEXT: shrb $7, %dil -; AVX512-NEXT: kmovd %edi, %k4 -; AVX512-NEXT: kxorw %k4, %k7, %k4 -; AVX512-NEXT: kshiftlw $15, %k4, %k4 -; AVX512-NEXT: kshiftrw $6, %k4, %k4 -; AVX512-NEXT: kxorw %k4, %k6, %k4 -; AVX512-NEXT: kshiftrw $10, %k4, %k5 -; AVX512-NEXT: shlb $7, %r10b -; AVX512-NEXT: shlb $7, %r15b -; AVX512-NEXT: addb %r10b, %r15b -; AVX512-NEXT: movb $-1, %dl -; AVX512-NEXT: jb .LBB16_22 -; AVX512-NEXT: # %bb.21: -; AVX512-NEXT: movl %r15d, %edx -; AVX512-NEXT: .LBB16_22: -; AVX512-NEXT: kshiftrw $15, %k0, %k0 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovd %k2, %ebx -; AVX512-NEXT: kmovd %k3, %edi -; AVX512-NEXT: shrb $7, %dl -; AVX512-NEXT: kmovd %edx, %k2 -; AVX512-NEXT: kxorw %k2, %k5, %k2 -; AVX512-NEXT: kshiftlw $15, %k2, %k2 -; AVX512-NEXT: kshiftrw $5, %k2, %k2 -; AVX512-NEXT: kxorw %k2, %k4, %k2 -; AVX512-NEXT: kshiftrw $11, %k2, %k3 -; AVX512-NEXT: shlb $7, %r9b -; AVX512-NEXT: shlb $7, %cl -; AVX512-NEXT: addb %r9b, %cl -; AVX512-NEXT: movb $-1, %dl -; AVX512-NEXT: jb .LBB16_24 -; AVX512-NEXT: # %bb.23: -; AVX512-NEXT: movl %ecx, %edx -; AVX512-NEXT: .LBB16_24: -; AVX512-NEXT: kmovd %k0, %ecx -; AVX512-NEXT: kmovd %k1, %ebp -; AVX512-NEXT: shrb $7, %dl -; AVX512-NEXT: kmovd %edx, %k0 -; AVX512-NEXT: kxorw %k0, %k3, %k0 -; AVX512-NEXT: kshiftlw $15, %k0, %k0 -; AVX512-NEXT: kshiftrw $4, %k0, %k0 -; AVX512-NEXT: kxorw %k0, %k2, %k0 -; AVX512-NEXT: kshiftrw $12, %k0, %k1 -; AVX512-NEXT: shlb $7, %r14b -; AVX512-NEXT: shlb $7, %al -; AVX512-NEXT: addb %r14b, %al -; AVX512-NEXT: movb $-1, %dl -; AVX512-NEXT: jb .LBB16_26 -; AVX512-NEXT: # %bb.25: -; AVX512-NEXT: movl %eax, %edx -; AVX512-NEXT: 
.LBB16_26: -; AVX512-NEXT: shrb $7, %dl -; AVX512-NEXT: kmovd %edx, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kshiftlw $15, %k1, %k1 -; AVX512-NEXT: kshiftrw $3, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 -; AVX512-NEXT: kshiftrw $13, %k0, %k1 -; AVX512-NEXT: shlb $7, %r11b -; AVX512-NEXT: shlb $7, %sil -; AVX512-NEXT: addb %r11b, %sil -; AVX512-NEXT: movb $-1, %al -; AVX512-NEXT: jb .LBB16_28 -; AVX512-NEXT: # %bb.27: -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: .LBB16_28: -; AVX512-NEXT: shrb $7, %al -; AVX512-NEXT: kmovd %eax, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kshiftlw $15, %k1, %k1 -; AVX512-NEXT: kshiftrw $2, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 -; AVX512-NEXT: kshiftrw $14, %k0, %k1 -; AVX512-NEXT: shlb $7, %dil -; AVX512-NEXT: shlb $7, %bl -; AVX512-NEXT: addb %dil, %bl -; AVX512-NEXT: movb $-1, %al -; AVX512-NEXT: jb .LBB16_30 -; AVX512-NEXT: # %bb.29: -; AVX512-NEXT: movl %ebx, %eax -; AVX512-NEXT: .LBB16_30: -; AVX512-NEXT: shrb $7, %al -; AVX512-NEXT: kmovd %eax, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kshiftlw $15, %k1, %k1 -; AVX512-NEXT: kshiftrw $1, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 -; AVX512-NEXT: kshiftlw $1, %k0, %k0 -; AVX512-NEXT: kshiftrw $1, %k0, %k0 -; AVX512-NEXT: shlb $7, %bpl -; AVX512-NEXT: shlb $7, %cl -; AVX512-NEXT: addb %bpl, %cl -; AVX512-NEXT: jb .LBB16_32 -; AVX512-NEXT: # %bb.31: -; AVX512-NEXT: movl %ecx, %r8d -; AVX512-NEXT: .LBB16_32: -; AVX512-NEXT: shrb $7, %r8b -; AVX512-NEXT: kmovd %r8d, %k1 -; AVX512-NEXT: kshiftlw $15, %k1, %k1 -; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: korw %k0, %k1, %k0 ; AVX512-NEXT: vpmovm2b %k0, %xmm0 -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq %z = call <16 x i1> @llvm.uadd.sat.v16i1(<16 x i1> %x, <16 x i1> %y) ret <16 x i1> %z Index: llvm/trunk/test/CodeGen/X86/usub_sat_vec.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/usub_sat_vec.ll +++ llvm/trunk/test/CodeGen/X86/usub_sat_vec.ll @@ -34,9182 +34,160 @@ ; Legal types, depending on architecture. 
define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { -; SSE2-LABEL: v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %r15 -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r13 -; SSE2-NEXT: pushq %r12 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl $0, %eax -; SSE2-NEXT: jb .LBB0_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: .LBB0_2: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: jb .LBB0_4 -; SSE2-NEXT: # %bb.3: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB0_4: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %edx -; SSE2-NEXT: jb .LBB0_6 -; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: movl %ebx, %edx -; SSE2-NEXT: .LBB0_6: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %esi -; SSE2-NEXT: jb .LBB0_8 -; SSE2-NEXT: # %bb.7: -; SSE2-NEXT: movl %ebx, %esi -; SSE2-NEXT: .LBB0_8: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %edi -; SSE2-NEXT: jb .LBB0_10 -; SSE2-NEXT: # %bb.9: -; SSE2-NEXT: movl %ebx, %edi -; SSE2-NEXT: .LBB0_10: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %r12d -; SSE2-NEXT: jb .LBB0_12 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: movl %ebx, %r12d -; SSE2-NEXT: .LBB0_12: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %r8d -; SSE2-NEXT: jb .LBB0_14 -; SSE2-NEXT: # %bb.13: -; SSE2-NEXT: movl %ebx, %r8d -; SSE2-NEXT: .LBB0_14: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %r10d -; SSE2-NEXT: jb .LBB0_16 -; SSE2-NEXT: # %bb.15: -; SSE2-NEXT: movl %ebx, %r10d -; SSE2-NEXT: .LBB0_16: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %r13d -; SSE2-NEXT: jb .LBB0_18 -; SSE2-NEXT: # %bb.17: -; SSE2-NEXT: movl %ebx, %r13d -; SSE2-NEXT: .LBB0_18: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %r9d -; SSE2-NEXT: jb .LBB0_20 -; SSE2-NEXT: # %bb.19: -; SSE2-NEXT: movl %ebx, %r9d -; SSE2-NEXT: .LBB0_20: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %r11d -; SSE2-NEXT: jb .LBB0_22 -; SSE2-NEXT: # %bb.21: -; SSE2-NEXT: movl %ebx, %r11d -; SSE2-NEXT: .LBB0_22: -; SSE2-NEXT: movzbl %al, %r14d -; SSE2-NEXT: movzbl %cl, %r15d -; SSE2-NEXT: movzbl %dl, %edx -; SSE2-NEXT: movzbl %sil, %esi -; SSE2-NEXT: movzbl %dil, %ebx -; SSE2-NEXT: movzbl %r12b, %ebp -; SSE2-NEXT: movzbl %r8b, %edi -; SSE2-NEXT: movzbl %r10b, %r8d -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl $0, %eax -; SSE2-NEXT: jb .LBB0_24 -; SSE2-NEXT: # %bb.23: -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: .LBB0_24: -; SSE2-NEXT: movd %r14d, %xmm2 -; SSE2-NEXT: movd %r15d, %xmm3 -; SSE2-NEXT: movd %edx, %xmm5 -; SSE2-NEXT: movd %esi, %xmm0 -; SSE2-NEXT: movd %ebx, %xmm6 -; SSE2-NEXT: movd %ebp, %xmm4 -; SSE2-NEXT: movd %edi, %xmm7 -; SSE2-NEXT: movd %r8d, %xmm1 -; SSE2-NEXT: movzbl %r13b, %ebp -; SSE2-NEXT: movzbl %r9b, %ecx -; SSE2-NEXT: movzbl %r11b, %edx -; SSE2-NEXT: 
movzbl %al, %esi -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, %edi -; SSE2-NEXT: jb .LBB0_26 -; SSE2-NEXT: # %bb.25: -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: .LBB0_26: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE2-NEXT: movd %ebp, %xmm6 -; SSE2-NEXT: movd %ecx, %xmm5 -; SSE2-NEXT: movd %edx, %xmm7 -; SSE2-NEXT: movd %esi, %xmm2 -; SSE2-NEXT: movzbl %dil, %eax -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: jb .LBB0_28 -; SSE2-NEXT: # %bb.27: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB0_28: -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: jb .LBB0_30 -; SSE2-NEXT: # %bb.29: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB0_30: -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: movzbl %cl, %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: jb .LBB0_32 -; SSE2-NEXT: # %bb.31: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB0_32: -; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v16i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 
-; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl $0, %eax -; SSSE3-NEXT: jb .LBB0_2 -; SSSE3-NEXT: # %bb.1: -; SSSE3-NEXT: movl %ecx, %eax -; SSSE3-NEXT: .LBB0_2: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB0_4 -; SSSE3-NEXT: # %bb.3: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB0_4: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %edx -; SSSE3-NEXT: jb .LBB0_6 -; SSSE3-NEXT: # %bb.5: -; SSSE3-NEXT: movl %ebx, %edx -; SSSE3-NEXT: .LBB0_6: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %esi -; SSSE3-NEXT: jb .LBB0_8 -; SSSE3-NEXT: # %bb.7: -; SSSE3-NEXT: movl %ebx, %esi -; SSSE3-NEXT: .LBB0_8: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %edi -; SSSE3-NEXT: jb .LBB0_10 -; SSSE3-NEXT: # %bb.9: -; SSSE3-NEXT: movl %ebx, %edi -; SSSE3-NEXT: .LBB0_10: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %r12d -; SSSE3-NEXT: jb .LBB0_12 -; SSSE3-NEXT: # %bb.11: -; SSSE3-NEXT: movl %ebx, %r12d -; SSSE3-NEXT: .LBB0_12: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %r8d -; SSSE3-NEXT: jb .LBB0_14 -; SSSE3-NEXT: # %bb.13: -; SSSE3-NEXT: movl %ebx, %r8d -; SSSE3-NEXT: .LBB0_14: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %r10d -; SSSE3-NEXT: jb .LBB0_16 -; SSSE3-NEXT: # %bb.15: -; SSSE3-NEXT: movl %ebx, %r10d -; SSSE3-NEXT: .LBB0_16: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %r13d -; SSSE3-NEXT: jb .LBB0_18 -; SSSE3-NEXT: # %bb.17: -; SSSE3-NEXT: movl %ebx, %r13d -; SSSE3-NEXT: .LBB0_18: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %r9d -; SSSE3-NEXT: jb .LBB0_20 -; SSSE3-NEXT: # %bb.19: -; SSSE3-NEXT: movl %ebx, %r9d -; SSSE3-NEXT: .LBB0_20: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %r11d -; SSSE3-NEXT: jb .LBB0_22 -; SSSE3-NEXT: # %bb.21: -; SSSE3-NEXT: movl %ebx, %r11d -; SSSE3-NEXT: .LBB0_22: -; SSSE3-NEXT: movzbl %al, %r14d -; SSSE3-NEXT: movzbl %cl, %r15d -; SSSE3-NEXT: movzbl %dl, %edx -; SSSE3-NEXT: movzbl %sil, %esi -; SSSE3-NEXT: movzbl %dil, %ebx -; SSSE3-NEXT: movzbl %r12b, %ebp -; SSSE3-NEXT: movzbl %r8b, %edi -; SSSE3-NEXT: movzbl %r10b, %r8d -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl $0, %eax -; SSSE3-NEXT: jb .LBB0_24 -; SSSE3-NEXT: # %bb.23: -; SSSE3-NEXT: movl %ecx, %eax -; SSSE3-NEXT: .LBB0_24: -; SSSE3-NEXT: movd %r14d, %xmm2 -; SSSE3-NEXT: movd %r15d, %xmm3 -; SSSE3-NEXT: movd %edx, %xmm5 -; SSSE3-NEXT: movd %esi, %xmm0 -; SSSE3-NEXT: movd %ebx, %xmm6 -; SSSE3-NEXT: movd %ebp, %xmm4 -; SSSE3-NEXT: movd %edi, %xmm7 -; SSSE3-NEXT: movd %r8d, %xmm1 -; SSSE3-NEXT: movzbl %r13b, %ebp -; SSSE3-NEXT: movzbl %r9b, %ecx -; SSSE3-NEXT: movzbl %r11b, %edx -; SSSE3-NEXT: movzbl %al, %esi -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl 
$0, %edi -; SSSE3-NEXT: jb .LBB0_26 -; SSSE3-NEXT: # %bb.25: -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB0_26: -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSSE3-NEXT: movd %ebp, %xmm6 -; SSSE3-NEXT: movd %ecx, %xmm5 -; SSSE3-NEXT: movd %edx, %xmm7 -; SSSE3-NEXT: movd %esi, %xmm2 -; SSSE3-NEXT: movzbl %dil, %eax -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB0_28 -; SSSE3-NEXT: # %bb.27: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB0_28: -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: movzbl %cl, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB0_30 -; SSSE3-NEXT: # %bb.29: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB0_30: -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSSE3-NEXT: movzbl %cl, %ecx -; SSSE3-NEXT: movd %ecx, %xmm4 -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB0_32 -; SSSE3-NEXT: # %bb.31: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB0_32: -; SSSE3-NEXT: movzbl %cl, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pextrb $1, %xmm1, %edx -; SSE41-NEXT: pextrb $1, %xmm0, %ecx -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB0_2 
-; SSE41-NEXT: # %bb.1: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB0_2: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pextrb $0, %xmm1, %esi -; SSE41-NEXT: pextrb $0, %xmm0, %edx -; SSE41-NEXT: subb %sil, %dl -; SSE41-NEXT: movl $0, %esi -; SSE41-NEXT: jb .LBB0_4 -; SSE41-NEXT: # %bb.3: -; SSE41-NEXT: movl %edx, %esi -; SSE41-NEXT: .LBB0_4: -; SSE41-NEXT: movzbl %sil, %edx -; SSE41-NEXT: movd %edx, %xmm2 -; SSE41-NEXT: pinsrb $1, %ecx, %xmm2 -; SSE41-NEXT: pextrb $2, %xmm1, %edx -; SSE41-NEXT: pextrb $2, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB0_6 -; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB0_6: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $2, %ecx, %xmm2 -; SSE41-NEXT: pextrb $3, %xmm1, %edx -; SSE41-NEXT: pextrb $3, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB0_8 -; SSE41-NEXT: # %bb.7: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB0_8: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $3, %ecx, %xmm2 -; SSE41-NEXT: pextrb $4, %xmm1, %edx -; SSE41-NEXT: pextrb $4, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB0_10 -; SSE41-NEXT: # %bb.9: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB0_10: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $4, %ecx, %xmm2 -; SSE41-NEXT: pextrb $5, %xmm1, %edx -; SSE41-NEXT: pextrb $5, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB0_12 -; SSE41-NEXT: # %bb.11: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB0_12: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $5, %ecx, %xmm2 -; SSE41-NEXT: pextrb $6, %xmm1, %edx -; SSE41-NEXT: pextrb $6, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB0_14 -; SSE41-NEXT: # %bb.13: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB0_14: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $6, %ecx, %xmm2 -; SSE41-NEXT: pextrb $7, %xmm1, %edx -; SSE41-NEXT: pextrb $7, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB0_16 -; SSE41-NEXT: # %bb.15: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB0_16: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $7, %ecx, %xmm2 -; SSE41-NEXT: pextrb $8, %xmm1, %edx -; SSE41-NEXT: pextrb $8, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB0_18 -; SSE41-NEXT: # %bb.17: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB0_18: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $8, %ecx, %xmm2 -; SSE41-NEXT: pextrb $9, %xmm1, %edx -; SSE41-NEXT: pextrb $9, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB0_20 -; SSE41-NEXT: # %bb.19: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB0_20: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $9, %ecx, %xmm2 -; SSE41-NEXT: pextrb $10, %xmm1, %edx -; SSE41-NEXT: pextrb $10, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB0_22 -; SSE41-NEXT: # %bb.21: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB0_22: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $10, %ecx, %xmm2 -; SSE41-NEXT: pextrb $11, %xmm1, %edx -; SSE41-NEXT: pextrb $11, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB0_24 -; SSE41-NEXT: # %bb.23: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB0_24: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: 
pinsrb $11, %ecx, %xmm2 -; SSE41-NEXT: pextrb $12, %xmm1, %edx -; SSE41-NEXT: pextrb $12, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB0_26 -; SSE41-NEXT: # %bb.25: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB0_26: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $12, %ecx, %xmm2 -; SSE41-NEXT: pextrb $13, %xmm1, %edx -; SSE41-NEXT: pextrb $13, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB0_28 -; SSE41-NEXT: # %bb.27: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB0_28: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $13, %ecx, %xmm2 -; SSE41-NEXT: pextrb $14, %xmm1, %edx -; SSE41-NEXT: pextrb $14, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB0_30 -; SSE41-NEXT: # %bb.29: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB0_30: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $14, %ecx, %xmm2 -; SSE41-NEXT: pextrb $15, %xmm1, %edx -; SSE41-NEXT: pextrb $15, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: jb .LBB0_32 -; SSE41-NEXT: # %bb.31: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB0_32: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $15, %eax, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: v16i8: +; SSE: # %bb.0: +; SSE-NEXT: psubusb %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: v16i8: ; AVX: # %bb.0: -; AVX-NEXT: vpextrb $1, %xmm1, %edx -; AVX-NEXT: vpextrb $1, %xmm0, %ecx -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB0_2 -; AVX-NEXT: # %bb.1: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB0_2: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpextrb $0, %xmm1, %esi -; AVX-NEXT: vpextrb $0, %xmm0, %edx -; AVX-NEXT: subb %sil, %dl -; AVX-NEXT: movl $0, %esi -; AVX-NEXT: jb .LBB0_4 -; AVX-NEXT: # %bb.3: -; AVX-NEXT: movl %edx, %esi -; AVX-NEXT: .LBB0_4: -; AVX-NEXT: movzbl %sil, %edx -; AVX-NEXT: vmovd %edx, %xmm2 -; AVX-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $2, %xmm1, %edx -; AVX-NEXT: vpextrb $2, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB0_6 -; AVX-NEXT: # %bb.5: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB0_6: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $3, %xmm1, %edx -; AVX-NEXT: vpextrb $3, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB0_8 -; AVX-NEXT: # %bb.7: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB0_8: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $4, %xmm1, %edx -; AVX-NEXT: vpextrb $4, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB0_10 -; AVX-NEXT: # %bb.9: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB0_10: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $5, %xmm1, %edx -; AVX-NEXT: vpextrb $5, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB0_12 -; AVX-NEXT: # %bb.11: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB0_12: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $6, %xmm1, %edx -; AVX-NEXT: vpextrb $6, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB0_14 -; AVX-NEXT: # %bb.13: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB0_14: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $6, %ecx, %xmm2, 
%xmm2 -; AVX-NEXT: vpextrb $7, %xmm1, %edx -; AVX-NEXT: vpextrb $7, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB0_16 -; AVX-NEXT: # %bb.15: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB0_16: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $8, %xmm1, %edx -; AVX-NEXT: vpextrb $8, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB0_18 -; AVX-NEXT: # %bb.17: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB0_18: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $9, %xmm1, %edx -; AVX-NEXT: vpextrb $9, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB0_20 -; AVX-NEXT: # %bb.19: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB0_20: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $10, %xmm1, %edx -; AVX-NEXT: vpextrb $10, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB0_22 -; AVX-NEXT: # %bb.21: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB0_22: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $11, %xmm1, %edx -; AVX-NEXT: vpextrb $11, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB0_24 -; AVX-NEXT: # %bb.23: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB0_24: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $12, %xmm1, %edx -; AVX-NEXT: vpextrb $12, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB0_26 -; AVX-NEXT: # %bb.25: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB0_26: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $13, %xmm1, %edx -; AVX-NEXT: vpextrb $13, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB0_28 -; AVX-NEXT: # %bb.27: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB0_28: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $14, %xmm1, %edx -; AVX-NEXT: vpextrb $14, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB0_30 -; AVX-NEXT: # %bb.29: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB0_30: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $15, %xmm1, %edx -; AVX-NEXT: vpextrb $15, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: jb .LBB0_32 -; AVX-NEXT: # %bb.31: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB0_32: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %z = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %x, <16 x i8> %y) ret <16 x i8> %z } define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { -; SSE2-LABEL: v32i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %r15 -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r13 -; SSE2-NEXT: pushq %r12 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: pushq %rax -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, %edx -; SSE2-NEXT: jb .LBB1_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB1_2: -; SSE2-NEXT: 
movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, %r15d -; SSE2-NEXT: jb .LBB1_4 -; SSE2-NEXT: # %bb.3: -; SSE2-NEXT: movl %eax, %r15d -; SSE2-NEXT: .LBB1_4: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, %esi -; SSE2-NEXT: jb .LBB1_6 -; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: movl %eax, %esi -; SSE2-NEXT: .LBB1_6: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, %ebx -; SSE2-NEXT: jb .LBB1_8 -; SSE2-NEXT: # %bb.7: -; SSE2-NEXT: movl %eax, %ebx -; SSE2-NEXT: .LBB1_8: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, %ebp -; SSE2-NEXT: jb .LBB1_10 -; SSE2-NEXT: # %bb.9: -; SSE2-NEXT: movl %eax, %ebp -; SSE2-NEXT: .LBB1_10: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, %edi -; SSE2-NEXT: jb .LBB1_12 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: .LBB1_12: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, %r13d -; SSE2-NEXT: jb .LBB1_14 -; SSE2-NEXT: # %bb.13: -; SSE2-NEXT: movl %eax, %r13d -; SSE2-NEXT: .LBB1_14: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: jb .LBB1_16 -; SSE2-NEXT: # %bb.15: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB1_16: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, %r12d -; SSE2-NEXT: jb .LBB1_18 -; SSE2-NEXT: # %bb.17: -; SSE2-NEXT: movl %eax, %r12d -; SSE2-NEXT: .LBB1_18: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, %r14d -; SSE2-NEXT: jb .LBB1_20 -; SSE2-NEXT: # %bb.19: -; SSE2-NEXT: movl %eax, %r14d -; SSE2-NEXT: .LBB1_20: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, %r8d -; SSE2-NEXT: jb .LBB1_22 -; SSE2-NEXT: # %bb.21: -; SSE2-NEXT: movl %eax, %r8d -; SSE2-NEXT: .LBB1_22: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, %r9d -; SSE2-NEXT: jb .LBB1_24 -; SSE2-NEXT: # %bb.23: -; SSE2-NEXT: movl %eax, %r9d -; SSE2-NEXT: .LBB1_24: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, %r10d -; SSE2-NEXT: jb .LBB1_26 -; SSE2-NEXT: # %bb.25: -; SSE2-NEXT: movl %eax, %r10d -; SSE2-NEXT: .LBB1_26: -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, %r11d -; SSE2-NEXT: jb .LBB1_28 -; SSE2-NEXT: # %bb.27: -; SSE2-NEXT: movl %eax, %r11d -; SSE2-NEXT: .LBB1_28: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: jb .LBB1_30 -; SSE2-NEXT: # %bb.29: -; SSE2-NEXT: # kill: def $cl killed $cl def $ecx -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: .LBB1_30: -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: jb .LBB1_32 -; SSE2-NEXT: # %bb.31: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB1_32: -; SSE2-NEXT: movl %ecx, (%rsp) # 4-byte Spill -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; 
SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: jb .LBB1_34 -; SSE2-NEXT: # %bb.33: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB1_34: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: jb .LBB1_36 -; SSE2-NEXT: # %bb.35: -; SSE2-NEXT: # kill: def $al killed $al def $eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: .LBB1_36: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: jb .LBB1_38 -; SSE2-NEXT: # %bb.37: -; SSE2-NEXT: # kill: def $al killed $al def $eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: .LBB1_38: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: jb .LBB1_40 -; SSE2-NEXT: # %bb.39: -; SSE2-NEXT: # kill: def $al killed $al def $eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: .LBB1_40: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: jb .LBB1_42 -; SSE2-NEXT: # %bb.41: -; SSE2-NEXT: # kill: def $al killed $al def $eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: .LBB1_42: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: jb .LBB1_44 -; SSE2-NEXT: # %bb.43: -; SSE2-NEXT: # kill: def $al killed $al def $eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: .LBB1_44: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: jb .LBB1_46 -; SSE2-NEXT: # %bb.45: -; SSE2-NEXT: # kill: def $al killed $al def $eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: .LBB1_46: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: jb .LBB1_48 -; SSE2-NEXT: # %bb.47: -; SSE2-NEXT: # kill: def $al killed $al def $eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: .LBB1_48: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: jb .LBB1_50 -; SSE2-NEXT: # %bb.49: -; SSE2-NEXT: # kill: def $al killed $al def $eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: .LBB1_50: -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: jb .LBB1_52 -; SSE2-NEXT: # %bb.51: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB1_52: -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %dl, %eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %r15b, %ecx -; SSE2-NEXT: movzbl %sil, %edx -; SSE2-NEXT: movzbl %bl, %esi -; SSE2-NEXT: movzbl %bpl, %ebx -; SSE2-NEXT: movzbl %dil, %ebp -; SSE2-NEXT: movzbl %r13b, %edi -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), 
%r15d # 1-byte Folded Reload -; SSE2-NEXT: movzbl %r12b, %r12d -; SSE2-NEXT: movzbl %r14b, %r13d -; SSE2-NEXT: movzbl %r8b, %r8d -; SSE2-NEXT: movzbl %r9b, %r9d -; SSE2-NEXT: movzbl %r10b, %r10d -; SSE2-NEXT: movzbl %r11b, %r11d -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload -; SSE2-NEXT: movzbl (%rsp), %eax # 1-byte Folded Reload -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: jb .LBB1_54 -; SSE2-NEXT: # %bb.53: -; SSE2-NEXT: # kill: def $al killed $al def $eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: .LBB1_54: -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload -; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movd %ecx, %xmm8 -; SSE2-NEXT: movd %edx, %xmm3 -; SSE2-NEXT: movd %esi, %xmm11 -; SSE2-NEXT: movd %ebx, %xmm5 -; SSE2-NEXT: movd %ebp, %xmm9 -; SSE2-NEXT: movd %edi, %xmm7 -; SSE2-NEXT: movd %r15d, %xmm1 -; SSE2-NEXT: movd %r12d, %xmm12 -; SSE2-NEXT: movd %r13d, %xmm10 -; SSE2-NEXT: movd %r8d, %xmm13 -; SSE2-NEXT: movd %r9d, %xmm4 -; SSE2-NEXT: movd %r10d, %xmm14 -; SSE2-NEXT: movd %r11d, %xmm6 -; SSE2-NEXT: movd %r14d, %xmm15 -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl $0, %eax -; SSE2-NEXT: jb .LBB1_56 -; SSE2-NEXT: # %bb.55: -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: .LBB1_56: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3],xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3],xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSE2-NEXT: movd %r8d, %xmm7 -; SSE2-NEXT: movd %edx, %xmm12 -; SSE2-NEXT: movd %esi, %xmm13 -; SSE2-NEXT: movd %ebx, %xmm5 -; SSE2-NEXT: movd %ebp, %xmm14 -; SSE2-NEXT: movd %edi, %xmm2 -; SSE2-NEXT: movd %r10d, %xmm15 -; SSE2-NEXT: movd %r9d, %xmm3 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movzbl %al, %eax -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %edi -; SSE2-NEXT: jb .LBB1_58 -; SSE2-NEXT: # %bb.57: -; SSE2-NEXT: movl %ebx, %edi -; SSE2-NEXT: .LBB1_58: -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; SSE2-NEXT: movd %ecx, %xmm8 -; SSE2-NEXT: movd %edx, %xmm7 -; SSE2-NEXT: movd %esi, %xmm9 -; SSE2-NEXT: movd %eax, %xmm6 -; SSE2-NEXT: movzbl %dil, %eax -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: jb .LBB1_60 -; SSE2-NEXT: # %bb.59: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB1_60: -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: jb .LBB1_62 -; SSE2-NEXT: # %bb.61: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB1_62: -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; SSE2-NEXT: 
punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movzbl %cl, %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: jb .LBB1_64 -; SSE2-NEXT: # %bb.63: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB1_64: -; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE2-NEXT: addq $8, %rsp -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v32i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: pushq %rax -; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %edx -; SSSE3-NEXT: jb .LBB1_2 -; SSSE3-NEXT: # %bb.1: -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB1_2: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %r15d -; SSSE3-NEXT: jb .LBB1_4 -; SSSE3-NEXT: # %bb.3: -; SSSE3-NEXT: movl %eax, %r15d -; SSSE3-NEXT: .LBB1_4: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %esi -; SSSE3-NEXT: jb .LBB1_6 -; SSSE3-NEXT: # %bb.5: -; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: .LBB1_6: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %ebx -; SSSE3-NEXT: jb .LBB1_8 -; SSSE3-NEXT: # %bb.7: -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB1_8: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %ebp -; SSSE3-NEXT: jb .LBB1_10 -; SSSE3-NEXT: # %bb.9: -; SSSE3-NEXT: movl %eax, %ebp -; SSSE3-NEXT: .LBB1_10: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %edi -; SSSE3-NEXT: jb .LBB1_12 -; SSSE3-NEXT: # %bb.11: -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB1_12: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %r13d -; SSSE3-NEXT: jb .LBB1_14 -; SSSE3-NEXT: # %bb.13: -; SSSE3-NEXT: movl %eax, %r13d -; SSSE3-NEXT: .LBB1_14: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB1_16 -; SSSE3-NEXT: # %bb.15: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB1_16: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %r12d -; SSSE3-NEXT: jb .LBB1_18 -; SSSE3-NEXT: # %bb.17: -; SSSE3-NEXT: movl %eax, %r12d -; SSSE3-NEXT: 
.LBB1_18: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %r14d -; SSSE3-NEXT: jb .LBB1_20 -; SSSE3-NEXT: # %bb.19: -; SSSE3-NEXT: movl %eax, %r14d -; SSSE3-NEXT: .LBB1_20: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %r8d -; SSSE3-NEXT: jb .LBB1_22 -; SSSE3-NEXT: # %bb.21: -; SSSE3-NEXT: movl %eax, %r8d -; SSSE3-NEXT: .LBB1_22: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %r9d -; SSSE3-NEXT: jb .LBB1_24 -; SSSE3-NEXT: # %bb.23: -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB1_24: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %r10d -; SSSE3-NEXT: jb .LBB1_26 -; SSSE3-NEXT: # %bb.25: -; SSSE3-NEXT: movl %eax, %r10d -; SSSE3-NEXT: .LBB1_26: -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %r11d -; SSSE3-NEXT: jb .LBB1_28 -; SSSE3-NEXT: # %bb.27: -; SSSE3-NEXT: movl %eax, %r11d -; SSSE3-NEXT: .LBB1_28: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB1_30 -; SSSE3-NEXT: # %bb.29: -; SSSE3-NEXT: # kill: def $cl killed $cl def $ecx -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB1_30: -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB1_32 -; SSSE3-NEXT: # %bb.31: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB1_32: -; SSSE3-NEXT: movl %ecx, (%rsp) # 4-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB1_34 -; SSSE3-NEXT: # %bb.33: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB1_34: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB1_36 -; SSSE3-NEXT: # %bb.35: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB1_36: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB1_38 -; SSSE3-NEXT: # %bb.37: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB1_38: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB1_40 -; SSSE3-NEXT: # %bb.39: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB1_40: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB1_42 -; SSSE3-NEXT: # %bb.41: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB1_42: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl 
$0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB1_44 -; SSSE3-NEXT: # %bb.43: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB1_44: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB1_46 -; SSSE3-NEXT: # %bb.45: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB1_46: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB1_48 -; SSSE3-NEXT: # %bb.47: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB1_48: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB1_50 -; SSSE3-NEXT: # %bb.49: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB1_50: -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB1_52 -; SSSE3-NEXT: # %bb.51: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB1_52: -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %dl, %eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %r15b, %ecx -; SSSE3-NEXT: movzbl %sil, %edx -; SSSE3-NEXT: movzbl %bl, %esi -; SSSE3-NEXT: movzbl %bpl, %ebx -; SSSE3-NEXT: movzbl %dil, %ebp -; SSSE3-NEXT: movzbl %r13b, %edi -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl %r12b, %r12d -; SSSE3-NEXT: movzbl %r14b, %r13d -; SSSE3-NEXT: movzbl %r8b, %r8d -; SSSE3-NEXT: movzbl %r9b, %r9d -; SSSE3-NEXT: movzbl %r10b, %r10d -; SSSE3-NEXT: movzbl %r11b, %r11d -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl (%rsp), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB1_54 -; SSSE3-NEXT: # %bb.53: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB1_54: -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd %ecx, %xmm8 -; SSSE3-NEXT: movd %edx, %xmm3 -; SSSE3-NEXT: movd %esi, %xmm11 -; SSSE3-NEXT: movd %ebx, %xmm5 -; SSSE3-NEXT: movd %ebp, %xmm9 -; SSSE3-NEXT: movd %edi, %xmm7 -; SSSE3-NEXT: movd %r15d, %xmm1 -; SSSE3-NEXT: movd %r12d, %xmm12 -; SSSE3-NEXT: movd %r13d, %xmm10 -; SSSE3-NEXT: movd %r8d, %xmm13 -; SSSE3-NEXT: movd %r9d, %xmm4 -; SSSE3-NEXT: movd %r10d, %xmm14 -; SSSE3-NEXT: movd %r11d, %xmm6 -; SSSE3-NEXT: movd %r14d, %xmm15 -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte 
Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl $0, %eax -; SSSE3-NEXT: jb .LBB1_56 -; SSSE3-NEXT: # %bb.55: -; SSSE3-NEXT: movl %ecx, %eax -; SSSE3-NEXT: .LBB1_56: -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3],xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3],xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSSE3-NEXT: movd %r8d, %xmm7 -; SSSE3-NEXT: movd %edx, %xmm12 -; SSSE3-NEXT: movd %esi, %xmm13 -; SSSE3-NEXT: movd %ebx, %xmm5 -; SSSE3-NEXT: movd %ebp, %xmm14 -; SSSE3-NEXT: movd %edi, %xmm2 -; SSSE3-NEXT: movd %r10d, %xmm15 -; SSSE3-NEXT: movd %r9d, %xmm3 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSSE3-NEXT: movzbl %al, %eax -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %edi -; SSSE3-NEXT: jb .LBB1_58 -; SSSE3-NEXT: # %bb.57: -; SSSE3-NEXT: movl %ebx, %edi -; SSSE3-NEXT: .LBB1_58: -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] -; SSSE3-NEXT: punpcklbw 
{{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; SSSE3-NEXT: movd %ecx, %xmm8 -; SSSE3-NEXT: movd %edx, %xmm7 -; SSSE3-NEXT: movd %esi, %xmm9 -; SSSE3-NEXT: movd %eax, %xmm6 -; SSSE3-NEXT: movzbl %dil, %eax -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB1_60 -; SSSE3-NEXT: # %bb.59: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB1_60: -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] -; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: movzbl %cl, %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB1_62 -; SSSE3-NEXT: # %bb.61: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB1_62: -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSSE3-NEXT: movzbl %cl, %ecx -; SSSE3-NEXT: movd %ecx, %xmm4 -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB1_64 -; SSSE3-NEXT: # %bb.63: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB1_64: -; SSSE3-NEXT: movzbl %cl, %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSSE3-NEXT: addq $8, %rsp -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v32i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pextrb $1, %xmm2, %ecx -; SSE41-NEXT: pextrb $1, %xmm0, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB1_2 -; SSE41-NEXT: # %bb.1: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB1_2: 
-; SSE41-NEXT: pextrb $0, %xmm2, %edx -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: subb %dl, %al -; SSE41-NEXT: movl $0, %esi -; SSE41-NEXT: jb .LBB1_4 -; SSE41-NEXT: # %bb.3: -; SSE41-NEXT: movl %eax, %esi -; SSE41-NEXT: .LBB1_4: -; SSE41-NEXT: pextrb $2, %xmm2, %edx -; SSE41-NEXT: pextrb $2, %xmm0, %eax -; SSE41-NEXT: subb %dl, %al -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB1_6 -; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: .LBB1_6: -; SSE41-NEXT: pushq %rbp -; SSE41-NEXT: pushq %r15 -; SSE41-NEXT: pushq %r14 -; SSE41-NEXT: pushq %r13 -; SSE41-NEXT: pushq %r12 -; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $3, %xmm2, %edx -; SSE41-NEXT: pextrb $3, %xmm0, %eax -; SSE41-NEXT: subb %dl, %al -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB1_8 -; SSE41-NEXT: # %bb.7: -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: .LBB1_8: -; SSE41-NEXT: pextrb $4, %xmm2, %eax -; SSE41-NEXT: pextrb $4, %xmm0, %edi -; SSE41-NEXT: subb %al, %dil -; SSE41-NEXT: movl $0, %eax -; SSE41-NEXT: jb .LBB1_10 -; SSE41-NEXT: # %bb.9: -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: .LBB1_10: -; SSE41-NEXT: pextrb $5, %xmm2, %ebp -; SSE41-NEXT: pextrb $5, %xmm0, %edi -; SSE41-NEXT: subb %bpl, %dil -; SSE41-NEXT: movl $0, %ebx -; SSE41-NEXT: jb .LBB1_12 -; SSE41-NEXT: # %bb.11: -; SSE41-NEXT: movl %edi, %ebx -; SSE41-NEXT: .LBB1_12: -; SSE41-NEXT: pextrb $6, %xmm2, %ebp -; SSE41-NEXT: pextrb $6, %xmm0, %edi -; SSE41-NEXT: subb %bpl, %dil -; SSE41-NEXT: movl $0, %ebp -; SSE41-NEXT: jb .LBB1_14 -; SSE41-NEXT: # %bb.13: -; SSE41-NEXT: movl %edi, %ebp -; SSE41-NEXT: .LBB1_14: -; SSE41-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $7, %xmm2, %ebx -; SSE41-NEXT: pextrb $7, %xmm0, %edi -; SSE41-NEXT: subb %bl, %dil -; SSE41-NEXT: movl $0, %ebp -; SSE41-NEXT: jb .LBB1_16 -; SSE41-NEXT: # %bb.15: -; SSE41-NEXT: movl %edi, %ebp -; SSE41-NEXT: .LBB1_16: -; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $8, %xmm2, %ebx -; SSE41-NEXT: pextrb $8, %xmm0, %edi -; SSE41-NEXT: subb %bl, %dil -; SSE41-NEXT: movl $0, %r8d -; SSE41-NEXT: jb .LBB1_18 -; SSE41-NEXT: # %bb.17: -; SSE41-NEXT: movl %edi, %r8d -; SSE41-NEXT: .LBB1_18: -; SSE41-NEXT: pextrb $9, %xmm2, %ebx -; SSE41-NEXT: pextrb $9, %xmm0, %edi -; SSE41-NEXT: subb %bl, %dil -; SSE41-NEXT: movl $0, %r14d -; SSE41-NEXT: jb .LBB1_20 -; SSE41-NEXT: # %bb.19: -; SSE41-NEXT: movl %edi, %r14d -; SSE41-NEXT: .LBB1_20: -; SSE41-NEXT: pextrb $10, %xmm2, %ebx -; SSE41-NEXT: pextrb $10, %xmm0, %edi -; SSE41-NEXT: subb %bl, %dil -; SSE41-NEXT: movl $0, %ebp -; SSE41-NEXT: jb .LBB1_22 -; SSE41-NEXT: # %bb.21: -; SSE41-NEXT: movl %edi, %ebp -; SSE41-NEXT: .LBB1_22: -; SSE41-NEXT: pextrb $11, %xmm2, %ebx -; SSE41-NEXT: pextrb $11, %xmm0, %edi -; SSE41-NEXT: subb %bl, %dil -; SSE41-NEXT: movl $0, %r11d -; SSE41-NEXT: jb .LBB1_24 -; SSE41-NEXT: # %bb.23: -; SSE41-NEXT: movl %edi, %r11d -; SSE41-NEXT: .LBB1_24: -; SSE41-NEXT: pextrb $12, %xmm2, %ebx -; SSE41-NEXT: pextrb $12, %xmm0, %edi -; SSE41-NEXT: subb %bl, %dil -; SSE41-NEXT: movl $0, %r9d -; SSE41-NEXT: jb .LBB1_26 -; SSE41-NEXT: # %bb.25: -; SSE41-NEXT: movl %edi, %r9d -; SSE41-NEXT: .LBB1_26: -; SSE41-NEXT: pextrb $13, %xmm2, %ebx -; SSE41-NEXT: pextrb $13, %xmm0, %edi -; SSE41-NEXT: subb %bl, %dil -; SSE41-NEXT: movl $0, %r13d -; SSE41-NEXT: jb .LBB1_28 -; SSE41-NEXT: # %bb.27: -; SSE41-NEXT: movl %edi, 
%r13d -; SSE41-NEXT: .LBB1_28: -; SSE41-NEXT: pextrb $14, %xmm2, %ebx -; SSE41-NEXT: pextrb $14, %xmm0, %edi -; SSE41-NEXT: subb %bl, %dil -; SSE41-NEXT: movl $0, %r12d -; SSE41-NEXT: jb .LBB1_30 -; SSE41-NEXT: # %bb.29: -; SSE41-NEXT: movl %edi, %r12d -; SSE41-NEXT: .LBB1_30: -; SSE41-NEXT: movzbl %sil, %esi -; SSE41-NEXT: pextrb $15, %xmm2, %ebx -; SSE41-NEXT: pextrb $15, %xmm0, %edi -; SSE41-NEXT: subb %bl, %dil -; SSE41-NEXT: movl $0, %r15d -; SSE41-NEXT: jb .LBB1_32 -; SSE41-NEXT: # %bb.31: -; SSE41-NEXT: movl %edi, %r15d -; SSE41-NEXT: .LBB1_32: -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: movd %esi, %xmm0 -; SSE41-NEXT: pextrb $1, %xmm3, %edi -; SSE41-NEXT: pextrb $1, %xmm1, %esi -; SSE41-NEXT: subb %dil, %sil -; SSE41-NEXT: movl $0, %r10d -; SSE41-NEXT: jb .LBB1_34 -; SSE41-NEXT: # %bb.33: -; SSE41-NEXT: movl %esi, %r10d -; SSE41-NEXT: .LBB1_34: -; SSE41-NEXT: pinsrb $1, %ecx, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE41-NEXT: movzbl %r10b, %ecx -; SSE41-NEXT: pextrb $0, %xmm3, %ebx -; SSE41-NEXT: pextrb $0, %xmm1, %edi -; SSE41-NEXT: subb %bl, %dil -; SSE41-NEXT: movl $0, %ebx -; SSE41-NEXT: jb .LBB1_36 -; SSE41-NEXT: # %bb.35: -; SSE41-NEXT: movl %edi, %ebx -; SSE41-NEXT: .LBB1_36: -; SSE41-NEXT: pinsrb $2, %esi, %xmm0 -; SSE41-NEXT: movzbl %dl, %edx -; SSE41-NEXT: movzbl %bl, %esi -; SSE41-NEXT: movd %esi, %xmm2 -; SSE41-NEXT: pinsrb $1, %ecx, %xmm2 -; SSE41-NEXT: pextrb $2, %xmm3, %ecx -; SSE41-NEXT: pextrb $2, %xmm1, %esi -; SSE41-NEXT: subb %cl, %sil -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB1_38 -; SSE41-NEXT: # %bb.37: -; SSE41-NEXT: movl %esi, %ecx -; SSE41-NEXT: .LBB1_38: -; SSE41-NEXT: pinsrb $3, %edx, %xmm0 -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $2, %ecx, %xmm2 -; SSE41-NEXT: pextrb $3, %xmm3, %ecx -; SSE41-NEXT: pextrb $3, %xmm1, %edx -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB1_40 -; SSE41-NEXT: # %bb.39: -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB1_40: -; SSE41-NEXT: pinsrb $4, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $3, %ecx, %xmm2 -; SSE41-NEXT: pextrb $4, %xmm3, %ecx -; SSE41-NEXT: pextrb $4, %xmm1, %edx -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB1_42 -; SSE41-NEXT: # %bb.41: -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB1_42: -; SSE41-NEXT: pinsrb $5, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $4, %ecx, %xmm2 -; SSE41-NEXT: pextrb $5, %xmm3, %ecx -; SSE41-NEXT: pextrb $5, %xmm1, %edx -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB1_44 -; SSE41-NEXT: # %bb.43: -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB1_44: -; SSE41-NEXT: pinsrb $6, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $5, %ecx, %xmm2 -; SSE41-NEXT: pextrb $6, %xmm3, %ecx -; SSE41-NEXT: pextrb $6, %xmm1, %edx -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB1_46 -; SSE41-NEXT: # %bb.45: -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB1_46: -; SSE41-NEXT: pinsrb $7, %eax, %xmm0 -; SSE41-NEXT: movzbl %r8b, %eax -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $6, %ecx, %xmm2 -; SSE41-NEXT: pextrb $7, %xmm3, %ecx -; SSE41-NEXT: pextrb $7, %xmm1, %edx -; 
SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB1_48 -; SSE41-NEXT: # %bb.47: -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB1_48: -; SSE41-NEXT: pinsrb $8, %eax, %xmm0 -; SSE41-NEXT: movzbl %r14b, %eax -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $7, %ecx, %xmm2 -; SSE41-NEXT: pextrb $8, %xmm3, %ecx -; SSE41-NEXT: pextrb $8, %xmm1, %edx -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB1_50 -; SSE41-NEXT: # %bb.49: -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB1_50: -; SSE41-NEXT: pinsrb $9, %eax, %xmm0 -; SSE41-NEXT: movzbl %bpl, %eax -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $8, %ecx, %xmm2 -; SSE41-NEXT: pextrb $9, %xmm3, %ecx -; SSE41-NEXT: pextrb $9, %xmm1, %edx -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB1_52 -; SSE41-NEXT: # %bb.51: -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB1_52: -; SSE41-NEXT: pinsrb $10, %eax, %xmm0 -; SSE41-NEXT: movzbl %r11b, %eax -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $9, %ecx, %xmm2 -; SSE41-NEXT: pextrb $10, %xmm3, %ecx -; SSE41-NEXT: pextrb $10, %xmm1, %edx -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB1_54 -; SSE41-NEXT: # %bb.53: -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB1_54: -; SSE41-NEXT: pinsrb $11, %eax, %xmm0 -; SSE41-NEXT: movzbl %r9b, %eax -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $10, %ecx, %xmm2 -; SSE41-NEXT: pextrb $11, %xmm3, %ecx -; SSE41-NEXT: pextrb $11, %xmm1, %edx -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB1_56 -; SSE41-NEXT: # %bb.55: -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB1_56: -; SSE41-NEXT: pinsrb $12, %eax, %xmm0 -; SSE41-NEXT: movzbl %r13b, %eax -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $11, %ecx, %xmm2 -; SSE41-NEXT: pextrb $12, %xmm3, %ecx -; SSE41-NEXT: pextrb $12, %xmm1, %edx -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB1_58 -; SSE41-NEXT: # %bb.57: -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB1_58: -; SSE41-NEXT: pinsrb $13, %eax, %xmm0 -; SSE41-NEXT: movzbl %r12b, %eax -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $12, %ecx, %xmm2 -; SSE41-NEXT: pextrb $13, %xmm3, %ecx -; SSE41-NEXT: pextrb $13, %xmm1, %edx -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB1_60 -; SSE41-NEXT: # %bb.59: -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB1_60: -; SSE41-NEXT: pinsrb $14, %eax, %xmm0 -; SSE41-NEXT: movzbl %r15b, %eax -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $13, %ecx, %xmm2 -; SSE41-NEXT: pextrb $14, %xmm3, %edx -; SSE41-NEXT: pextrb $14, %xmm1, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: popq %rbx -; SSE41-NEXT: popq %r12 -; SSE41-NEXT: popq %r13 -; SSE41-NEXT: popq %r14 -; SSE41-NEXT: popq %r15 -; SSE41-NEXT: popq %rbp -; SSE41-NEXT: jb .LBB1_62 -; SSE41-NEXT: # %bb.61: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB1_62: -; SSE41-NEXT: pinsrb $15, %eax, %xmm0 -; SSE41-NEXT: movzbl %dl, %eax -; SSE41-NEXT: pinsrb $14, %eax, %xmm2 -; SSE41-NEXT: pextrb $15, %xmm3, %ecx -; SSE41-NEXT: pextrb $15, %xmm1, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB1_64 -; SSE41-NEXT: # %bb.63: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB1_64: -; SSE41-NEXT: movzbl %cl, %eax -; SSE41-NEXT: pinsrb $15, %eax, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: retq +; SSE-LABEL: v32i8: +; SSE: # %bb.0: +; SSE-NEXT: 
psubusb %xmm2, %xmm0 +; SSE-NEXT: psubusb %xmm3, %xmm1 +; SSE-NEXT: retq ; ; AVX1-LABEL: v32i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpextrb $1, %xmm2, %ecx ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpextrb $1, %xmm3, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB1_2 -; AVX1-NEXT: # %bb.1: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB1_2: -; AVX1-NEXT: vpextrb $0, %xmm2, %edx -; AVX1-NEXT: vpextrb $0, %xmm3, %eax -; AVX1-NEXT: subb %dl, %al -; AVX1-NEXT: movl $0, %esi -; AVX1-NEXT: jb .LBB1_4 -; AVX1-NEXT: # %bb.3: -; AVX1-NEXT: movl %eax, %esi -; AVX1-NEXT: .LBB1_4: -; AVX1-NEXT: vpextrb $2, %xmm2, %edx -; AVX1-NEXT: vpextrb $2, %xmm3, %eax -; AVX1-NEXT: subb %dl, %al -; AVX1-NEXT: movl $0, %r8d -; AVX1-NEXT: jb .LBB1_6 -; AVX1-NEXT: # %bb.5: -; AVX1-NEXT: movl %eax, %r8d -; AVX1-NEXT: .LBB1_6: -; AVX1-NEXT: vpextrb $3, %xmm2, %edx -; AVX1-NEXT: vpextrb $3, %xmm3, %eax -; AVX1-NEXT: subb %dl, %al -; AVX1-NEXT: movl $0, %edx -; AVX1-NEXT: jb .LBB1_8 -; AVX1-NEXT: # %bb.7: -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: .LBB1_8: -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: vpextrb $4, %xmm3, %edi -; AVX1-NEXT: subb %al, %dil -; AVX1-NEXT: movl $0, %eax -; AVX1-NEXT: jb .LBB1_10 -; AVX1-NEXT: # %bb.9: -; AVX1-NEXT: movl %edi, %eax -; AVX1-NEXT: .LBB1_10: -; AVX1-NEXT: pushq %rbp -; AVX1-NEXT: pushq %r15 -; AVX1-NEXT: pushq %r14 -; AVX1-NEXT: pushq %r13 -; AVX1-NEXT: pushq %r12 -; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: vpextrb $5, %xmm2, %ebp -; AVX1-NEXT: vpextrb $5, %xmm3, %edi -; AVX1-NEXT: subb %bpl, %dil -; AVX1-NEXT: movl $0, %r10d -; AVX1-NEXT: jb .LBB1_12 -; AVX1-NEXT: # %bb.11: -; AVX1-NEXT: movl %edi, %r10d -; AVX1-NEXT: .LBB1_12: -; AVX1-NEXT: vpextrb $6, %xmm2, %ebp -; AVX1-NEXT: vpextrb $6, %xmm3, %edi -; AVX1-NEXT: subb %bpl, %dil -; AVX1-NEXT: movl $0, %ebp -; AVX1-NEXT: jb .LBB1_14 -; AVX1-NEXT: # %bb.13: -; AVX1-NEXT: movl %edi, %ebp -; AVX1-NEXT: .LBB1_14: -; AVX1-NEXT: vpextrb $7, %xmm2, %ebx -; AVX1-NEXT: vpextrb $7, %xmm3, %edi -; AVX1-NEXT: subb %bl, %dil -; AVX1-NEXT: movl $0, %r11d -; AVX1-NEXT: jb .LBB1_16 -; AVX1-NEXT: # %bb.15: -; AVX1-NEXT: movl %edi, %r11d -; AVX1-NEXT: .LBB1_16: -; AVX1-NEXT: vpextrb $8, %xmm2, %ebx -; AVX1-NEXT: vpextrb $8, %xmm3, %edi -; AVX1-NEXT: subb %bl, %dil -; AVX1-NEXT: movl $0, %r9d -; AVX1-NEXT: jb .LBB1_18 -; AVX1-NEXT: # %bb.17: -; AVX1-NEXT: movl %edi, %r9d -; AVX1-NEXT: .LBB1_18: -; AVX1-NEXT: vpextrb $9, %xmm2, %ebx -; AVX1-NEXT: vpextrb $9, %xmm3, %edi -; AVX1-NEXT: subb %bl, %dil -; AVX1-NEXT: movl $0, %r13d -; AVX1-NEXT: jb .LBB1_20 -; AVX1-NEXT: # %bb.19: -; AVX1-NEXT: movl %edi, %r13d -; AVX1-NEXT: .LBB1_20: -; AVX1-NEXT: vpextrb $10, %xmm2, %ebx -; AVX1-NEXT: vpextrb $10, %xmm3, %edi -; AVX1-NEXT: subb %bl, %dil -; AVX1-NEXT: movl $0, %ebx -; AVX1-NEXT: jb .LBB1_22 -; AVX1-NEXT: # %bb.21: -; AVX1-NEXT: movl %edi, %ebx -; AVX1-NEXT: .LBB1_22: -; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $11, %xmm2, %ebx -; AVX1-NEXT: vpextrb $11, %xmm3, %edi -; AVX1-NEXT: subb %bl, %dil -; AVX1-NEXT: movl $0, %ebx -; AVX1-NEXT: jb .LBB1_24 -; AVX1-NEXT: # %bb.23: -; AVX1-NEXT: movl %edi, %ebx -; AVX1-NEXT: .LBB1_24: -; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $12, %xmm2, %ebx -; AVX1-NEXT: vpextrb $12, %xmm3, %edi -; AVX1-NEXT: subb %bl, %dil -; AVX1-NEXT: movl $0, %ebx -; AVX1-NEXT: jb .LBB1_26 -; AVX1-NEXT: # %bb.25: -; AVX1-NEXT: movl %edi, %ebx -; AVX1-NEXT: 
.LBB1_26:
-; AVX1-NEXT: <deleted scalarized expansion elided: the per-byte vpextrb/subb/jb/vpinsrb carry-select sequence for the remaining lanes of both halves, ending with the popq restores of the six callee-saved registers>
+; AVX1-NEXT:    vpsubusb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT: <deleted scalarized expansion elided: vextracti128 of both operands, then the same per-byte vpextrb/subb/jb/vpinsrb carry-select sequence over all 32 lanes, with callee-saved register spills and restores>
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: v32i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT: <deleted scalarized expansion elided: identical per-byte sequence to the AVX2 block>
-; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
   %z = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %x, <32 x i8> %y)
   ret <32 x i8> %z
 }
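The updated checks above show the point of the change: with AVX2 and AVX512 the whole 32-byte saturating subtract now selects to a single vpsubusb, and on AVX1, where 256-bit integer ops are unavailable, it is split into two 128-bit vpsubusb ops plus a vinsertf128 instead of being scalarized lane by lane. A minimal standalone sketch (not part of this patch; the function name is illustrative) of the same intrinsic at the natively legal SSE2 width:

; With this patch in place, llc -mtriple=x86_64-- -mattr=+sse2 is expected
; to select a single psubusb for the call below.
define <16 x i8> @usubsat_sketch(<16 x i8> %x, <16 x i8> %y) nounwind {
  %z = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
  ret <16 x i8> %z
}
declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8>, <16 x i8>)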

 define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
-; SSE2-LABEL: v64i8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT: <deleted scalarized expansion elided: pushes all six callee-saved GPRs, reserves 648 bytes of stack, spills the eight argument vectors, runs a movb/subb/jb select with movzbl zero-extension for each of the 64 byte lanes, then reassembles the results through a long movd/punpcklbw chain>
-; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSE2-NEXT:
movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE2-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE2-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE2-NEXT: movd %r10d, %xmm3 -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd %r11d, %xmm10 -; SSE2-NEXT: movd %r14d, %xmm3 -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd %eax, %xmm9 -; SSE2-NEXT: movd %r15d, %xmm3 -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd %r12d, %xmm11 -; SSE2-NEXT: movd %r13d, %xmm3 -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd %esi, %xmm5 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSE2-NEXT: movzbl %bl, %esi -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %edi -; SSE2-NEXT: jb .LBB2_122 -; SSE2-NEXT: # %bb.121: -; SSE2-NEXT: movl %ebx, %edi -; SSE2-NEXT: .LBB2_122: -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] -; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE2-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] -; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE2-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3] -; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 
16-byte Folded Reload -; SSE2-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3] -; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE2-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] -; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE2-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] -; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE2-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE2-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3],xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE2-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3],xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE2-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE2-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3],xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd %ecx, %xmm3 -; SSE2-NEXT: movd %edx, %xmm4 -; SSE2-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd %esi, %xmm4 -; SSE2-NEXT: movzbl %dil, %eax -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: jb .LBB2_124 -; SSE2-NEXT: # %bb.123: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB2_124: -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE2-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1] -; SSE2-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE2-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE2-NEXT: movd %eax, %xmm8 -; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm6 -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb 
-{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: jb .LBB2_126 -; SSE2-NEXT: # %bb.125: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB2_126: -; SSE2-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: # xmm0 = xmm0[0],mem[0] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm14[0] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0] -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE2-NEXT: movzbl %cl, %ecx -; SSE2-NEXT: movd %ecx, %xmm7 -; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: jb .LBB2_128 -; SSE2-NEXT: # %bb.127: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB2_128: -; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] -; SSE2-NEXT: addq $648, %rsp # imm = 0x288 -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v64i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: subq $648, %rsp # imm = 0x288 -; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm5, (%rsp) -; SSSE3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %esi -; SSSE3-NEXT: jb .LBB2_2 -; SSSE3-NEXT: # %bb.1: -; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: .LBB2_2: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %ebx -; SSSE3-NEXT: jb .LBB2_4 -; SSSE3-NEXT: # %bb.3: -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB2_4: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %ebp -; SSSE3-NEXT: jb .LBB2_6 -; SSSE3-NEXT: # %bb.5: -; SSSE3-NEXT: movl %eax, %ebp -; SSSE3-NEXT: .LBB2_6: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %edi -; SSSE3-NEXT: jb .LBB2_8 -; SSSE3-NEXT: # %bb.7: -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB2_8: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %r9d -; SSSE3-NEXT: jb .LBB2_10 -; SSSE3-NEXT: # %bb.9: -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB2_10: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %edx -; SSSE3-NEXT: jb .LBB2_12 -; SSSE3-NEXT: # 
%bb.11: -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB2_12: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB2_14 -; SSSE3-NEXT: # %bb.13: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB2_14: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %r10d -; SSSE3-NEXT: jb .LBB2_16 -; SSSE3-NEXT: # %bb.15: -; SSSE3-NEXT: movl %eax, %r10d -; SSSE3-NEXT: .LBB2_16: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %r11d -; SSSE3-NEXT: jb .LBB2_18 -; SSSE3-NEXT: # %bb.17: -; SSSE3-NEXT: movl %eax, %r11d -; SSSE3-NEXT: .LBB2_18: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %r14d -; SSSE3-NEXT: jb .LBB2_20 -; SSSE3-NEXT: # %bb.19: -; SSSE3-NEXT: movl %eax, %r14d -; SSSE3-NEXT: .LBB2_20: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %r15d -; SSSE3-NEXT: jb .LBB2_22 -; SSSE3-NEXT: # %bb.21: -; SSSE3-NEXT: movl %eax, %r15d -; SSSE3-NEXT: .LBB2_22: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %r12d -; SSSE3-NEXT: jb .LBB2_24 -; SSSE3-NEXT: # %bb.23: -; SSSE3-NEXT: movl %eax, %r12d -; SSSE3-NEXT: .LBB2_24: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %r13d -; SSSE3-NEXT: jb .LBB2_26 -; SSSE3-NEXT: # %bb.25: -; SSSE3-NEXT: movl %eax, %r13d -; SSSE3-NEXT: .LBB2_26: -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %r8d -; SSSE3-NEXT: jb .LBB2_28 -; SSSE3-NEXT: # %bb.27: -; SSSE3-NEXT: movl %eax, %r8d -; SSSE3-NEXT: .LBB2_28: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_30 -; SSSE3-NEXT: # %bb.29: -; SSSE3-NEXT: # kill: def $cl killed $cl def $ecx -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_30: -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB2_32 -; SSSE3-NEXT: # %bb.31: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB2_32: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_34 -; SSSE3-NEXT: # %bb.33: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_34: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_36 -; SSSE3-NEXT: # %bb.35: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_36: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_38 -; SSSE3-NEXT: # %bb.37: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_38: -; 
SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_40 -; SSSE3-NEXT: # %bb.39: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_40: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_42 -; SSSE3-NEXT: # %bb.41: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_42: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_44 -; SSSE3-NEXT: # %bb.43: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_44: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_46 -; SSSE3-NEXT: # %bb.45: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_46: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_48 -; SSSE3-NEXT: # %bb.47: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_48: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_50 -; SSSE3-NEXT: # %bb.49: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_50: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_52 -; SSSE3-NEXT: # %bb.51: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_52: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_54 -; SSSE3-NEXT: # %bb.53: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_54: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_56 -; SSSE3-NEXT: # %bb.55: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_56: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_58 -; SSSE3-NEXT: # %bb.57: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_58: -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 
4-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB2_60 -; SSSE3-NEXT: # %bb.59: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB2_60: -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_62 -; SSSE3-NEXT: # %bb.61: -; SSSE3-NEXT: # kill: def $cl killed $cl def $ecx -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_62: -; SSSE3-NEXT: subb (%rsp), %al -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB2_64 -; SSSE3-NEXT: # %bb.63: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB2_64: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_66 -; SSSE3-NEXT: # %bb.65: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_66: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_68 -; SSSE3-NEXT: # %bb.67: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_68: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_70 -; SSSE3-NEXT: # %bb.69: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_70: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_72 -; SSSE3-NEXT: # %bb.71: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_72: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_74 -; SSSE3-NEXT: # %bb.73: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_74: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_76 -; SSSE3-NEXT: # %bb.75: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_76: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_78 -; SSSE3-NEXT: # %bb.77: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_78: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_80 -; SSSE3-NEXT: # %bb.79: -; SSSE3-NEXT: # kill: def $al 
killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_80: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_82 -; SSSE3-NEXT: # %bb.81: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_82: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_84 -; SSSE3-NEXT: # %bb.83: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_84: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_86 -; SSSE3-NEXT: # %bb.85: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_86: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_88 -; SSSE3-NEXT: # %bb.87: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_88: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_90 -; SSSE3-NEXT: # %bb.89: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_90: -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB2_92 -; SSSE3-NEXT: # %bb.91: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB2_92: -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_94 -; SSSE3-NEXT: # %bb.93: -; SSSE3-NEXT: # kill: def $cl killed $cl def $ecx -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_94: -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB2_96 -; SSSE3-NEXT: # %bb.95: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB2_96: -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB2_98 -; SSSE3-NEXT: # %bb.97: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB2_98: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_100 -; SSSE3-NEXT: # %bb.99: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_100: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_102 -; SSSE3-NEXT: # %bb.101: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_102: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_104 -; SSSE3-NEXT: # %bb.103: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_104: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_106 -; SSSE3-NEXT: # %bb.105: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_106: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_108 -; SSSE3-NEXT: # %bb.107: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_108: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_110 -; SSSE3-NEXT: # %bb.109: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_110: -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB2_112 -; SSSE3-NEXT: # %bb.111: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB2_112: -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB2_114 -; SSSE3-NEXT: # %bb.113: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB2_114: -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB2_116 -; SSSE3-NEXT: # %bb.115: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB2_116: -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %sil, %eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %bl, %esi -; SSSE3-NEXT: movzbl %bpl, %ebx -; SSSE3-NEXT: movzbl %dil, %ebp -; SSSE3-NEXT: movzbl %r9b, %edi -; SSSE3-NEXT: movzbl %dl, %ecx -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSSE3-NEXT: movzbl %r10b, %r10d -; SSSE3-NEXT: movzbl %r11b, %r11d -; SSSE3-NEXT: movzbl %r14b, %r14d -; SSSE3-NEXT: movzbl %r15b, %r15d -; SSSE3-NEXT: movzbl %r12b, %r12d -; SSSE3-NEXT: movzbl %r13b, %r13d -; SSSE3-NEXT: movzbl %r8b, %r8d -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: 
movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: 
movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSSE3-NEXT: jb .LBB2_118 -; SSSE3-NEXT: # %bb.117: -; SSSE3-NEXT: # kill: def $al killed $al def $eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: .LBB2_118: -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm10 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd %esi, %xmm4 -; SSSE3-NEXT: movd %ebx, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd %ebp, %xmm3 -; SSSE3-NEXT: movd %edi, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd %ecx, %xmm5 -; SSSE3-NEXT: movd %edx, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd %r10d, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd %r11d, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd %r14d, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd %r15d, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd %r12d, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd %r13d, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd %r8d, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd %r9d, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl $0, %ebx -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm6 = mem[0],zero,zero,zero -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm1 # 4-byte Reload -; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload -; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm15 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm9 = mem[0],zero,zero,zero -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload -; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm11 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm14 = mem[0],zero,zero,zero -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload -; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload -; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload -; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm13 = mem[0],zero,zero,zero -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload -; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload -; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload -; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload -; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload -; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload -; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm12 = mem[0],zero,zero,zero -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload -; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload -; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload -; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm7 = mem[0],zero,zero,zero -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload -; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload -; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload -; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm8 = mem[0],zero,zero,zero -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload -; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload -; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload -; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: jb .LBB2_120 -; SSSE3-NEXT: # %bb.119: -; SSSE3-NEXT: movl %ecx, %ebx -; SSSE3-NEXT: .LBB2_120: -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; SSSE3-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3],xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; SSSE3-NEXT: movdqa %xmm5, %xmm4 -; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3],xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] -; SSSE3-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: 
movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSSE3-NEXT: movdqa %xmm3, %xmm6 -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1],xmm15[2],mem[2],xmm15[3],mem[3],xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] -; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] -; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3],xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3],xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] -; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm3 = 
xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3],xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] -; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] -; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSSE3-NEXT: movd %r10d, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd %r11d, %xmm10 -; SSSE3-NEXT: movd %r14d, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd %eax, %xmm9 -; SSSE3-NEXT: movd %r15d, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd %r12d, %xmm11 -; SSSE3-NEXT: movd %r13d, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd %esi, %xmm5 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSSE3-NEXT: movzbl %bl, %esi -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %edi -; SSSE3-NEXT: jb .LBB2_122 -; SSSE3-NEXT: # %bb.121: -; SSSE3-NEXT: movl %ebx, %edi -; SSSE3-NEXT: .LBB2_122: -; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Folded Reload -; SSSE3-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] -; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] -; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3] -; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3] -; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] -; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] -; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3],xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3],xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3],xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd %ecx, %xmm3 -; SSSE3-NEXT: movd %edx, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd %esi, %xmm4 -; SSSE3-NEXT: movzbl %dil, %eax -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB2_124 -; SSSE3-NEXT: # %bb.123: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB2_124: -; SSSE3-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSSE3-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1] -; SSSE3-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] -; SSSE3-NEXT: 
punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSSE3-NEXT: movd %eax, %xmm8 -; SSSE3-NEXT: movzbl %cl, %eax -; SSSE3-NEXT: movd %eax, %xmm6 -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB2_126 -; SSSE3-NEXT: # %bb.125: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB2_126: -; SSSE3-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm0 = xmm0[0],mem[0] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm14[0] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSSE3-NEXT: movzbl %cl, %ecx -; SSSE3-NEXT: movd %ecx, %xmm7 -; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB2_128 -; SSSE3-NEXT: # %bb.127: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB2_128: -; SSSE3-NEXT: movzbl %cl, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] -; SSSE3-NEXT: addq $648, %rsp # imm = 0x288 -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v64i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pextrb $1, %xmm4, %ecx -; SSE41-NEXT: pextrb $1, %xmm0, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %edi -; SSE41-NEXT: jb .LBB2_2 -; SSE41-NEXT: # %bb.1: -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB2_2: -; SSE41-NEXT: pextrb $0, %xmm4, %ecx -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB2_4 -; SSE41-NEXT: # %bb.3: -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: .LBB2_4: -; SSE41-NEXT: pextrb $2, %xmm4, %ecx -; SSE41-NEXT: pextrb $2, %xmm0, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_6 -; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_6: -; SSE41-NEXT: pushq %rbp -; SSE41-NEXT: pushq %r15 -; SSE41-NEXT: pushq %r14 -; SSE41-NEXT: pushq %r13 -; SSE41-NEXT: pushq %r12 -; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: subq $16, %rsp -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; 
SSE41-NEXT: pextrb $3, %xmm4, %ecx -; SSE41-NEXT: pextrb $3, %xmm0, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_8 -; SSE41-NEXT: # %bb.7: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_8: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $4, %xmm4, %ecx -; SSE41-NEXT: pextrb $4, %xmm0, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_10 -; SSE41-NEXT: # %bb.9: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_10: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $5, %xmm4, %ecx -; SSE41-NEXT: pextrb $5, %xmm0, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_12 -; SSE41-NEXT: # %bb.11: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_12: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $6, %xmm4, %ecx -; SSE41-NEXT: pextrb $6, %xmm0, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_14 -; SSE41-NEXT: # %bb.13: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_14: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $7, %xmm4, %ecx -; SSE41-NEXT: pextrb $7, %xmm0, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_16 -; SSE41-NEXT: # %bb.15: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_16: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $8, %xmm4, %ecx -; SSE41-NEXT: pextrb $8, %xmm0, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_18 -; SSE41-NEXT: # %bb.17: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_18: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $9, %xmm4, %ecx -; SSE41-NEXT: pextrb $9, %xmm0, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_20 -; SSE41-NEXT: # %bb.19: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_20: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $10, %xmm4, %ecx -; SSE41-NEXT: pextrb $10, %xmm0, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_22 -; SSE41-NEXT: # %bb.21: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_22: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $11, %xmm4, %ecx -; SSE41-NEXT: pextrb $11, %xmm0, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_24 -; SSE41-NEXT: # %bb.23: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_24: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $12, %xmm4, %ecx -; SSE41-NEXT: pextrb $12, %xmm0, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_26 -; SSE41-NEXT: # %bb.25: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_26: -; SSE41-NEXT: movl %ecx, (%rsp) # 4-byte Spill -; SSE41-NEXT: pextrb $13, %xmm4, %ecx -; SSE41-NEXT: pextrb $13, %xmm0, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_28 -; SSE41-NEXT: # %bb.27: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_28: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $14, %xmm4, %ecx -; SSE41-NEXT: pextrb $14, %xmm0, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_30 -; SSE41-NEXT: # 
%bb.29: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_30: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $15, %xmm4, %ecx -; SSE41-NEXT: pextrb $15, %xmm0, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_32 -; SSE41-NEXT: # %bb.31: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_32: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $1, %xmm5, %ecx -; SSE41-NEXT: pextrb $1, %xmm1, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_34 -; SSE41-NEXT: # %bb.33: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_34: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $0, %xmm5, %ecx -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ebx -; SSE41-NEXT: jb .LBB2_36 -; SSE41-NEXT: # %bb.35: -; SSE41-NEXT: movl %eax, %ebx -; SSE41-NEXT: .LBB2_36: -; SSE41-NEXT: pextrb $2, %xmm5, %ecx -; SSE41-NEXT: pextrb $2, %xmm1, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_38 -; SSE41-NEXT: # %bb.37: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_38: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $3, %xmm5, %ecx -; SSE41-NEXT: pextrb $3, %xmm1, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_40 -; SSE41-NEXT: # %bb.39: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_40: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $4, %xmm5, %ecx -; SSE41-NEXT: pextrb $4, %xmm1, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_42 -; SSE41-NEXT: # %bb.41: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_42: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $5, %xmm5, %ecx -; SSE41-NEXT: pextrb $5, %xmm1, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_44 -; SSE41-NEXT: # %bb.43: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_44: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $6, %xmm5, %ecx -; SSE41-NEXT: pextrb $6, %xmm1, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_46 -; SSE41-NEXT: # %bb.45: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_46: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $7, %xmm5, %ecx -; SSE41-NEXT: pextrb $7, %xmm1, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_48 -; SSE41-NEXT: # %bb.47: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_48: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $8, %xmm5, %ecx -; SSE41-NEXT: pextrb $8, %xmm1, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_50 -; SSE41-NEXT: # %bb.49: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_50: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $9, %xmm5, %ecx -; SSE41-NEXT: pextrb $9, %xmm1, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_52 -; SSE41-NEXT: # %bb.51: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_52: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $10, %xmm5, %ecx -; SSE41-NEXT: pextrb $10, %xmm1, %eax -; SSE41-NEXT: subb 
%cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_54 -; SSE41-NEXT: # %bb.53: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_54: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $11, %xmm5, %ecx -; SSE41-NEXT: pextrb $11, %xmm1, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_56 -; SSE41-NEXT: # %bb.55: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_56: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $12, %xmm5, %ecx -; SSE41-NEXT: pextrb $12, %xmm1, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_58 -; SSE41-NEXT: # %bb.57: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_58: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $13, %xmm5, %ecx -; SSE41-NEXT: pextrb $13, %xmm1, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_60 -; SSE41-NEXT: # %bb.59: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_60: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $14, %xmm5, %ecx -; SSE41-NEXT: pextrb $14, %xmm1, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_62 -; SSE41-NEXT: # %bb.61: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_62: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $15, %xmm5, %ecx -; SSE41-NEXT: pextrb $15, %xmm1, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_64 -; SSE41-NEXT: # %bb.63: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_64: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $1, %xmm6, %ecx -; SSE41-NEXT: pextrb $1, %xmm2, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %r9d -; SSE41-NEXT: jb .LBB2_66 -; SSE41-NEXT: # %bb.65: -; SSE41-NEXT: movl %eax, %r9d -; SSE41-NEXT: .LBB2_66: -; SSE41-NEXT: pextrb $0, %xmm6, %ecx -; SSE41-NEXT: pextrb $0, %xmm2, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %esi -; SSE41-NEXT: jb .LBB2_68 -; SSE41-NEXT: # %bb.67: -; SSE41-NEXT: movl %eax, %esi -; SSE41-NEXT: .LBB2_68: -; SSE41-NEXT: pextrb $2, %xmm6, %ecx -; SSE41-NEXT: pextrb $2, %xmm2, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %r10d -; SSE41-NEXT: jb .LBB2_70 -; SSE41-NEXT: # %bb.69: -; SSE41-NEXT: movl %eax, %r10d -; SSE41-NEXT: .LBB2_70: -; SSE41-NEXT: pextrb $3, %xmm6, %ecx -; SSE41-NEXT: pextrb $3, %xmm2, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %r8d -; SSE41-NEXT: jb .LBB2_72 -; SSE41-NEXT: # %bb.71: -; SSE41-NEXT: movl %eax, %r8d -; SSE41-NEXT: .LBB2_72: -; SSE41-NEXT: pextrb $4, %xmm6, %ecx -; SSE41-NEXT: pextrb $4, %xmm2, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %r14d -; SSE41-NEXT: jb .LBB2_74 -; SSE41-NEXT: # %bb.73: -; SSE41-NEXT: movl %eax, %r14d -; SSE41-NEXT: .LBB2_74: -; SSE41-NEXT: pextrb $5, %xmm6, %ecx -; SSE41-NEXT: pextrb $5, %xmm2, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %r11d -; SSE41-NEXT: jb .LBB2_76 -; SSE41-NEXT: # %bb.75: -; SSE41-NEXT: movl %eax, %r11d -; SSE41-NEXT: .LBB2_76: -; SSE41-NEXT: pextrb $6, %xmm6, %ecx -; SSE41-NEXT: pextrb $6, %xmm2, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %r13d -; SSE41-NEXT: jb .LBB2_78 -; SSE41-NEXT: # %bb.77: -; SSE41-NEXT: movl %eax, %r13d -; SSE41-NEXT: .LBB2_78: -; SSE41-NEXT: pextrb $7, %xmm6, %ecx -; SSE41-NEXT: pextrb $7, %xmm2, %eax -; SSE41-NEXT: 
subb %cl, %al -; SSE41-NEXT: movl $0, %r12d -; SSE41-NEXT: jb .LBB2_80 -; SSE41-NEXT: # %bb.79: -; SSE41-NEXT: movl %eax, %r12d -; SSE41-NEXT: .LBB2_80: -; SSE41-NEXT: pextrb $8, %xmm6, %ecx -; SSE41-NEXT: pextrb $8, %xmm2, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %r15d -; SSE41-NEXT: jb .LBB2_82 -; SSE41-NEXT: # %bb.81: -; SSE41-NEXT: movl %eax, %r15d -; SSE41-NEXT: .LBB2_82: -; SSE41-NEXT: pextrb $9, %xmm6, %ecx -; SSE41-NEXT: pextrb $9, %xmm2, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_84 -; SSE41-NEXT: # %bb.83: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_84: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $10, %xmm6, %ecx -; SSE41-NEXT: pextrb $10, %xmm2, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_86 -; SSE41-NEXT: # %bb.85: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_86: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $11, %xmm6, %ecx -; SSE41-NEXT: pextrb $11, %xmm2, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_88 -; SSE41-NEXT: # %bb.87: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_88: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $12, %xmm6, %ecx -; SSE41-NEXT: pextrb $12, %xmm2, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_90 -; SSE41-NEXT: # %bb.89: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_90: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $13, %xmm6, %ecx -; SSE41-NEXT: pextrb $13, %xmm2, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_92 -; SSE41-NEXT: # %bb.91: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_92: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $14, %xmm6, %ecx -; SSE41-NEXT: pextrb $14, %xmm2, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_94 -; SSE41-NEXT: # %bb.93: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_94: -; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movzbl %dl, %eax -; SSE41-NEXT: movzbl %bl, %ebp -; SSE41-NEXT: movzbl %sil, %ebx -; SSE41-NEXT: pextrb $15, %xmm6, %edx -; SSE41-NEXT: pextrb $15, %xmm2, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB2_96 -; SSE41-NEXT: # %bb.95: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB2_96: -; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movzbl %dil, %edi -; SSE41-NEXT: movd %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: movd %ebp, %xmm1 -; SSE41-NEXT: movzbl %r9b, %esi -; SSE41-NEXT: movd %ebx, %xmm2 -; SSE41-NEXT: pextrb $1, %xmm7, %edx -; SSE41-NEXT: pextrb $1, %xmm3, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %ebx -; SSE41-NEXT: jb .LBB2_98 -; SSE41-NEXT: # %bb.97: -; SSE41-NEXT: movl %ecx, %ebx -; SSE41-NEXT: .LBB2_98: -; SSE41-NEXT: pinsrb $1, %edi, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $1, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $1, %esi, %xmm2 -; SSE41-NEXT: movzbl %r10b, %eax -; SSE41-NEXT: movzbl %bl, %esi -; SSE41-NEXT: pextrb $0, %xmm7, %edx -; SSE41-NEXT: pextrb $0, %xmm3, %ecx -; 
SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB2_100 -; SSE41-NEXT: # %bb.99: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB2_100: -; SSE41-NEXT: pinsrb $2, %edi, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $2, %ebp, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $2, %eax, %xmm2 -; SSE41-NEXT: movzbl %r8b, %edi -; SSE41-NEXT: movzbl %dl, %eax -; SSE41-NEXT: movd %eax, %xmm4 -; SSE41-NEXT: pinsrb $1, %esi, %xmm4 -; SSE41-NEXT: pextrb $2, %xmm7, %eax -; SSE41-NEXT: pextrb $2, %xmm3, %ecx -; SSE41-NEXT: subb %al, %cl -; SSE41-NEXT: movl $0, %eax -; SSE41-NEXT: jb .LBB2_102 -; SSE41-NEXT: # %bb.101: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB2_102: -; SSE41-NEXT: pinsrb $3, %ebx, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $3, %ebp, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $3, %edi, %xmm2 -; SSE41-NEXT: movzbl %r14b, %edi -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $2, %eax, %xmm4 -; SSE41-NEXT: pextrb $3, %xmm7, %eax -; SSE41-NEXT: pextrb $3, %xmm3, %ecx -; SSE41-NEXT: subb %al, %cl -; SSE41-NEXT: movl $0, %eax -; SSE41-NEXT: jb .LBB2_104 -; SSE41-NEXT: # %bb.103: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB2_104: -; SSE41-NEXT: pinsrb $4, %esi, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $4, %ebp, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $4, %edi, %xmm2 -; SSE41-NEXT: movzbl %r11b, %edi -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $3, %eax, %xmm4 -; SSE41-NEXT: pextrb $4, %xmm7, %eax -; SSE41-NEXT: pextrb $4, %xmm3, %ecx -; SSE41-NEXT: subb %al, %cl -; SSE41-NEXT: movl $0, %eax -; SSE41-NEXT: jb .LBB2_106 -; SSE41-NEXT: # %bb.105: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB2_106: -; SSE41-NEXT: pinsrb $5, %edx, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $5, %esi, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $5, %edi, %xmm2 -; SSE41-NEXT: movzbl %r13b, %esi -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $4, %eax, %xmm4 -; SSE41-NEXT: pextrb $5, %xmm7, %edi -; SSE41-NEXT: pextrb $5, %xmm3, %eax -; SSE41-NEXT: subb %dil, %al -; SSE41-NEXT: movl $0, %edi -; SSE41-NEXT: jb .LBB2_108 -; SSE41-NEXT: # %bb.107: -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB2_108: -; SSE41-NEXT: pinsrb $6, %ecx, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $6, %edx, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $6, %esi, %xmm2 -; SSE41-NEXT: movzbl %r12b, %edx -; SSE41-NEXT: movzbl %dil, %esi -; SSE41-NEXT: pinsrb $5, %esi, %xmm4 -; SSE41-NEXT: pextrb $6, %xmm7, %esi -; SSE41-NEXT: pextrb $6, %xmm3, %edi -; SSE41-NEXT: subb %sil, %dil -; SSE41-NEXT: movl $0, %esi -; SSE41-NEXT: jb .LBB2_110 -; SSE41-NEXT: # %bb.109: -; SSE41-NEXT: movl %edi, %esi -; SSE41-NEXT: .LBB2_110: -; SSE41-NEXT: pinsrb $7, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $7, %ecx, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $7, %edx, %xmm2 -; 
SSE41-NEXT: movzbl %r15b, %edx -; SSE41-NEXT: movzbl %sil, %esi -; SSE41-NEXT: pinsrb $6, %esi, %xmm4 -; SSE41-NEXT: pextrb $7, %xmm7, %esi -; SSE41-NEXT: pextrb $7, %xmm3, %edi -; SSE41-NEXT: subb %sil, %dil -; SSE41-NEXT: movl $0, %esi -; SSE41-NEXT: jb .LBB2_112 -; SSE41-NEXT: # %bb.111: -; SSE41-NEXT: movl %edi, %esi -; SSE41-NEXT: .LBB2_112: -; SSE41-NEXT: pinsrb $8, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $8, %ecx, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $8, %edx, %xmm2 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSE41-NEXT: movzbl %sil, %esi -; SSE41-NEXT: pinsrb $7, %esi, %xmm4 -; SSE41-NEXT: pextrb $8, %xmm7, %esi -; SSE41-NEXT: pextrb $8, %xmm3, %edi -; SSE41-NEXT: subb %sil, %dil -; SSE41-NEXT: movl $0, %esi -; SSE41-NEXT: jb .LBB2_114 -; SSE41-NEXT: # %bb.113: -; SSE41-NEXT: movl %edi, %esi -; SSE41-NEXT: .LBB2_114: -; SSE41-NEXT: pinsrb $9, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $9, %ecx, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $9, %edx, %xmm2 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSE41-NEXT: movzbl %sil, %esi -; SSE41-NEXT: pinsrb $8, %esi, %xmm4 -; SSE41-NEXT: pextrb $9, %xmm7, %esi -; SSE41-NEXT: pextrb $9, %xmm3, %edi -; SSE41-NEXT: subb %sil, %dil -; SSE41-NEXT: movl $0, %esi -; SSE41-NEXT: jb .LBB2_116 -; SSE41-NEXT: # %bb.115: -; SSE41-NEXT: movl %edi, %esi -; SSE41-NEXT: .LBB2_116: -; SSE41-NEXT: pinsrb $10, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $10, %ecx, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $10, %edx, %xmm2 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSE41-NEXT: movzbl %sil, %esi -; SSE41-NEXT: pinsrb $9, %esi, %xmm4 -; SSE41-NEXT: pextrb $10, %xmm7, %esi -; SSE41-NEXT: pextrb $10, %xmm3, %edi -; SSE41-NEXT: subb %sil, %dil -; SSE41-NEXT: movl $0, %esi -; SSE41-NEXT: jb .LBB2_118 -; SSE41-NEXT: # %bb.117: -; SSE41-NEXT: movl %edi, %esi -; SSE41-NEXT: .LBB2_118: -; SSE41-NEXT: pinsrb $11, %eax, %xmm0 -; SSE41-NEXT: movzbl (%rsp), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $11, %ecx, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $11, %edx, %xmm2 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSE41-NEXT: movzbl %sil, %esi -; SSE41-NEXT: pinsrb $10, %esi, %xmm4 -; SSE41-NEXT: pextrb $11, %xmm7, %esi -; SSE41-NEXT: pextrb $11, %xmm3, %edi -; SSE41-NEXT: subb %sil, %dil -; SSE41-NEXT: movl $0, %esi -; SSE41-NEXT: jb .LBB2_120 -; SSE41-NEXT: # %bb.119: -; SSE41-NEXT: movl %edi, %esi -; SSE41-NEXT: .LBB2_120: -; SSE41-NEXT: pinsrb $12, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $12, %ecx, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $12, %edx, %xmm2 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSE41-NEXT: movzbl %sil, %esi -; SSE41-NEXT: pinsrb $11, %esi, %xmm4 -; SSE41-NEXT: pextrb $12, %xmm7, %esi -; SSE41-NEXT: pextrb $12, %xmm3, %edi -; SSE41-NEXT: subb %sil, %dil -; SSE41-NEXT: movl $0, %esi -; SSE41-NEXT: jb 
.LBB2_122 -; SSE41-NEXT: # %bb.121: -; SSE41-NEXT: movl %edi, %esi -; SSE41-NEXT: .LBB2_122: -; SSE41-NEXT: pinsrb $13, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $13, %ecx, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $13, %edx, %xmm2 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSE41-NEXT: movzbl %sil, %esi -; SSE41-NEXT: pinsrb $12, %esi, %xmm4 -; SSE41-NEXT: pextrb $13, %xmm7, %esi -; SSE41-NEXT: pextrb $13, %xmm3, %edi -; SSE41-NEXT: subb %sil, %dil -; SSE41-NEXT: movl $0, %esi -; SSE41-NEXT: jb .LBB2_124 -; SSE41-NEXT: # %bb.123: -; SSE41-NEXT: movl %edi, %esi -; SSE41-NEXT: .LBB2_124: -; SSE41-NEXT: pinsrb $14, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $14, %ecx, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $14, %edx, %xmm2 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSE41-NEXT: movzbl %sil, %esi -; SSE41-NEXT: pinsrb $13, %esi, %xmm4 -; SSE41-NEXT: pextrb $14, %xmm7, %esi -; SSE41-NEXT: pextrb $14, %xmm3, %edi -; SSE41-NEXT: subb %sil, %dil -; SSE41-NEXT: movl $0, %esi -; SSE41-NEXT: leaq {{[0-9]+}}(%rsp), %rsp -; SSE41-NEXT: popq %rbx -; SSE41-NEXT: popq %r12 -; SSE41-NEXT: popq %r13 -; SSE41-NEXT: popq %r14 -; SSE41-NEXT: popq %r15 -; SSE41-NEXT: popq %rbp -; SSE41-NEXT: jb .LBB2_126 -; SSE41-NEXT: # %bb.125: -; SSE41-NEXT: movl %edi, %esi -; SSE41-NEXT: .LBB2_126: -; SSE41-NEXT: pinsrb $15, %eax, %xmm0 -; SSE41-NEXT: pinsrb $15, %ecx, %xmm1 -; SSE41-NEXT: pinsrb $15, %edx, %xmm2 -; SSE41-NEXT: movzbl %sil, %eax -; SSE41-NEXT: pinsrb $14, %eax, %xmm4 -; SSE41-NEXT: pextrb $15, %xmm7, %ecx -; SSE41-NEXT: pextrb $15, %xmm3, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: jb .LBB2_128 -; SSE41-NEXT: # %bb.127: -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: .LBB2_128: -; SSE41-NEXT: movzbl %cl, %eax -; SSE41-NEXT: pinsrb $15, %eax, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: retq +; SSE-LABEL: v64i8: +; SSE: # %bb.0: +; SSE-NEXT: psubusb %xmm4, %xmm0 +; SSE-NEXT: psubusb %xmm5, %xmm1 +; SSE-NEXT: psubusb %xmm6, %xmm2 +; SSE-NEXT: psubusb %xmm7, %xmm3 +; SSE-NEXT: retq ; ; AVX1-LABEL: v64i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vpextrb $1, %xmm4, %ecx ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpextrb $1, %xmm5, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_2 -; AVX1-NEXT: # %bb.1: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_2: -; AVX1-NEXT: pushq %rbp -; AVX1-NEXT: pushq %r15 -; AVX1-NEXT: pushq %r14 -; AVX1-NEXT: pushq %r13 -; AVX1-NEXT: pushq %r12 -; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: subq $12, %rsp -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $0, %xmm4, %ecx -; AVX1-NEXT: vpextrb $0, %xmm5, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %edi -; AVX1-NEXT: jb .LBB2_4 -; AVX1-NEXT: # %bb.3: -; AVX1-NEXT: movl %eax, %edi -; AVX1-NEXT: .LBB2_4: -; AVX1-NEXT: vpextrb $2, %xmm4, %ecx -; AVX1-NEXT: vpextrb $2, %xmm5, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_6 -; AVX1-NEXT: # %bb.5: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_6: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $3, %xmm4, %ecx -; 
AVX1-NEXT: vpextrb $3, %xmm5, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_8 -; AVX1-NEXT: # %bb.7: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_8: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $4, %xmm4, %ecx -; AVX1-NEXT: vpextrb $4, %xmm5, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_10 -; AVX1-NEXT: # %bb.9: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_10: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $5, %xmm4, %ecx -; AVX1-NEXT: vpextrb $5, %xmm5, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_12 -; AVX1-NEXT: # %bb.11: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_12: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $6, %xmm4, %ecx -; AVX1-NEXT: vpextrb $6, %xmm5, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_14 -; AVX1-NEXT: # %bb.13: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_14: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $7, %xmm4, %ecx -; AVX1-NEXT: vpextrb $7, %xmm5, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_16 -; AVX1-NEXT: # %bb.15: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_16: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $8, %xmm4, %ecx -; AVX1-NEXT: vpextrb $8, %xmm5, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_18 -; AVX1-NEXT: # %bb.17: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_18: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $9, %xmm4, %ecx -; AVX1-NEXT: vpextrb $9, %xmm5, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_20 -; AVX1-NEXT: # %bb.19: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_20: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $10, %xmm4, %ecx -; AVX1-NEXT: vpextrb $10, %xmm5, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_22 -; AVX1-NEXT: # %bb.21: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_22: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $11, %xmm4, %ecx -; AVX1-NEXT: vpextrb $11, %xmm5, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_24 -; AVX1-NEXT: # %bb.23: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_24: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $12, %xmm4, %ecx -; AVX1-NEXT: vpextrb $12, %xmm5, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_26 -; AVX1-NEXT: # %bb.25: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_26: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $13, %xmm4, %ecx -; AVX1-NEXT: vpextrb $13, %xmm5, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_28 -; AVX1-NEXT: # %bb.27: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_28: -; AVX1-NEXT: movl %ecx, (%rsp) # 4-byte Spill -; AVX1-NEXT: vpextrb $14, %xmm4, %ecx -; AVX1-NEXT: vpextrb $14, %xmm5, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_30 -; AVX1-NEXT: # %bb.29: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_30: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte 
Spill -; AVX1-NEXT: vpextrb $15, %xmm4, %ecx -; AVX1-NEXT: vpextrb $15, %xmm5, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_32 -; AVX1-NEXT: # %bb.31: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_32: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $1, %xmm2, %ecx -; AVX1-NEXT: vpextrb $1, %xmm0, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %esi -; AVX1-NEXT: jb .LBB2_34 -; AVX1-NEXT: # %bb.33: -; AVX1-NEXT: movl %eax, %esi -; AVX1-NEXT: .LBB2_34: -; AVX1-NEXT: vpextrb $0, %xmm2, %ecx -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ebx -; AVX1-NEXT: jb .LBB2_36 -; AVX1-NEXT: # %bb.35: -; AVX1-NEXT: movl %eax, %ebx -; AVX1-NEXT: .LBB2_36: -; AVX1-NEXT: vpextrb $2, %xmm2, %ecx -; AVX1-NEXT: vpextrb $2, %xmm0, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_38 -; AVX1-NEXT: # %bb.37: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_38: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $3, %xmm2, %ecx -; AVX1-NEXT: vpextrb $3, %xmm0, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_40 -; AVX1-NEXT: # %bb.39: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_40: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $4, %xmm2, %ecx -; AVX1-NEXT: vpextrb $4, %xmm0, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_42 -; AVX1-NEXT: # %bb.41: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_42: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $5, %xmm2, %ecx -; AVX1-NEXT: vpextrb $5, %xmm0, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_44 -; AVX1-NEXT: # %bb.43: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_44: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $6, %xmm2, %ecx -; AVX1-NEXT: vpextrb $6, %xmm0, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_46 -; AVX1-NEXT: # %bb.45: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_46: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $7, %xmm2, %ecx -; AVX1-NEXT: vpextrb $7, %xmm0, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_48 -; AVX1-NEXT: # %bb.47: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_48: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $8, %xmm2, %ecx -; AVX1-NEXT: vpextrb $8, %xmm0, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_50 -; AVX1-NEXT: # %bb.49: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_50: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $9, %xmm2, %ecx -; AVX1-NEXT: vpextrb $9, %xmm0, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_52 -; AVX1-NEXT: # %bb.51: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_52: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $10, %xmm2, %ecx -; AVX1-NEXT: vpextrb $10, %xmm0, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_54 -; AVX1-NEXT: # %bb.53: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_54: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $11, %xmm2, %ecx -; AVX1-NEXT: vpextrb 
$11, %xmm0, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_56 -; AVX1-NEXT: # %bb.55: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_56: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $12, %xmm2, %ecx -; AVX1-NEXT: vpextrb $12, %xmm0, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_58 -; AVX1-NEXT: # %bb.57: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_58: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $13, %xmm2, %ecx -; AVX1-NEXT: vpextrb $13, %xmm0, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_60 -; AVX1-NEXT: # %bb.59: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_60: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $14, %xmm2, %ecx -; AVX1-NEXT: vpextrb $14, %xmm0, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_62 -; AVX1-NEXT: # %bb.61: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_62: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $15, %xmm2, %ecx -; AVX1-NEXT: vpextrb $15, %xmm0, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_64 -; AVX1-NEXT: # %bb.63: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_64: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0 -; AVX1-NEXT: vpextrb $1, %xmm0, %ecx -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpextrb $1, %xmm2, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %r9d -; AVX1-NEXT: jb .LBB2_66 -; AVX1-NEXT: # %bb.65: -; AVX1-NEXT: movl %eax, %r9d -; AVX1-NEXT: .LBB2_66: -; AVX1-NEXT: vpextrb $0, %xmm0, %ecx -; AVX1-NEXT: vpextrb $0, %xmm2, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %edx -; AVX1-NEXT: jb .LBB2_68 -; AVX1-NEXT: # %bb.67: -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: .LBB2_68: -; AVX1-NEXT: vpextrb $2, %xmm0, %ecx -; AVX1-NEXT: vpextrb $2, %xmm2, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %r10d -; AVX1-NEXT: jb .LBB2_70 -; AVX1-NEXT: # %bb.69: -; AVX1-NEXT: movl %eax, %r10d -; AVX1-NEXT: .LBB2_70: -; AVX1-NEXT: vpextrb $3, %xmm0, %ecx -; AVX1-NEXT: vpextrb $3, %xmm2, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %r8d -; AVX1-NEXT: jb .LBB2_72 -; AVX1-NEXT: # %bb.71: -; AVX1-NEXT: movl %eax, %r8d -; AVX1-NEXT: .LBB2_72: -; AVX1-NEXT: vpextrb $4, %xmm0, %ecx -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %r11d -; AVX1-NEXT: jb .LBB2_74 -; AVX1-NEXT: # %bb.73: -; AVX1-NEXT: movl %eax, %r11d -; AVX1-NEXT: .LBB2_74: -; AVX1-NEXT: vpextrb $5, %xmm0, %ecx -; AVX1-NEXT: vpextrb $5, %xmm2, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %r13d -; AVX1-NEXT: jb .LBB2_76 -; AVX1-NEXT: # %bb.75: -; AVX1-NEXT: movl %eax, %r13d -; AVX1-NEXT: .LBB2_76: -; AVX1-NEXT: vpextrb $6, %xmm0, %ecx -; AVX1-NEXT: vpextrb $6, %xmm2, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %r12d -; AVX1-NEXT: jb .LBB2_78 -; AVX1-NEXT: # %bb.77: -; AVX1-NEXT: movl %eax, %r12d -; AVX1-NEXT: .LBB2_78: -; AVX1-NEXT: vpextrb $7, %xmm0, %ecx -; AVX1-NEXT: vpextrb $7, %xmm2, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %r15d -; AVX1-NEXT: jb .LBB2_80 -; AVX1-NEXT: # %bb.79: -; AVX1-NEXT: movl %eax, %r15d -; AVX1-NEXT: .LBB2_80: -; AVX1-NEXT: vpextrb $8, %xmm0, %ecx -; AVX1-NEXT: vpextrb $8, %xmm2, %eax -; AVX1-NEXT: subb %cl, %al -; 
AVX1-NEXT: movl $0, %r14d -; AVX1-NEXT: jb .LBB2_82 -; AVX1-NEXT: # %bb.81: -; AVX1-NEXT: movl %eax, %r14d -; AVX1-NEXT: .LBB2_82: -; AVX1-NEXT: vpextrb $9, %xmm0, %ecx -; AVX1-NEXT: vpextrb $9, %xmm2, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_84 -; AVX1-NEXT: # %bb.83: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_84: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $10, %xmm0, %ecx -; AVX1-NEXT: vpextrb $10, %xmm2, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_86 -; AVX1-NEXT: # %bb.85: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_86: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $11, %xmm0, %ecx -; AVX1-NEXT: vpextrb $11, %xmm2, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_88 -; AVX1-NEXT: # %bb.87: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_88: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrb $12, %xmm0, %ecx -; AVX1-NEXT: vpextrb $12, %xmm2, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_90 -; AVX1-NEXT: # %bb.89: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_90: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movzbl %bl, %ebp -; AVX1-NEXT: vpextrb $13, %xmm0, %ecx -; AVX1-NEXT: vpextrb $13, %xmm2, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_92 -; AVX1-NEXT: # %bb.91: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_92: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movzbl %dil, %edi -; AVX1-NEXT: movzbl %sil, %ebx -; AVX1-NEXT: vmovd %ebp, %xmm4 -; AVX1-NEXT: vpextrb $14, %xmm0, %ecx -; AVX1-NEXT: vpextrb $14, %xmm2, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_94 -; AVX1-NEXT: # %bb.93: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_94: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; AVX1-NEXT: vmovd %edi, %xmm5 -; AVX1-NEXT: vpinsrb $1, %ebx, %xmm4, %xmm4 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; AVX1-NEXT: movzbl %dl, %ebp -; AVX1-NEXT: vpextrb $15, %xmm0, %ecx -; AVX1-NEXT: vpextrb $15, %xmm2, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_96 -; AVX1-NEXT: # %bb.95: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_96: -; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpinsrb $1, %esi, %xmm5, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $2, %edi, %xmm4, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; AVX1-NEXT: movzbl %r9b, %edx -; AVX1-NEXT: vmovd %ebp, %xmm4 -; AVX1-NEXT: vpextrb $1, %xmm3, %eax -; AVX1-NEXT: vpextrb $1, %xmm1, %edi -; AVX1-NEXT: subb %al, %dil -; AVX1-NEXT: movl $0, %eax -; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; AVX1-NEXT: jb .LBB2_98 -; AVX1-NEXT: # %bb.97: -; AVX1-NEXT: movl %edi, %eax -; AVX1-NEXT: .LBB2_98: -; AVX1-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %cl, %esi -; AVX1-NEXT: vpinsrb $3, %ebx, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $1, %edx, %xmm4, %xmm4 -; AVX1-NEXT: movzbl %r10b, %edi -; AVX1-NEXT: movzbl %al, %edx -; 
AVX1-NEXT: vpextrb $0, %xmm3, %ecx -; AVX1-NEXT: vpextrb $0, %xmm1, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_100 -; AVX1-NEXT: # %bb.99: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_100: -; AVX1-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $4, %ebp, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $2, %edi, %xmm4, %xmm4 -; AVX1-NEXT: movzbl %r8b, %ebp -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vmovd %eax, %xmm5 -; AVX1-NEXT: vpinsrb $1, %edx, %xmm5, %xmm5 -; AVX1-NEXT: vpextrb $2, %xmm3, %eax -; AVX1-NEXT: vpextrb $2, %xmm1, %ecx -; AVX1-NEXT: subb %al, %cl -; AVX1-NEXT: movl $0, %eax -; AVX1-NEXT: jb .LBB2_102 -; AVX1-NEXT: # %bb.101: -; AVX1-NEXT: movl %ecx, %eax -; AVX1-NEXT: .LBB2_102: -; AVX1-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $5, %ebx, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $3, %ebp, %xmm4, %xmm4 -; AVX1-NEXT: movzbl %r11b, %ebp -; AVX1-NEXT: movzbl %al, %eax -; AVX1-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 -; AVX1-NEXT: vpextrb $3, %xmm3, %eax -; AVX1-NEXT: vpextrb $3, %xmm1, %ecx -; AVX1-NEXT: subb %al, %cl -; AVX1-NEXT: movl $0, %eax -; AVX1-NEXT: jb .LBB2_104 -; AVX1-NEXT: # %bb.103: -; AVX1-NEXT: movl %ecx, %eax -; AVX1-NEXT: .LBB2_104: -; AVX1-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $6, %esi, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $4, %ebp, %xmm4, %xmm4 -; AVX1-NEXT: movzbl %r13b, %esi -; AVX1-NEXT: movzbl %al, %eax -; AVX1-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 -; AVX1-NEXT: vpextrb $4, %xmm3, %edi -; AVX1-NEXT: vpextrb $4, %xmm1, %eax -; AVX1-NEXT: subb %dil, %al -; AVX1-NEXT: movl $0, %edi -; AVX1-NEXT: jb .LBB2_106 -; AVX1-NEXT: # %bb.105: -; AVX1-NEXT: movl %eax, %edi -; AVX1-NEXT: .LBB2_106: -; AVX1-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $7, %edx, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $5, %esi, %xmm4, %xmm4 -; AVX1-NEXT: movzbl %r12b, %edx -; AVX1-NEXT: movzbl %dil, %esi -; AVX1-NEXT: vpinsrb $4, %esi, %xmm5, %xmm5 -; AVX1-NEXT: vpextrb $5, %xmm3, %esi -; AVX1-NEXT: vpextrb $5, %xmm1, %edi -; AVX1-NEXT: subb %sil, %dil -; AVX1-NEXT: movl $0, %esi -; AVX1-NEXT: jb .LBB2_108 -; AVX1-NEXT: # %bb.107: -; AVX1-NEXT: movl %edi, %esi -; AVX1-NEXT: .LBB2_108: -; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4 -; AVX1-NEXT: movzbl %r15b, %edx -; AVX1-NEXT: movzbl %sil, %esi -; AVX1-NEXT: vpinsrb $5, %esi, %xmm5, %xmm5 -; AVX1-NEXT: vpextrb $6, %xmm3, %esi -; AVX1-NEXT: vpextrb $6, %xmm1, %edi -; AVX1-NEXT: subb %sil, %dil -; AVX1-NEXT: movl $0, %esi -; AVX1-NEXT: jb .LBB2_110 -; AVX1-NEXT: # %bb.109: -; AVX1-NEXT: movl %edi, %esi -; AVX1-NEXT: .LBB2_110: -; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded 
Reload -; AVX1-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $7, %edx, %xmm4, %xmm4 -; AVX1-NEXT: movzbl %r14b, %edx -; AVX1-NEXT: movzbl %sil, %esi -; AVX1-NEXT: vpinsrb $6, %esi, %xmm5, %xmm5 -; AVX1-NEXT: vpextrb $7, %xmm3, %esi -; AVX1-NEXT: vpextrb $7, %xmm1, %edi -; AVX1-NEXT: subb %sil, %dil -; AVX1-NEXT: movl $0, %esi -; AVX1-NEXT: jb .LBB2_112 -; AVX1-NEXT: # %bb.111: -; AVX1-NEXT: movl %edi, %esi -; AVX1-NEXT: .LBB2_112: -; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $8, %edx, %xmm4, %xmm4 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; AVX1-NEXT: movzbl %sil, %esi -; AVX1-NEXT: vpinsrb $7, %esi, %xmm5, %xmm5 -; AVX1-NEXT: vpextrb $8, %xmm3, %esi -; AVX1-NEXT: vpextrb $8, %xmm1, %edi -; AVX1-NEXT: subb %sil, %dil -; AVX1-NEXT: movl $0, %esi -; AVX1-NEXT: jb .LBB2_114 -; AVX1-NEXT: # %bb.113: -; AVX1-NEXT: movl %edi, %esi -; AVX1-NEXT: .LBB2_114: -; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $9, %edx, %xmm4, %xmm4 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; AVX1-NEXT: movzbl %sil, %esi -; AVX1-NEXT: vpinsrb $8, %esi, %xmm5, %xmm5 -; AVX1-NEXT: vpextrb $9, %xmm3, %esi -; AVX1-NEXT: vpextrb $9, %xmm1, %edi -; AVX1-NEXT: subb %sil, %dil -; AVX1-NEXT: movl $0, %esi -; AVX1-NEXT: jb .LBB2_116 -; AVX1-NEXT: # %bb.115: -; AVX1-NEXT: movl %edi, %esi -; AVX1-NEXT: .LBB2_116: -; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $10, %edx, %xmm4, %xmm4 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; AVX1-NEXT: movzbl %sil, %esi -; AVX1-NEXT: vpinsrb $9, %esi, %xmm5, %xmm5 -; AVX1-NEXT: vpextrb $10, %xmm3, %esi -; AVX1-NEXT: vpextrb $10, %xmm1, %edi -; AVX1-NEXT: subb %sil, %dil -; AVX1-NEXT: movl $0, %esi -; AVX1-NEXT: jb .LBB2_118 -; AVX1-NEXT: # %bb.117: -; AVX1-NEXT: movl %edi, %esi -; AVX1-NEXT: .LBB2_118: -; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl (%rsp), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $11, %edx, %xmm4, %xmm4 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; AVX1-NEXT: movzbl %sil, %esi -; AVX1-NEXT: vpinsrb $10, %esi, %xmm5, %xmm5 -; AVX1-NEXT: vpextrb $11, %xmm3, %esi -; AVX1-NEXT: vpextrb $11, %xmm1, %edi -; AVX1-NEXT: subb %sil, %dil -; AVX1-NEXT: movl $0, %esi -; AVX1-NEXT: jb .LBB2_120 -; AVX1-NEXT: # %bb.119: -; AVX1-NEXT: movl %edi, %esi -; AVX1-NEXT: .LBB2_120: -; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $12, %edx, %xmm4, %xmm6 -; AVX1-NEXT: movzbl 
{{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; AVX1-NEXT: movzbl %sil, %esi -; AVX1-NEXT: vpinsrb $11, %esi, %xmm5, %xmm5 -; AVX1-NEXT: vpextrb $12, %xmm3, %esi -; AVX1-NEXT: vpextrb $12, %xmm1, %edi -; AVX1-NEXT: subb %sil, %dil -; AVX1-NEXT: movl $0, %esi -; AVX1-NEXT: jb .LBB2_122 -; AVX1-NEXT: # %bb.121: -; AVX1-NEXT: movl %edi, %esi -; AVX1-NEXT: .LBB2_122: -; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm4 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm0 -; AVX1-NEXT: vpinsrb $13, %edx, %xmm6, %xmm6 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; AVX1-NEXT: movzbl %sil, %edx -; AVX1-NEXT: vpinsrb $12, %edx, %xmm5, %xmm5 -; AVX1-NEXT: vpextrb $13, %xmm3, %edx -; AVX1-NEXT: vpextrb $13, %xmm1, %esi -; AVX1-NEXT: subb %dl, %sil -; AVX1-NEXT: movl $0, %edx -; AVX1-NEXT: jb .LBB2_124 -; AVX1-NEXT: # %bb.123: -; AVX1-NEXT: movl %esi, %edx -; AVX1-NEXT: .LBB2_124: -; AVX1-NEXT: vpinsrb $15, %eax, %xmm4, %xmm2 -; AVX1-NEXT: vpinsrb $14, %ecx, %xmm6, %xmm4 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: movzbl %dl, %ecx -; AVX1-NEXT: vpinsrb $13, %ecx, %xmm5, %xmm5 -; AVX1-NEXT: vpextrb $14, %xmm3, %edx -; AVX1-NEXT: vpextrb $14, %xmm1, %ecx -; AVX1-NEXT: subb %dl, %cl -; AVX1-NEXT: movl $0, %edx -; AVX1-NEXT: leaq {{[0-9]+}}(%rsp), %rsp -; AVX1-NEXT: popq %rbx -; AVX1-NEXT: popq %r12 -; AVX1-NEXT: popq %r13 -; AVX1-NEXT: popq %r14 -; AVX1-NEXT: popq %r15 -; AVX1-NEXT: popq %rbp -; AVX1-NEXT: jb .LBB2_126 -; AVX1-NEXT: # %bb.125: -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: .LBB2_126: -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vpinsrb $15, %eax, %xmm4, %xmm2 -; AVX1-NEXT: movzbl %dl, %eax -; AVX1-NEXT: vpinsrb $14, %eax, %xmm5, %xmm4 -; AVX1-NEXT: vpextrb $15, %xmm3, %ecx -; AVX1-NEXT: vpextrb $15, %xmm1, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: jb .LBB2_128 -; AVX1-NEXT: # %bb.127: -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: .LBB2_128: -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $15, %eax, %xmm4, %xmm1 +; AVX1-NEXT: vpsubusb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpsubusb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpsubusb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v64i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-NEXT: vpextrb $1, %xmm4, %ecx -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-NEXT: vpextrb $1, %xmm5, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movl $0, %ecx -; AVX2-NEXT: jb .LBB2_2 -; AVX2-NEXT: # %bb.1: -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: .LBB2_2: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $12, %rsp -; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrb $0, %xmm4, %ecx -; AVX2-NEXT: vpextrb $0, %xmm5, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movl $0, %edi -; AVX2-NEXT: jb .LBB2_4 -; AVX2-NEXT: # %bb.3: -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: .LBB2_4: -; AVX2-NEXT: vpextrb $2, %xmm4, %ecx -; AVX2-NEXT: vpextrb $2, %xmm5, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movl $0, %ecx -; AVX2-NEXT: jb .LBB2_6 -; AVX2-NEXT: # %bb.5: -; AVX2-NEXT: 
movl %eax, %ecx -; AVX2-NEXT: .LBB2_6: -; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrb $3, %xmm4, %ecx -; AVX2-NEXT: vpextrb $3, %xmm5, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movl $0, %ecx -; AVX2-NEXT: jb .LBB2_8 -; AVX2-NEXT: # %bb.7: -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: .LBB2_8: -; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrb $4, %xmm4, %ecx -; AVX2-NEXT: vpextrb $4, %xmm5, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movl $0, %ecx -; AVX2-NEXT: jb .LBB2_10 -; AVX2-NEXT: # %bb.9: -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: .LBB2_10: -; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrb $5, %xmm4, %ecx -; AVX2-NEXT: vpextrb $5, %xmm5, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movl $0, %ecx -; AVX2-NEXT: jb .LBB2_12 -; AVX2-NEXT: # %bb.11: -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: .LBB2_12: -; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrb $6, %xmm4, %ecx -; AVX2-NEXT: vpextrb $6, %xmm5, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movl $0, %ecx -; AVX2-NEXT: jb .LBB2_14 -; AVX2-NEXT: # %bb.13: -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: .LBB2_14: -; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrb $7, %xmm4, %ecx -; AVX2-NEXT: vpextrb $7, %xmm5, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movl $0, %ecx -; AVX2-NEXT: jb .LBB2_16 -; AVX2-NEXT: # %bb.15: -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: .LBB2_16: -; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrb $8, %xmm4, %ecx -; AVX2-NEXT: vpextrb $8, %xmm5, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movl $0, %ecx -; AVX2-NEXT: jb .LBB2_18 -; AVX2-NEXT: # %bb.17: -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: .LBB2_18: -; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrb $9, %xmm4, %ecx -; AVX2-NEXT: vpextrb $9, %xmm5, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movl $0, %ecx -; AVX2-NEXT: jb .LBB2_20 -; AVX2-NEXT: # %bb.19: -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: .LBB2_20: -; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrb $10, %xmm4, %ecx -; AVX2-NEXT: vpextrb $10, %xmm5, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movl $0, %ecx -; AVX2-NEXT: jb .LBB2_22 -; AVX2-NEXT: # %bb.21: -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: .LBB2_22: -; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrb $11, %xmm4, %ecx -; AVX2-NEXT: vpextrb $11, %xmm5, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movl $0, %ecx -; AVX2-NEXT: jb .LBB2_24 -; AVX2-NEXT: # %bb.23: -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: .LBB2_24: -; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrb $12, %xmm4, %ecx -; AVX2-NEXT: vpextrb $12, %xmm5, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movl $0, %ecx -; AVX2-NEXT: jb .LBB2_26 -; AVX2-NEXT: # %bb.25: -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: .LBB2_26: -; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrb $13, %xmm4, %ecx -; AVX2-NEXT: vpextrb $13, %xmm5, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movl $0, %ecx -; AVX2-NEXT: jb .LBB2_28 -; AVX2-NEXT: # %bb.27: -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: .LBB2_28: -; AVX2-NEXT: movl %ecx, (%rsp) # 4-byte Spill -; AVX2-NEXT: vpextrb $14, %xmm4, %ecx -; AVX2-NEXT: vpextrb $14, %xmm5, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movl $0, %ecx -; AVX2-NEXT: jb 
.LBB2_30
[... deleted AVX2 v64i8 checks, continued: per-byte usub.sat expansion (vextracti128 lane extracts, vpextrb/subb/jb/movl select sequences with 4-byte spills and 1-byte folded reloads, vpinsrb reassembly) through blocks .LBB2_29-.LBB2_120 ...]
[... final deleted AVX2 v64i8 checks: last per-byte lanes (.LBB2_121-.LBB2_128), epilogue pops of %rbx/%r12/%r13/%r14/%r15/%rbp, and the closing vpinsrb $15/vinserti128 reassembly ...]
+; AVX2-NEXT: vpsubusb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubusb %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: v64i8:
 ; AVX512: # %bb.0:
[... deleted AVX512 v64i8 checks: the same per-byte expansion driven from vextracti32x4/vextracti128 lane extracts, with prologue pushes of %rbp/%r15/%r14/%r13/%r12/%rbx and subq $12, %rsp, 4-byte spills and 1-byte folded reloads, through blocks .LBB2_1-.LBB2_127 ...]
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB2_128:
-; AVX512-NEXT: movzbl %cl, %eax
-; AVX512-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vpsubusb %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: retq
   %z = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> %x, <64 x i8> %y)
   ret <64 x i8> %z
 }

 define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
-; SSE2-LABEL: v8i16:
-; SSE2: # %bb.0:
[... deleted SSE2 v8i16 checks: per-word expansion (pextrw/subw/cmovbl/movd with punpcklwd/punpckldq/punpcklqdq reassembly) ...]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: v8i16:
-; SSSE3: # %bb.0:
[... deleted SSSE3 v8i16 checks, identical to the SSE2 sequence ...]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: v8i16:
-; SSE41: # %bb.0:
[... deleted SSE41 v8i16 checks: pextrw/subw/cmovbl/pinsrw per-word expansion ...]
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: v8i16:
+; SSE: # %bb.0:
+; SSE-NEXT: psubusw %xmm1, %xmm0
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: v8i16:
 ; AVX: # %bb.0:
[... deleted AVX v8i16 checks: vpextrw/subw/cmovbl/vpinsrw per-word expansion ...]
+; AVX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
   %z = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
   ret <8 x i16> %z
 }

 define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
-; SSE2-LABEL: v16i16:
-; SSE2: # %bb.0:
[... deleted SSE2 v16i16 checks: the per-word expansion applied to both halves (%xmm0 with %xmm2, %xmm1 with %xmm3) ...]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: v16i16:
-; SSSE3: # %bb.0:
[... deleted SSSE3 v16i16 checks, identical to the SSE2 sequence ...]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: v16i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm4
[... deleted SSE41 v16i16 checks: pextrw/subw/cmovbl/pinsrw over both halves ...]
-; SSE41-NEXT:    subw %cx, %dx
-; SSE41-NEXT:    cmovbl %eax, %edx
-; SSE41-NEXT:    pinsrw $2, %edx, %xmm2
[...]
-; SSE41-NEXT:    pinsrw $7, %edx, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm1
-; SSE41-NEXT:    retq
+; SSE-LABEL: v16i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psubusw %xmm2, %xmm0
+; SSE-NEXT:    psubusw %xmm3, %xmm1
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: v16i16:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpextrw $1, %xmm2, %ecx
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpextrw $1, %xmm3, %edx
-; AVX1-NEXT:    xorl %eax, %eax
-; AVX1-NEXT:    subw %cx, %dx
-; AVX1-NEXT:    cmovbl %eax, %edx
[...]
-; AVX1-NEXT:    vpinsrw $7, %edx, %xmm3, %xmm0
+; AVX1-NEXT:    vpsubusw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpextrw $1, %xmm2, %ecx
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
[...]
-; AVX2-NEXT:    vpinsrw $7, %edx, %xmm3, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: v16i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT:    vpextrw $1, %xmm2, %ecx
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm3
[...]
-; AVX512-NEXT:    vpinsrw $7, %edx, %xmm3, %xmm0
-; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
   %z = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %x, <16 x i16> %y)
   ret <16 x i16> %z
 }
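The deleted check lines above capture the old lowering: every i16 lane was pulled out with pextrw, subtracted with a subw/cmovbl pair so underflow clamps to zero, and reinserted with pinsrw. The new check lines show the same test selecting a single psubusw (or its VEX forms) now that the generic saturating opcode is legal for these types. For reference, a minimal C++ sketch of what each lane computes (usub_sat_u16 is an illustrative name, not something in the tree):

  #include <cstdint>

  // One lane of @llvm.usub.sat.v16i16: unsigned subtract that clamps to 0
  // on underflow instead of wrapping. psubusw performs this on all eight
  // i16 lanes of an xmm register in one instruction.
  static inline uint16_t usub_sat_u16(uint16_t X, uint16_t Y) {
    return X > Y ? static_cast<uint16_t>(X - Y) : 0;
  }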

 define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
-; SSE2-LABEL: v32i16:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pextrw $7, %xmm4, %ecx
-; SSE2-NEXT:    pextrw $7, %xmm0, %edx
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    subw %cx, %dx
-; SSE2-NEXT:    cmovbl %eax, %edx
-; SSE2-NEXT:    movd %edx, %xmm8
[...]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: v32i16:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pextrw $7, %xmm4, %ecx
[...]
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: v32i16:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm1, %xmm8
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pextrw $1, %xmm4, %ecx
[...]
-; SSE41-NEXT:    movdqa %xmm4, %xmm2
-; SSE41-NEXT:    movdqa %xmm5, %xmm3
-; SSE41-NEXT:    retq
+; SSE-LABEL: v32i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psubusw %xmm4, %xmm0
+; SSE-NEXT:    psubusw %xmm5, %xmm1
+; SSE-NEXT:    psubusw %xmm6, %xmm2
+; SSE-NEXT:    psubusw %xmm7, %xmm3
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: v32i16:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vpextrw $1, %xmm4, %ecx
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpextrw $1, %xmm5, %edx
-; AVX1-NEXT:    xorl %eax, %eax
[...]
-; AVX1-NEXT:    vpinsrw $7, %edx, %xmm5, %xmm0
+; AVX1-NEXT:    vpsubusw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpsubusw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT:    vpextrw $1, %xmm2, %ecx
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpextrw $1, %xmm4, %edx
[...]
-; AVX1-NEXT:    vpinsrw $7, %edx, %xmm4, %xmm1
+; AVX1-NEXT:    vpsubusw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpsubusw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v32i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
-; AVX2-NEXT:    vpextrw $1, %xmm4, %ecx
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm5
[...]
-; AVX2-NEXT:    vpinsrw $7, %edx, %xmm4, %xmm1
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubusw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubusw %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: v32i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti32x4 $3, %zmm1, %xmm2
-; AVX512-NEXT:    vpextrw $1, %xmm2, %ecx
-; AVX512-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
[...]
-; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    retq
   %z = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %x, <32 x i16> %y)
   ret <32 x i16> %z
 }
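v32i16 is wider than any register on these targets except AVX512BW zmm, so the checks above also document the splitting strategy: four xmm psubusw ops on SSE, extract/subtract/reinsert of 128-bit halves on AVX1, two ymm ops on AVX2, and a single zmm op in the AVX512 run. A rough C++ model of that decomposition (illustrative helper, assuming LanesPerReg divides 32):

  #include <cstddef>
  #include <cstdint>

  // Model of legalization by splitting: apply the register-wide saturating
  // subtract chunk by chunk (8 lanes per xmm, 16 per ymm, 32 per zmm).
  static void usub_sat_v32i16(const uint16_t *X, const uint16_t *Y,
                              uint16_t *Z, size_t LanesPerReg) {
    for (size_t Chunk = 0; Chunk != 32; Chunk += LanesPerReg)
      for (size_t I = Chunk; I != Chunk + LanesPerReg; ++I) // one register op
        Z[I] = X[I] > Y[I] ? static_cast<uint16_t>(X[I] - Y[I]) : 0;
  }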
@@ -9218,337 +196,36 @@
 ; Too narrow vectors, legalized by widening.
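The widening the tests below rely on can be pictured as follows: a <8 x i8> operand becomes the low half of a <16 x i8> vector whose upper lanes are undefined, one full-width psubusb runs, and only the low eight results are stored back. That is safe because a saturating subtract on the junk upper lanes cannot trap or affect the lanes that matter. A C++ sketch of the idea (hypothetical helper, not the patch's code):

  #include <cstdint>
  #include <cstring>

  // Widening sketch for <8 x i8>: compute on a full 16-byte vector whose
  // upper half is don't-care, then store only the low 8 results.
  static void usub_sat_v8i8(const uint8_t *Px, const uint8_t *Py, uint8_t *Pz) {
    uint8_t X[16] = {0}, Y[16] = {0}, Z[16];
    std::memcpy(X, Px, 8);           // low lanes carry the real data
    std::memcpy(Y, Py, 8);
    for (int I = 0; I != 16; ++I)    // full-width psubusb equivalent
      Z[I] = X[I] > Y[I] ? static_cast<uint8_t>(X[I] - Y[I]) : 0;
    std::memcpy(Pz, Z, 8);           // upper lanes are simply dropped
  }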
 define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind {
-; SSE2-LABEL: v8i8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; SSE2-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm1
[...]
-; SSE2-NEXT:    psrlw $8, %xmm0
-; SSE2-NEXT:    packuswb %xmm0, %xmm0
-; SSE2-NEXT:    movq %xmm0, (%rdx)
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: v8i8:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
[...]
-; SSSE3-NEXT:    movq %xmm0, (%rdx)
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: v8i8:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
[...]
-; SSE41-NEXT:    psrlw $8, %xmm2
-; SSE41-NEXT:    packuswb %xmm0, %xmm2
-; SSE41-NEXT:    movq %xmm2, (%rdx)
-; SSE41-NEXT:    retq
+; SSE-LABEL: v8i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    psubusb %xmm1, %xmm0
+; SSE-NEXT:    movq %xmm0, (%rdx)
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: v8i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
[...]
-; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, (%rdx)
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v8i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
[...]
-; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, (%rdx)
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: v8i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
[...]
-; AVX512-NEXT:    vpinsrw $7, %esi, %xmm2, %xmm0
-; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX512-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512-NEXT:    vpmovwb %xmm0, (%rdx)
 ; AVX512-NEXT:    retq
   %x = load <8 x i8>, <8 x i8>* %px
%xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512-NEXT: vpmovwb %xmm0, (%rdx) ; AVX512-NEXT: retq %x = load <8 x i8>, <8 x i8>* %px @@ -9559,183 +236,27 @@ } define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind { -; SSE2-LABEL: v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: pslld $24, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: pslld $24, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] -; SSE2-NEXT: movd %xmm2, %ecx -; SSE2-NEXT: xorl %esi, %esi -; SSE2-NEXT: subl %eax, %ecx -; SSE2-NEXT: cmovbl %esi, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] -; SSE2-NEXT: movd %xmm3, %eax -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm3, %ecx -; SSE2-NEXT: subl %eax, %ecx -; SSE2-NEXT: cmovbl %esi, %ecx -; SSE2-NEXT: movd %ecx, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: movd %xmm0, %ecx -; SSE2-NEXT: subl %eax, %ecx -; SSE2-NEXT: cmovbl %esi, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-NEXT: movd %xmm0, %ecx -; SSE2-NEXT: subl %eax, %ecx -; SSE2-NEXT: cmovbl %esi, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: psrld $24, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: movd %xmm2, (%rdx) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v4i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,1,255,255,255,2,255,255,255,3] -; SSSE3-NEXT: pshufb %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,1,2,3] -; SSSE3-NEXT: movd %xmm3, %eax -; SSSE3-NEXT: pshufb %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] -; SSSE3-NEXT: movd %xmm2, %ecx -; SSSE3-NEXT: xorl %esi, %esi -; SSSE3-NEXT: subl %eax, %ecx -; SSSE3-NEXT: cmovbl %esi, %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] -; SSSE3-NEXT: movd %xmm3, %eax -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; SSSE3-NEXT: movd %xmm3, %ecx -; SSSE3-NEXT: subl %eax, %ecx -; SSSE3-NEXT: cmovbl %esi, %ecx -; SSSE3-NEXT: movd %ecx, %xmm3 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSSE3-NEXT: movd %xmm1, %eax -; SSSE3-NEXT: movd %xmm0, %ecx -; SSSE3-NEXT: subl %eax, %ecx -; SSSE3-NEXT: cmovbl %esi, %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; SSSE3-NEXT: movd %xmm1, %eax -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; 
SSSE3-NEXT: movd %xmm0, %ecx -; SSSE3-NEXT: subl %eax, %ecx -; SSSE3-NEXT: cmovbl %esi, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: movd %xmm2, (%rdx) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: pslld $24, %xmm1 -; SSE41-NEXT: pextrd $1, %xmm1, %eax -; SSE41-NEXT: pslld $24, %xmm0 -; SSE41-NEXT: pextrd $1, %xmm0, %ecx -; SSE41-NEXT: xorl %esi, %esi -; SSE41-NEXT: subl %eax, %ecx -; SSE41-NEXT: cmovbl %esi, %ecx -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: movd %xmm0, %edi -; SSE41-NEXT: subl %eax, %edi -; SSE41-NEXT: cmovbl %esi, %edi -; SSE41-NEXT: movd %edi, %xmm2 -; SSE41-NEXT: pinsrd $1, %ecx, %xmm2 -; SSE41-NEXT: pextrd $2, %xmm1, %eax -; SSE41-NEXT: pextrd $2, %xmm0, %ecx -; SSE41-NEXT: subl %eax, %ecx -; SSE41-NEXT: cmovbl %esi, %ecx -; SSE41-NEXT: pinsrd $2, %ecx, %xmm2 -; SSE41-NEXT: pextrd $3, %xmm1, %eax -; SSE41-NEXT: pextrd $3, %xmm0, %ecx -; SSE41-NEXT: subl %eax, %ecx -; SSE41-NEXT: cmovbl %esi, %ecx -; SSE41-NEXT: pinsrd $3, %ecx, %xmm2 -; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] -; SSE41-NEXT: movd %xmm2, (%rdx) -; SSE41-NEXT: retq +; SSE-LABEL: v4i8: +; SSE: # %bb.0: +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: psubusb %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, (%rdx) +; SSE-NEXT: retq ; ; AVX1-LABEL: v4i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpslld $24, %xmm1, %xmm1 -; AVX1-NEXT: vpextrd $1, %xmm1, %eax -; AVX1-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX1-NEXT: vpextrd $1, %xmm0, %ecx -; AVX1-NEXT: xorl %esi, %esi -; AVX1-NEXT: subl %eax, %ecx -; AVX1-NEXT: cmovbl %esi, %ecx -; AVX1-NEXT: vmovd %xmm1, %eax -; AVX1-NEXT: vmovd %xmm0, %edi -; AVX1-NEXT: subl %eax, %edi -; AVX1-NEXT: cmovbl %esi, %edi -; AVX1-NEXT: vmovd %edi, %xmm2 -; AVX1-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrd $2, %xmm1, %eax -; AVX1-NEXT: vpextrd $2, %xmm0, %ecx -; AVX1-NEXT: subl %eax, %ecx -; AVX1-NEXT: cmovbl %esi, %ecx -; AVX1-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrd $3, %xmm1, %eax -; AVX1-NEXT: vpextrd $3, %xmm0, %ecx -; AVX1-NEXT: subl %eax, %ecx -; AVX1-NEXT: cmovbl %esi, %ecx -; AVX1-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, (%rdx) ; AVX1-NEXT: retq ; ; AVX2-LABEL: v4i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX2-NEXT: 
vpslld $24, %xmm1, %xmm1 -; AVX2-NEXT: vpextrd $1, %xmm1, %eax -; AVX2-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX2-NEXT: vpextrd $1, %xmm0, %ecx -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: subl %eax, %ecx -; AVX2-NEXT: cmovbl %esi, %ecx -; AVX2-NEXT: vmovd %xmm1, %eax -; AVX2-NEXT: vmovd %xmm0, %edi -; AVX2-NEXT: subl %eax, %edi -; AVX2-NEXT: cmovbl %esi, %edi -; AVX2-NEXT: vmovd %edi, %xmm2 -; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrd $2, %xmm1, %eax -; AVX2-NEXT: vpextrd $2, %xmm0, %ecx -; AVX2-NEXT: subl %eax, %ecx -; AVX2-NEXT: cmovbl %esi, %ecx -; AVX2-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrd $3, %xmm1, %eax -; AVX2-NEXT: vpextrd $3, %xmm0, %ecx -; AVX2-NEXT: subl %eax, %ecx -; AVX2-NEXT: cmovbl %esi, %ecx -; AVX2-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, (%rdx) ; AVX2-NEXT: retq ; @@ -9743,31 +264,8 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,1,255,255,255,2,255,255,255,3] -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpextrd $1, %xmm1, %eax -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpextrd $1, %xmm0, %ecx -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: subl %eax, %ecx -; AVX512-NEXT: cmovbl %esi, %ecx -; AVX512-NEXT: vmovd %xmm1, %eax -; AVX512-NEXT: vmovd %xmm0, %edi -; AVX512-NEXT: subl %eax, %edi -; AVX512-NEXT: cmovbl %esi, %edi -; AVX512-NEXT: vmovd %edi, %xmm2 -; AVX512-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 -; AVX512-NEXT: vpextrd $2, %xmm1, %eax -; AVX512-NEXT: vpextrd $2, %xmm0, %ecx -; AVX512-NEXT: subl %eax, %ecx -; AVX512-NEXT: cmovbl %esi, %ecx -; AVX512-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 -; AVX512-NEXT: vpextrd $3, %xmm1, %eax -; AVX512-NEXT: vpextrd $3, %xmm0, %ecx -; AVX512-NEXT: subl %eax, %ecx -; AVX512-NEXT: cmovbl %esi, %ecx -; AVX512-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0 -; AVX512-NEXT: vpsrld $24, %xmm0, %xmm0 +; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512-NEXT: vpmovdb %xmm0, (%rdx) ; AVX512-NEXT: retq %x = load <4 x i8>, <4 x i8>* %px @@ -9782,35 +280,10 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movzwl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; SSE2-NEXT: movzwl (%rsi), %eax ; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE2-NEXT: psllq $56, %xmm1 -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: psllq $56, %xmm0 -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: xorl %esi, %esi -; SSE2-NEXT: subq %rax, %rcx -; SSE2-NEXT: cmovbq %rsi, %rcx -; SSE2-NEXT: movq %rcx, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: 
pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: subq %rax, %rcx -; SSE2-NEXT: cmovbq %rsi, %rcx -; SSE2-NEXT: movq %rcx, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE2-NEXT: psrlq $56, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: psubusb %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rdx) ; SSE2-NEXT: retq ; @@ -9820,91 +293,38 @@ ; SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: movzwl (%rsi), %eax ; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,1] -; SSSE3-NEXT: pshufb %xmm2, %xmm1 -; SSSE3-NEXT: movq %xmm1, %rax -; SSSE3-NEXT: pshufb %xmm2, %xmm0 -; SSSE3-NEXT: movq %xmm0, %rcx -; SSSE3-NEXT: xorl %esi, %esi -; SSSE3-NEXT: subq %rax, %rcx -; SSSE3-NEXT: cmovbq %rsi, %rcx -; SSSE3-NEXT: movq %rcx, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSSE3-NEXT: movq %xmm1, %rax -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSSE3-NEXT: movq %xmm0, %rcx -; SSSE3-NEXT: subq %rax, %rcx -; SSSE3-NEXT: cmovbq %rsi, %rcx -; SSSE3-NEXT: movq %rcx, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: psubusb %xmm1, %xmm0 +; SSSE3-NEXT: movd %xmm0, %eax ; SSSE3-NEXT: movw %ax, (%rdx) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v2i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: psllq $56, %xmm1 -; SSE41-NEXT: pextrq $1, %xmm1, %rax -; SSE41-NEXT: psllq $56, %xmm0 -; SSE41-NEXT: pextrq $1, %xmm0, %rcx -; SSE41-NEXT: xorl %esi, %esi -; SSE41-NEXT: subq %rax, %rcx -; SSE41-NEXT: cmovbq %rsi, %rcx -; SSE41-NEXT: movq %rcx, %xmm2 -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: movq %xmm0, %rcx -; SSE41-NEXT: subq %rax, %rcx -; SSE41-NEXT: cmovbq %rsi, %rcx -; SSE41-NEXT: movq %rcx, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE41-NEXT: movzwl (%rdi), %eax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: movzwl (%rsi), %eax +; SSE41-NEXT: movd %eax, %xmm1 +; SSE41-NEXT: psubusb %xmm1, %xmm0 ; SSE41-NEXT: pextrw $0, %xmm0, (%rdx) ; SSE41-NEXT: retq ; ; AVX1-LABEL: v2i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsllq $56, %xmm1, %xmm1 -; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rcx -; AVX1-NEXT: xorl %esi, %esi -; AVX1-NEXT: subq %rax, %rcx -; AVX1-NEXT: cmovbq %rsi, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm2 -; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vmovq %xmm0, %rcx -; AVX1-NEXT: subq %rax, %rcx -; AVX1-NEXT: cmovbq %rsi, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: movzwl (%rdi), %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: 
movzwl (%rsi), %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpextrw $0, %xmm0, (%rdx) ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpsllq $56, %xmm1, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX2-NEXT: vpextrq $1, %xmm0, %rcx -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: subq %rax, %rcx -; AVX2-NEXT: cmovbq %rsi, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm2 -; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vmovq %xmm0, %rcx -; AVX2-NEXT: subq %rax, %rcx -; AVX2-NEXT: cmovbq %rsi, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: movzwl (%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: movzwl (%rsi), %eax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpextrw $0, %xmm0, (%rdx) ; AVX2-NEXT: retq ; @@ -9914,22 +334,8 @@ ; AVX512-NEXT: vmovd %eax, %xmm0 ; AVX512-NEXT: movzwl (%rsi), %eax ; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,1] -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpextrq $1, %xmm1, %rax -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpextrq $1, %xmm0, %rcx -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: subq %rax, %rcx -; AVX512-NEXT: cmovbq %rsi, %rcx -; AVX512-NEXT: vmovq %rcx, %xmm2 -; AVX512-NEXT: vmovq %xmm1, %rax -; AVX512-NEXT: vmovq %xmm0, %rcx -; AVX512-NEXT: subq %rax, %rcx -; AVX512-NEXT: cmovbq %rsi, %rcx -; AVX512-NEXT: vmovq %rcx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX512-NEXT: vpsrlq $56, %xmm0, %xmm0 +; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpmovqb %xmm0, (%rdx) ; AVX512-NEXT: retq %x = load <2 x i8>, <2 x i8>* %px @@ -9940,157 +346,19 @@ } define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind { -; SSE2-LABEL: v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; SSE2-NEXT: movd %xmm1, %ecx -; SSE2-NEXT: xorl %esi, %esi -; SSE2-NEXT: subl %eax, %ecx -; SSE2-NEXT: cmovbl %esi, %ecx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm2, %ecx -; SSE2-NEXT: subl %eax, %ecx -; SSE2-NEXT: cmovbl %esi, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: movd %xmm3, %eax -; SSE2-NEXT: movd %xmm0, %ecx -; SSE2-NEXT: subl %eax, %ecx -; SSE2-NEXT: cmovbl %esi, %ecx -; 
SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; SSE2-NEXT: movd %xmm3, %eax -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-NEXT: movd %xmm0, %ecx -; SSE2-NEXT: subl %eax, %ecx -; SSE2-NEXT: cmovbl %esi, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: movq %xmm0, (%rdx) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v4i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] -; SSSE3-NEXT: movd %xmm2, %eax -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; SSSE3-NEXT: movd %xmm1, %ecx -; SSSE3-NEXT: xorl %esi, %esi -; SSSE3-NEXT: subl %eax, %ecx -; SSSE3-NEXT: cmovbl %esi, %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] -; SSSE3-NEXT: movd %xmm2, %eax -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSSE3-NEXT: movd %xmm2, %ecx -; SSSE3-NEXT: subl %eax, %ecx -; SSSE3-NEXT: cmovbl %esi, %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSSE3-NEXT: movd %xmm3, %eax -; SSSE3-NEXT: movd %xmm0, %ecx -; SSSE3-NEXT: subl %eax, %ecx -; SSSE3-NEXT: cmovbl %esi, %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; SSSE3-NEXT: movd %xmm3, %eax -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSSE3-NEXT: movd %xmm0, %ecx -; SSSE3-NEXT: subl %eax, %ecx -; SSSE3-NEXT: cmovbl %esi, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,10,11,14,15,14,15],zero,zero -; SSSE3-NEXT: movq %xmm1, (%rdx) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v4i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE41-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE41-NEXT: pextrd $1, %xmm3, %eax -; SSE41-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE41-NEXT: pextrd $1, %xmm2, %ecx -; SSE41-NEXT: xorl %esi, %esi -; SSE41-NEXT: subl %eax, %ecx -; SSE41-NEXT: cmovbl %esi, %ecx -; SSE41-NEXT: movd %xmm3, %eax -; SSE41-NEXT: movd %xmm2, %edi -; SSE41-NEXT: subl %eax, %edi -; SSE41-NEXT: cmovbl %esi, %edi -; SSE41-NEXT: movd %edi, %xmm0 -; SSE41-NEXT: pinsrd $1, %ecx, %xmm0 -; SSE41-NEXT: pextrd $2, %xmm3, %eax -; SSE41-NEXT: pextrd $2, %xmm2, %ecx -; SSE41-NEXT: subl %eax, %ecx -; SSE41-NEXT: cmovbl %esi, %ecx -; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 -; SSE41-NEXT: pextrd $3, %xmm3, %eax -; SSE41-NEXT: pextrd $3, %xmm2, %ecx -; SSE41-NEXT: subl %eax, %ecx -; SSE41-NEXT: cmovbl %esi, %ecx -; SSE41-NEXT: pinsrd $3, %ecx, 
%xmm0 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: packusdw %xmm0, %xmm0 -; SSE41-NEXT: movq %xmm0, (%rdx) -; SSE41-NEXT: retq +; SSE-LABEL: v4i16: +; SSE: # %bb.0: +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: psubusw %xmm1, %xmm0 +; SSE-NEXT: movq %xmm0, (%rdx) +; SSE-NEXT: retq ; ; AVX1-LABEL: v4i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-NEXT: vpextrd $1, %xmm1, %eax -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX1-NEXT: vpextrd $1, %xmm0, %ecx -; AVX1-NEXT: xorl %esi, %esi -; AVX1-NEXT: subl %eax, %ecx -; AVX1-NEXT: cmovbl %esi, %ecx -; AVX1-NEXT: vmovd %xmm1, %eax -; AVX1-NEXT: vmovd %xmm0, %edi -; AVX1-NEXT: subl %eax, %edi -; AVX1-NEXT: cmovbl %esi, %edi -; AVX1-NEXT: vmovd %edi, %xmm2 -; AVX1-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrd $2, %xmm1, %eax -; AVX1-NEXT: vpextrd $2, %xmm0, %ecx -; AVX1-NEXT: subl %eax, %ecx -; AVX1-NEXT: cmovbl %esi, %ecx -; AVX1-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 -; AVX1-NEXT: vpextrd $3, %xmm1, %eax -; AVX1-NEXT: vpextrd $3, %xmm0, %ecx -; AVX1-NEXT: subl %eax, %ecx -; AVX1-NEXT: cmovbl %esi, %ecx -; AVX1-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, (%rdx) ; AVX1-NEXT: retq ; @@ -10098,32 +366,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-NEXT: vpextrd $1, %xmm1, %eax -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-NEXT: vpextrd $1, %xmm0, %ecx -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: subl %eax, %ecx -; AVX2-NEXT: cmovbl %esi, %ecx -; AVX2-NEXT: vmovd %xmm1, %eax -; AVX2-NEXT: vmovd %xmm0, %edi -; AVX2-NEXT: subl %eax, %edi -; AVX2-NEXT: cmovbl %esi, %edi -; AVX2-NEXT: vmovd %edi, %xmm2 -; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrd $2, %xmm1, %eax -; AVX2-NEXT: vpextrd $2, %xmm0, %ecx -; AVX2-NEXT: subl %eax, %ecx -; AVX2-NEXT: cmovbl %esi, %ecx -; AVX2-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 -; AVX2-NEXT: vpextrd $3, %xmm1, %eax -; AVX2-NEXT: vpextrd $3, %xmm0, %ecx -; AVX2-NEXT: subl %eax, %ecx -; AVX2-NEXT: cmovbl %esi, %ecx -; AVX2-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, (%rdx) ; AVX2-NEXT: retq ; @@ -10131,31 +374,8 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,1,255,255,2,3,255,255,4,5,255,255,6,7] -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpextrd $1, %xmm1, %eax -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpextrd $1, %xmm0, %ecx -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: subl %eax, %ecx -; AVX512-NEXT: cmovbl %esi, %ecx -; AVX512-NEXT: vmovd %xmm1, %eax -; AVX512-NEXT: vmovd %xmm0, %edi -; AVX512-NEXT: subl %eax, %edi -; AVX512-NEXT: 
cmovbl %esi, %edi -; AVX512-NEXT: vmovd %edi, %xmm2 -; AVX512-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 -; AVX512-NEXT: vpextrd $2, %xmm1, %eax -; AVX512-NEXT: vpextrd $2, %xmm0, %ecx -; AVX512-NEXT: subl %eax, %ecx -; AVX512-NEXT: cmovbl %esi, %ecx -; AVX512-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 -; AVX512-NEXT: vpextrd $3, %xmm1, %eax -; AVX512-NEXT: vpextrd $3, %xmm0, %ecx -; AVX512-NEXT: subl %eax, %ecx -; AVX512-NEXT: cmovbl %esi, %ecx -; AVX512-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512-NEXT: vpmovdw %xmm0, (%rdx) ; AVX512-NEXT: retq %x = load <4 x i16>, <4 x i16>* %px @@ -10166,124 +386,27 @@ } define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind { -; SSE2-LABEL: v2i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7] -; SSE2-NEXT: psllq $48, %xmm1 -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: psllq $48, %xmm0 -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: xorl %esi, %esi -; SSE2-NEXT: subq %rax, %rcx -; SSE2-NEXT: cmovbq %rsi, %rcx -; SSE2-NEXT: movq %rcx, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: subq %rax, %rcx -; SSE2-NEXT: cmovbq %rsi, %rcx -; SSE2-NEXT: movq %rcx, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE2-NEXT: psrlq $48, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: movd %xmm0, (%rdx) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v2i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,0,1,255,255,255,255,255,255,2,3] -; SSSE3-NEXT: pshufb %xmm2, %xmm1 -; SSSE3-NEXT: movq %xmm1, %rax -; SSSE3-NEXT: pshufb %xmm2, %xmm0 -; SSSE3-NEXT: movq %xmm0, %rcx -; SSSE3-NEXT: xorl %esi, %esi -; SSSE3-NEXT: subq %rax, %rcx -; SSSE3-NEXT: cmovbq %rsi, %rcx -; SSSE3-NEXT: movq %rcx, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSSE3-NEXT: movq %xmm1, %rax -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSSE3-NEXT: movq %xmm0, %rcx -; SSSE3-NEXT: subq %rax, %rcx -; SSSE3-NEXT: cmovbq %rsi, %rcx -; SSSE3-NEXT: movq %rcx, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[6,7,14,15,14,15],zero,zero,xmm2[14,15],zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: movd %xmm2, (%rdx) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; SSE41-NEXT: psllq $48, %xmm1 -; SSE41-NEXT: pextrq $1, %xmm1, %rax -; SSE41-NEXT: psllq $48, %xmm0 -; SSE41-NEXT: pextrq $1, %xmm0, %rcx -; SSE41-NEXT: xorl %esi, %esi -; SSE41-NEXT: subq %rax, %rcx -; SSE41-NEXT: cmovbq %rsi, %rcx -; SSE41-NEXT: movq %rcx, %xmm2 -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: movq %xmm0, %rcx -; 
SSE41-NEXT: subq %rax, %rcx -; SSE41-NEXT: cmovbq %rsi, %rcx -; SSE41-NEXT: movq %rcx, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,14,15],zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movd %xmm0, (%rdx) -; SSE41-NEXT: retq +; SSE-LABEL: v2i16: +; SSE: # %bb.0: +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: psubusw %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, (%rdx) +; SSE-NEXT: retq ; ; AVX1-LABEL: v2i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 -; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rcx -; AVX1-NEXT: xorl %esi, %esi -; AVX1-NEXT: subq %rax, %rcx -; AVX1-NEXT: cmovbq %rsi, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm2 -; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vmovq %xmm0, %rcx -; AVX1-NEXT: subq %rax, %rcx -; AVX1-NEXT: cmovbq %rsi, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,14,15],zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, (%rdx) ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX2-NEXT: vpextrq $1, %xmm0, %rcx -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: subq %rax, %rcx -; AVX2-NEXT: cmovbq %rsi, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm2 -; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vmovq %xmm0, %rcx -; AVX2-NEXT: subq %rax, %rcx -; AVX2-NEXT: cmovbq %rsi, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,14,15],zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, (%rdx) ; AVX2-NEXT: retq ; @@ -10291,22 +414,8 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,0,1,255,255,255,255,255,255,2,3] -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpextrq $1, %xmm1, %rax -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpextrq $1, %xmm0, %rcx -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: subq %rax, %rcx -; AVX512-NEXT: cmovbq %rsi, %rcx -; AVX512-NEXT: vmovq %rcx, %xmm2 -; AVX512-NEXT: vmovq %xmm1, %rax -; AVX512-NEXT: vmovq %xmm0, %rcx -; AVX512-NEXT: subq %rax, %rcx -; AVX512-NEXT: cmovbq %rsi, %rcx -; AVX512-NEXT: vmovq %rcx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512-NEXT: vpmovqw %xmm0, (%rdx) ; AVX512-NEXT: retq %x = load <2 x i16>, <2 x i16>* %px @@ -10317,1014 +426,59 @@ } define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind { -; SSE2-LABEL: v12i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %r15 -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r13 -; SSE2-NEXT: pushq %r12 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl $0, %eax -; SSE2-NEXT: jb .LBB11_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: .LBB11_2: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: jb .LBB11_4 -; SSE2-NEXT: # %bb.3: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB11_4: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %edx -; SSE2-NEXT: jb .LBB11_6 -; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: movl %ebx, %edx -; SSE2-NEXT: .LBB11_6: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %esi -; SSE2-NEXT: jb .LBB11_8 -; SSE2-NEXT: # %bb.7: -; SSE2-NEXT: movl %ebx, %esi -; SSE2-NEXT: .LBB11_8: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %edi -; SSE2-NEXT: jb .LBB11_10 -; SSE2-NEXT: # %bb.9: -; SSE2-NEXT: movl %ebx, %edi -; SSE2-NEXT: .LBB11_10: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %r12d -; SSE2-NEXT: jb .LBB11_12 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: movl %ebx, %r12d -; SSE2-NEXT: .LBB11_12: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %r8d -; SSE2-NEXT: jb .LBB11_14 -; SSE2-NEXT: # %bb.13: -; SSE2-NEXT: movl %ebx, %r8d -; SSE2-NEXT: .LBB11_14: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %r10d -; SSE2-NEXT: jb .LBB11_16 -; SSE2-NEXT: # %bb.15: -; SSE2-NEXT: movl %ebx, %r10d -; SSE2-NEXT: .LBB11_16: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %r13d -; SSE2-NEXT: jb .LBB11_18 -; SSE2-NEXT: # %bb.17: -; SSE2-NEXT: movl %ebx, %r13d -; SSE2-NEXT: .LBB11_18: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %r9d -; SSE2-NEXT: jb .LBB11_20 -; SSE2-NEXT: # %bb.19: -; SSE2-NEXT: movl %ebx, %r9d -; SSE2-NEXT: .LBB11_20: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %r11d -; SSE2-NEXT: jb .LBB11_22 -; SSE2-NEXT: # %bb.21: -; SSE2-NEXT: movl %ebx, %r11d -; SSE2-NEXT: .LBB11_22: -; SSE2-NEXT: movzbl %al, %r14d -; SSE2-NEXT: movzbl %cl, %r15d -; SSE2-NEXT: movzbl %dl, %edx -; SSE2-NEXT: movzbl %sil, %esi -; SSE2-NEXT: movzbl %dil, %ebx -; SSE2-NEXT: movzbl %r12b, %ebp -; SSE2-NEXT: movzbl %r8b, %edi -; SSE2-NEXT: movzbl %r10b, %r8d -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl $0, %eax -; SSE2-NEXT: jb .LBB11_24 -; SSE2-NEXT: # %bb.23: -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: .LBB11_24: -; SSE2-NEXT: movd %r14d, %xmm2 -; SSE2-NEXT: movd %r15d, %xmm3 -; SSE2-NEXT: movd %edx, %xmm5 -; SSE2-NEXT: movd %esi, %xmm0 -; SSE2-NEXT: movd %ebx, %xmm6 -; SSE2-NEXT: 
movd %ebp, %xmm4 -; SSE2-NEXT: movd %edi, %xmm7 -; SSE2-NEXT: movd %r8d, %xmm1 -; SSE2-NEXT: movzbl %r13b, %ebp -; SSE2-NEXT: movzbl %r9b, %ecx -; SSE2-NEXT: movzbl %r11b, %edx -; SSE2-NEXT: movzbl %al, %esi -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, %edi -; SSE2-NEXT: jb .LBB11_26 -; SSE2-NEXT: # %bb.25: -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: .LBB11_26: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE2-NEXT: movd %ebp, %xmm6 -; SSE2-NEXT: movd %ecx, %xmm5 -; SSE2-NEXT: movd %edx, %xmm7 -; SSE2-NEXT: movd %esi, %xmm2 -; SSE2-NEXT: movzbl %dil, %eax -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: jb .LBB11_28 -; SSE2-NEXT: # %bb.27: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB11_28: -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: jb .LBB11_30 -; SSE2-NEXT: # %bb.29: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB11_30: -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: movzbl %cl, %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: jb .LBB11_32 -; SSE2-NEXT: # %bb.31: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB11_32: -; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq 
%rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v12i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl $0, %eax -; SSSE3-NEXT: jb .LBB11_2 -; SSSE3-NEXT: # %bb.1: -; SSSE3-NEXT: movl %ecx, %eax -; SSSE3-NEXT: .LBB11_2: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB11_4 -; SSSE3-NEXT: # %bb.3: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB11_4: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %edx -; SSSE3-NEXT: jb .LBB11_6 -; SSSE3-NEXT: # %bb.5: -; SSSE3-NEXT: movl %ebx, %edx -; SSSE3-NEXT: .LBB11_6: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %esi -; SSSE3-NEXT: jb .LBB11_8 -; SSSE3-NEXT: # %bb.7: -; SSSE3-NEXT: movl %ebx, %esi -; SSSE3-NEXT: .LBB11_8: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %edi -; SSSE3-NEXT: jb .LBB11_10 -; SSSE3-NEXT: # %bb.9: -; SSSE3-NEXT: movl %ebx, %edi -; SSSE3-NEXT: .LBB11_10: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %r12d -; SSSE3-NEXT: jb .LBB11_12 -; SSSE3-NEXT: # %bb.11: -; SSSE3-NEXT: movl %ebx, %r12d -; SSSE3-NEXT: .LBB11_12: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %r8d -; SSSE3-NEXT: jb .LBB11_14 -; SSSE3-NEXT: # %bb.13: -; SSSE3-NEXT: movl %ebx, %r8d -; SSSE3-NEXT: .LBB11_14: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %r10d -; SSSE3-NEXT: jb .LBB11_16 -; SSSE3-NEXT: # %bb.15: -; SSSE3-NEXT: movl %ebx, %r10d -; SSSE3-NEXT: .LBB11_16: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %r13d -; SSSE3-NEXT: jb .LBB11_18 -; SSSE3-NEXT: # %bb.17: -; SSSE3-NEXT: movl %ebx, %r13d -; SSSE3-NEXT: .LBB11_18: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %r9d -; SSSE3-NEXT: jb .LBB11_20 -; SSSE3-NEXT: # %bb.19: -; SSSE3-NEXT: movl %ebx, %r9d -; SSSE3-NEXT: .LBB11_20: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %r11d -; SSSE3-NEXT: jb .LBB11_22 -; SSSE3-NEXT: # %bb.21: -; SSSE3-NEXT: movl %ebx, %r11d -; SSSE3-NEXT: .LBB11_22: -; SSSE3-NEXT: movzbl %al, %r14d -; SSSE3-NEXT: movzbl %cl, %r15d -; SSSE3-NEXT: movzbl %dl, %edx -; SSSE3-NEXT: movzbl %sil, %esi -; SSSE3-NEXT: movzbl %dil, %ebx -; SSSE3-NEXT: movzbl %r12b, %ebp -; SSSE3-NEXT: movzbl %r8b, %edi -; SSSE3-NEXT: movzbl %r10b, %r8d -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl $0, %eax -; SSSE3-NEXT: jb .LBB11_24 -; SSSE3-NEXT: # %bb.23: -; SSSE3-NEXT: movl %ecx, %eax -; SSSE3-NEXT: .LBB11_24: -; SSSE3-NEXT: movd %r14d, %xmm2 -; SSSE3-NEXT: movd %r15d, %xmm3 -; SSSE3-NEXT: movd %edx, %xmm5 -; SSSE3-NEXT: movd %esi, %xmm0 -; SSSE3-NEXT: movd %ebx, %xmm6 -; SSSE3-NEXT: movd %ebp, %xmm4 -; SSSE3-NEXT: movd %edi, %xmm7 -; SSSE3-NEXT: movd %r8d, %xmm1 -; SSSE3-NEXT: 
movzbl %r13b, %ebp -; SSSE3-NEXT: movzbl %r9b, %ecx -; SSSE3-NEXT: movzbl %r11b, %edx -; SSSE3-NEXT: movzbl %al, %esi -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %edi -; SSSE3-NEXT: jb .LBB11_26 -; SSSE3-NEXT: # %bb.25: -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB11_26: -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSSE3-NEXT: movd %ebp, %xmm6 -; SSSE3-NEXT: movd %ecx, %xmm5 -; SSSE3-NEXT: movd %edx, %xmm7 -; SSSE3-NEXT: movd %esi, %xmm2 -; SSSE3-NEXT: movzbl %dil, %eax -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB11_28 -; SSSE3-NEXT: # %bb.27: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB11_28: -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: movzbl %cl, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB11_30 -; SSSE3-NEXT: # %bb.29: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB11_30: -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSSE3-NEXT: movzbl %cl, %ecx -; SSSE3-NEXT: movd %ecx, %xmm4 -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB11_32 -; SSSE3-NEXT: # %bb.31: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB11_32: -; SSSE3-NEXT: movzbl %cl, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; 
SSE41-LABEL: v12i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pextrb $1, %xmm1, %edx -; SSE41-NEXT: pextrb $1, %xmm0, %ecx -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB11_2 -; SSE41-NEXT: # %bb.1: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB11_2: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pextrb $0, %xmm1, %esi -; SSE41-NEXT: pextrb $0, %xmm0, %edx -; SSE41-NEXT: subb %sil, %dl -; SSE41-NEXT: movl $0, %esi -; SSE41-NEXT: jb .LBB11_4 -; SSE41-NEXT: # %bb.3: -; SSE41-NEXT: movl %edx, %esi -; SSE41-NEXT: .LBB11_4: -; SSE41-NEXT: movzbl %sil, %edx -; SSE41-NEXT: movd %edx, %xmm2 -; SSE41-NEXT: pinsrb $1, %ecx, %xmm2 -; SSE41-NEXT: pextrb $2, %xmm1, %edx -; SSE41-NEXT: pextrb $2, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB11_6 -; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB11_6: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $2, %ecx, %xmm2 -; SSE41-NEXT: pextrb $3, %xmm1, %edx -; SSE41-NEXT: pextrb $3, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB11_8 -; SSE41-NEXT: # %bb.7: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB11_8: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $3, %ecx, %xmm2 -; SSE41-NEXT: pextrb $4, %xmm1, %edx -; SSE41-NEXT: pextrb $4, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB11_10 -; SSE41-NEXT: # %bb.9: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB11_10: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $4, %ecx, %xmm2 -; SSE41-NEXT: pextrb $5, %xmm1, %edx -; SSE41-NEXT: pextrb $5, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB11_12 -; SSE41-NEXT: # %bb.11: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB11_12: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $5, %ecx, %xmm2 -; SSE41-NEXT: pextrb $6, %xmm1, %edx -; SSE41-NEXT: pextrb $6, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB11_14 -; SSE41-NEXT: # %bb.13: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB11_14: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $6, %ecx, %xmm2 -; SSE41-NEXT: pextrb $7, %xmm1, %edx -; SSE41-NEXT: pextrb $7, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB11_16 -; SSE41-NEXT: # %bb.15: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB11_16: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $7, %ecx, %xmm2 -; SSE41-NEXT: pextrb $8, %xmm1, %edx -; SSE41-NEXT: pextrb $8, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB11_18 -; SSE41-NEXT: # %bb.17: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB11_18: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $8, %ecx, %xmm2 -; SSE41-NEXT: pextrb $9, %xmm1, %edx -; SSE41-NEXT: pextrb $9, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB11_20 -; SSE41-NEXT: # %bb.19: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB11_20: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $9, %ecx, %xmm2 -; SSE41-NEXT: pextrb $10, %xmm1, %edx -; SSE41-NEXT: pextrb $10, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB11_22 -; SSE41-NEXT: # %bb.21: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB11_22: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $10, %ecx, %xmm2 -; SSE41-NEXT: pextrb $11, %xmm1, %edx -; 
SSE41-NEXT: pextrb $11, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB11_24 -; SSE41-NEXT: # %bb.23: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB11_24: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $11, %ecx, %xmm2 -; SSE41-NEXT: pextrb $12, %xmm1, %edx -; SSE41-NEXT: pextrb $12, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB11_26 -; SSE41-NEXT: # %bb.25: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB11_26: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $12, %ecx, %xmm2 -; SSE41-NEXT: pextrb $13, %xmm1, %edx -; SSE41-NEXT: pextrb $13, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB11_28 -; SSE41-NEXT: # %bb.27: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB11_28: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $13, %ecx, %xmm2 -; SSE41-NEXT: pextrb $14, %xmm1, %edx -; SSE41-NEXT: pextrb $14, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB11_30 -; SSE41-NEXT: # %bb.29: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB11_30: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $14, %ecx, %xmm2 -; SSE41-NEXT: pextrb $15, %xmm1, %edx -; SSE41-NEXT: pextrb $15, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: jb .LBB11_32 -; SSE41-NEXT: # %bb.31: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB11_32: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $15, %eax, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: v12i8: +; SSE: # %bb.0: +; SSE-NEXT: psubusb %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: v12i8: ; AVX: # %bb.0: -; AVX-NEXT: vpextrb $1, %xmm1, %edx -; AVX-NEXT: vpextrb $1, %xmm0, %ecx -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB11_2 -; AVX-NEXT: # %bb.1: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB11_2: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpextrb $0, %xmm1, %esi -; AVX-NEXT: vpextrb $0, %xmm0, %edx -; AVX-NEXT: subb %sil, %dl -; AVX-NEXT: movl $0, %esi -; AVX-NEXT: jb .LBB11_4 -; AVX-NEXT: # %bb.3: -; AVX-NEXT: movl %edx, %esi -; AVX-NEXT: .LBB11_4: -; AVX-NEXT: movzbl %sil, %edx -; AVX-NEXT: vmovd %edx, %xmm2 -; AVX-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $2, %xmm1, %edx -; AVX-NEXT: vpextrb $2, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB11_6 -; AVX-NEXT: # %bb.5: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB11_6: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $3, %xmm1, %edx -; AVX-NEXT: vpextrb $3, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB11_8 -; AVX-NEXT: # %bb.7: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB11_8: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $4, %xmm1, %edx -; AVX-NEXT: vpextrb $4, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB11_10 -; AVX-NEXT: # %bb.9: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB11_10: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $5, %xmm1, %edx -; AVX-NEXT: vpextrb $5, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB11_12 -; AVX-NEXT: # %bb.11: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB11_12: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $6, 
%xmm1, %edx -; AVX-NEXT: vpextrb $6, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB11_14 -; AVX-NEXT: # %bb.13: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB11_14: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $7, %xmm1, %edx -; AVX-NEXT: vpextrb $7, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB11_16 -; AVX-NEXT: # %bb.15: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB11_16: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $8, %xmm1, %edx -; AVX-NEXT: vpextrb $8, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB11_18 -; AVX-NEXT: # %bb.17: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB11_18: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $9, %xmm1, %edx -; AVX-NEXT: vpextrb $9, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB11_20 -; AVX-NEXT: # %bb.19: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB11_20: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $10, %xmm1, %edx -; AVX-NEXT: vpextrb $10, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB11_22 -; AVX-NEXT: # %bb.21: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB11_22: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $11, %xmm1, %edx -; AVX-NEXT: vpextrb $11, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB11_24 -; AVX-NEXT: # %bb.23: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB11_24: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $12, %xmm1, %edx -; AVX-NEXT: vpextrb $12, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB11_26 -; AVX-NEXT: # %bb.25: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB11_26: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $13, %xmm1, %edx -; AVX-NEXT: vpextrb $13, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB11_28 -; AVX-NEXT: # %bb.27: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB11_28: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $14, %xmm1, %edx -; AVX-NEXT: vpextrb $14, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB11_30 -; AVX-NEXT: # %bb.29: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB11_30: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $15, %xmm1, %edx -; AVX-NEXT: vpextrb $15, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: jb .LBB11_32 -; AVX-NEXT: # %bb.31: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB11_32: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %z = call <12 x i8> @llvm.usub.sat.v12i8(<12 x i8> %x, <12 x i8> %y) ret <12 x i8> %z } define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind { -; SSE2-LABEL: v12i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm2 -; SSE2-NEXT: movdqa 16(%rdi), %xmm0 -; SSE2-NEXT: movdqa (%rsi), %xmm3 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pextrw $7, %xmm3, %ecx -; SSE2-NEXT: pextrw $7, %xmm2, %esi -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: subw %cx, %si -; 
SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm4 -; SSE2-NEXT: pextrw $6, %xmm3, %ecx -; SSE2-NEXT: pextrw $6, %xmm2, %esi -; SSE2-NEXT: subw %cx, %si -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE2-NEXT: pextrw $5, %xmm3, %ecx -; SSE2-NEXT: pextrw $5, %xmm2, %esi -; SSE2-NEXT: subw %cx, %si -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm6 -; SSE2-NEXT: pextrw $4, %xmm3, %ecx -; SSE2-NEXT: pextrw $4, %xmm2, %esi -; SSE2-NEXT: subw %cx, %si -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE2-NEXT: pextrw $3, %xmm3, %ecx -; SSE2-NEXT: pextrw $3, %xmm2, %esi -; SSE2-NEXT: subw %cx, %si -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm5 -; SSE2-NEXT: pextrw $2, %xmm3, %ecx -; SSE2-NEXT: pextrw $2, %xmm2, %esi -; SSE2-NEXT: subw %cx, %si -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm6 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSE2-NEXT: pextrw $1, %xmm3, %ecx -; SSE2-NEXT: pextrw $1, %xmm2, %esi -; SSE2-NEXT: subw %cx, %si -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm5 -; SSE2-NEXT: movd %xmm3, %ecx -; SSE2-NEXT: movd %xmm2, %esi -; SSE2-NEXT: subw %cx, %si -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %esi, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSE2-NEXT: pextrw $1, %xmm1, %ecx -; SSE2-NEXT: pextrw $1, %xmm0, %esi -; SSE2-NEXT: subw %cx, %si -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: movd %xmm1, %ecx -; SSE2-NEXT: movd %xmm0, %edi -; SSE2-NEXT: subw %cx, %di -; SSE2-NEXT: cmovbl %eax, %edi -; SSE2-NEXT: movd %edi, %xmm3 -; SSE2-NEXT: pinsrw $1, %esi, %xmm3 -; SSE2-NEXT: pextrw $2, %xmm1, %ecx -; SSE2-NEXT: pextrw $2, %xmm0, %esi -; SSE2-NEXT: subw %cx, %si -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: pinsrw $2, %esi, %xmm3 -; SSE2-NEXT: pextrw $3, %xmm1, %ecx -; SSE2-NEXT: pextrw $3, %xmm0, %esi -; SSE2-NEXT: subw %cx, %si -; SSE2-NEXT: cmovbl %eax, %esi -; SSE2-NEXT: pinsrw $3, %esi, %xmm3 -; SSE2-NEXT: movq %xmm3, 16(%rdx) -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: retq +; SSE-LABEL: v12i16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: psubusw (%rsi), %xmm0 +; SSE-NEXT: psubusw 16(%rsi), %xmm1 +; SSE-NEXT: movq %xmm1, 16(%rdx) +; SSE-NEXT: movdqa %xmm0, (%rdx) +; SSE-NEXT: retq ; -; SSSE3-LABEL: v12i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa (%rdi), %xmm2 -; SSSE3-NEXT: movdqa 16(%rdi), %xmm0 -; SSSE3-NEXT: movdqa (%rsi), %xmm3 -; SSSE3-NEXT: movdqa 16(%rsi), %xmm1 -; SSSE3-NEXT: pextrw $7, %xmm3, %ecx -; SSSE3-NEXT: pextrw $7, %xmm2, %esi -; SSSE3-NEXT: xorl %eax, %eax -; SSSE3-NEXT: subw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm4 -; SSSE3-NEXT: pextrw $6, %xmm3, %ecx -; SSSE3-NEXT: pextrw $6, %xmm2, %esi -; SSSE3-NEXT: subw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm5 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSSE3-NEXT: pextrw $5, %xmm3, %ecx -; 
SSSE3-NEXT: pextrw $5, %xmm2, %esi -; SSSE3-NEXT: subw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm6 -; SSSE3-NEXT: pextrw $4, %xmm3, %ecx -; SSSE3-NEXT: pextrw $4, %xmm2, %esi -; SSSE3-NEXT: subw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSSE3-NEXT: pextrw $3, %xmm3, %ecx -; SSSE3-NEXT: pextrw $3, %xmm2, %esi -; SSSE3-NEXT: subw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm5 -; SSSE3-NEXT: pextrw $2, %xmm3, %ecx -; SSSE3-NEXT: pextrw $2, %xmm2, %esi -; SSSE3-NEXT: subw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm6 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSSE3-NEXT: pextrw $1, %xmm3, %ecx -; SSSE3-NEXT: pextrw $1, %xmm2, %esi -; SSSE3-NEXT: subw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm5 -; SSSE3-NEXT: movd %xmm3, %ecx -; SSSE3-NEXT: movd %xmm2, %esi -; SSSE3-NEXT: subw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %esi, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSSE3-NEXT: pextrw $1, %xmm1, %ecx -; SSSE3-NEXT: pextrw $1, %xmm0, %esi -; SSSE3-NEXT: subw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: movd %xmm1, %ecx -; SSSE3-NEXT: movd %xmm0, %edi -; SSSE3-NEXT: subw %cx, %di -; SSSE3-NEXT: cmovbl %eax, %edi -; SSSE3-NEXT: movd %edi, %xmm3 -; SSSE3-NEXT: pinsrw $1, %esi, %xmm3 -; SSSE3-NEXT: pextrw $2, %xmm1, %ecx -; SSSE3-NEXT: pextrw $2, %xmm0, %esi -; SSSE3-NEXT: subw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: pinsrw $2, %esi, %xmm3 -; SSSE3-NEXT: pextrw $3, %xmm1, %ecx -; SSSE3-NEXT: pextrw $3, %xmm0, %esi -; SSSE3-NEXT: subw %cx, %si -; SSSE3-NEXT: cmovbl %eax, %esi -; SSSE3-NEXT: pinsrw $3, %esi, %xmm3 -; SSSE3-NEXT: movq %xmm3, 16(%rdx) -; SSSE3-NEXT: movdqa %xmm2, (%rdx) -; SSSE3-NEXT: retq +; AVX1-LABEL: v12i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vpsubusw (%rsi), %xmm0, %xmm0 +; AVX1-NEXT: vpsubusw 16(%rsi), %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm1, 16(%rdx) +; AVX1-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-NEXT: retq ; -; SSE41-LABEL: v12i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm3 -; SSE41-NEXT: movdqa 16(%rdi), %xmm0 -; SSE41-NEXT: movdqa (%rsi), %xmm4 -; SSE41-NEXT: movdqa 16(%rsi), %xmm1 -; SSE41-NEXT: pextrw $1, %xmm4, %ecx -; SSE41-NEXT: pextrw $1, %xmm3, %esi -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: subw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: movd %xmm4, %ecx -; SSE41-NEXT: movd %xmm3, %edi -; SSE41-NEXT: subw %cx, %di -; SSE41-NEXT: cmovbl %eax, %edi -; SSE41-NEXT: movd %edi, %xmm2 -; SSE41-NEXT: pinsrw $1, %esi, %xmm2 -; SSE41-NEXT: pextrw $2, %xmm4, %ecx -; SSE41-NEXT: pextrw $2, %xmm3, %esi -; SSE41-NEXT: subw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: pinsrw $2, %esi, %xmm2 -; SSE41-NEXT: pextrw $3, %xmm4, %ecx -; SSE41-NEXT: pextrw $3, %xmm3, %esi -; SSE41-NEXT: subw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: pinsrw $3, %esi, %xmm2 -; SSE41-NEXT: pextrw $4, %xmm4, %ecx -; SSE41-NEXT: pextrw $4, %xmm3, %esi -; 
SSE41-NEXT: subw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: pinsrw $4, %esi, %xmm2 -; SSE41-NEXT: pextrw $5, %xmm4, %ecx -; SSE41-NEXT: pextrw $5, %xmm3, %esi -; SSE41-NEXT: subw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: pinsrw $5, %esi, %xmm2 -; SSE41-NEXT: pextrw $6, %xmm4, %ecx -; SSE41-NEXT: pextrw $6, %xmm3, %esi -; SSE41-NEXT: subw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: pinsrw $6, %esi, %xmm2 -; SSE41-NEXT: pextrw $7, %xmm4, %ecx -; SSE41-NEXT: pextrw $7, %xmm3, %esi -; SSE41-NEXT: subw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: pinsrw $7, %esi, %xmm2 -; SSE41-NEXT: pextrw $1, %xmm1, %ecx -; SSE41-NEXT: pextrw $1, %xmm0, %esi -; SSE41-NEXT: subw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: movd %xmm1, %ecx -; SSE41-NEXT: movd %xmm0, %edi -; SSE41-NEXT: subw %cx, %di -; SSE41-NEXT: cmovbl %eax, %edi -; SSE41-NEXT: movd %edi, %xmm3 -; SSE41-NEXT: pinsrw $1, %esi, %xmm3 -; SSE41-NEXT: pextrw $2, %xmm1, %ecx -; SSE41-NEXT: pextrw $2, %xmm0, %esi -; SSE41-NEXT: subw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: pinsrw $2, %esi, %xmm3 -; SSE41-NEXT: pextrw $3, %xmm1, %ecx -; SSE41-NEXT: pextrw $3, %xmm0, %esi -; SSE41-NEXT: subw %cx, %si -; SSE41-NEXT: cmovbl %eax, %esi -; SSE41-NEXT: pinsrw $3, %esi, %xmm3 -; SSE41-NEXT: movq %xmm3, 16(%rdx) -; SSE41-NEXT: movdqa %xmm2, (%rdx) -; SSE41-NEXT: retq +; AVX2-LABEL: v12i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpsubusw (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovq %xmm1, 16(%rdx) +; AVX2-NEXT: vmovdqa %xmm0, (%rdx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; -; AVX-LABEL: v12i16: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: pushq %rbx -; AVX-NEXT: vmovdqa (%rsi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rsi), %xmm2 -; AVX-NEXT: vpextrw $1, %xmm2, %eax -; AVX-NEXT: vmovdqa (%rdi), %xmm1 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX-NEXT: vpextrw $1, %xmm3, %ecx -; AVX-NEXT: xorl %r8d, %r8d -; AVX-NEXT: subw %ax, %cx -; AVX-NEXT: cmovbl %r8d, %ecx -; AVX-NEXT: vmovd %xmm2, %eax -; AVX-NEXT: vmovd %xmm3, %esi -; AVX-NEXT: subw %ax, %si -; AVX-NEXT: cmovbl %r8d, %esi -; AVX-NEXT: vmovd %esi, %xmm4 -; AVX-NEXT: vpinsrw $1, %ecx, %xmm4, %xmm4 -; AVX-NEXT: vpextrw $2, %xmm2, %eax -; AVX-NEXT: vpextrw $2, %xmm3, %ecx -; AVX-NEXT: subw %ax, %cx -; AVX-NEXT: cmovbl %r8d, %ecx -; AVX-NEXT: vpinsrw $2, %ecx, %xmm4, %xmm4 -; AVX-NEXT: vpextrw $3, %xmm2, %eax -; AVX-NEXT: vpextrw $3, %xmm3, %ecx -; AVX-NEXT: subw %ax, %cx -; AVX-NEXT: cmovbl %r8d, %ecx -; AVX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm4 -; AVX-NEXT: vpextrw $4, %xmm2, %eax -; AVX-NEXT: vpextrw $4, %xmm3, %ecx -; AVX-NEXT: subw %ax, %cx -; AVX-NEXT: cmovbl %r8d, %ecx -; AVX-NEXT: vpinsrw $4, %ecx, %xmm4, %xmm4 -; AVX-NEXT: vpextrw $5, %xmm2, %eax -; AVX-NEXT: vpextrw $5, %xmm3, %ecx -; AVX-NEXT: subw %ax, %cx -; AVX-NEXT: cmovbl %r8d, %ecx -; AVX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 -; AVX-NEXT: vpextrw $6, %xmm2, %eax -; AVX-NEXT: vpextrw $6, %xmm3, %ecx -; AVX-NEXT: subw %ax, %cx -; AVX-NEXT: cmovbl %r8d, %ecx -; AVX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4 -; AVX-NEXT: vpextrw $7, %xmm2, %eax -; AVX-NEXT: vpextrw $7, %xmm3, %ecx -; AVX-NEXT: subw %ax, %cx -; AVX-NEXT: cmovbl %r8d, %ecx -; AVX-NEXT: vpinsrw $7, %ecx, %xmm4, %xmm2 -; AVX-NEXT: vpextrw $7, %xmm0, %eax -; AVX-NEXT: vpextrw $7, %xmm1, %r9d -; AVX-NEXT: subw %ax, %r9w -; AVX-NEXT: cmovbl %r8d, %r9d -; AVX-NEXT: vpextrw $6, %xmm0, %eax -; AVX-NEXT: vpextrw $6, %xmm1, %r10d -; AVX-NEXT: subw 
%ax, %r10w -; AVX-NEXT: cmovbl %r8d, %r10d -; AVX-NEXT: vpextrw $5, %xmm0, %eax -; AVX-NEXT: vpextrw $5, %xmm1, %edi -; AVX-NEXT: subw %ax, %di -; AVX-NEXT: cmovbl %r8d, %edi -; AVX-NEXT: vpextrw $4, %xmm0, %ecx -; AVX-NEXT: vpextrw $4, %xmm1, %eax -; AVX-NEXT: subw %cx, %ax -; AVX-NEXT: cmovbl %r8d, %eax -; AVX-NEXT: vpextrw $3, %xmm0, %esi -; AVX-NEXT: vpextrw $3, %xmm1, %ecx -; AVX-NEXT: subw %si, %cx -; AVX-NEXT: cmovbl %r8d, %ecx -; AVX-NEXT: vpextrw $2, %xmm0, %r11d -; AVX-NEXT: vpextrw $2, %xmm1, %esi -; AVX-NEXT: subw %r11w, %si -; AVX-NEXT: cmovbl %r8d, %esi -; AVX-NEXT: vpextrw $1, %xmm0, %r11d -; AVX-NEXT: vpextrw $1, %xmm1, %ebx -; AVX-NEXT: subw %r11w, %bx -; AVX-NEXT: cmovbl %r8d, %ebx -; AVX-NEXT: vmovd %xmm0, %r11d -; AVX-NEXT: vmovd %xmm1, %ebp -; AVX-NEXT: subw %r11w, %bp -; AVX-NEXT: cmovbl %r8d, %ebp -; AVX-NEXT: vmovq %xmm2, 16(%rdx) -; AVX-NEXT: vmovd %ebp, %xmm0 -; AVX-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $2, %esi, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $6, %r10d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: popq %rbx -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; AVX512-LABEL: v12i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vpsubusw (%rsi), %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmovq %xmm1, 16(%rdx) +; AVX512-NEXT: vmovdqa %xmm0, (%rdx) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %x = load <12 x i16>, <12 x i16>* %px %y = load <12 x i16>, <12 x i16>* %py %z = call <12 x i16> @llvm.usub.sat.v12i16(<12 x i16> %x, <12 x i16> %y) @@ -11391,713 +545,26 @@ ; Promotion define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { -; SSE2-LABEL: v16i4: -; SSE2: # %bb.0: -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %r15 -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r13 -; SSE2-NEXT: pushq %r12 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: psllw $4, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: psllw $4, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl $0, %eax -; SSE2-NEXT: jb .LBB15_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: .LBB15_2: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: jb .LBB15_4 -; SSE2-NEXT: # %bb.3: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB15_4: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %edx -; SSE2-NEXT: jb .LBB15_6 -; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: movl %ebx, %edx -; SSE2-NEXT: .LBB15_6: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %esi -; SSE2-NEXT: jb .LBB15_8 -; SSE2-NEXT: # %bb.7: -; SSE2-NEXT: movl %ebx, %esi -; SSE2-NEXT: .LBB15_8: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %edi -; SSE2-NEXT: jb .LBB15_10 -; SSE2-NEXT: # %bb.9: -; SSE2-NEXT: movl %ebx, %edi -; SSE2-NEXT: .LBB15_10: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %r12d 
-; SSE2-NEXT: jb .LBB15_12 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: movl %ebx, %r12d -; SSE2-NEXT: .LBB15_12: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %r8d -; SSE2-NEXT: jb .LBB15_14 -; SSE2-NEXT: # %bb.13: -; SSE2-NEXT: movl %ebx, %r8d -; SSE2-NEXT: .LBB15_14: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %r10d -; SSE2-NEXT: jb .LBB15_16 -; SSE2-NEXT: # %bb.15: -; SSE2-NEXT: movl %ebx, %r10d -; SSE2-NEXT: .LBB15_16: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %r13d -; SSE2-NEXT: jb .LBB15_18 -; SSE2-NEXT: # %bb.17: -; SSE2-NEXT: movl %ebx, %r13d -; SSE2-NEXT: .LBB15_18: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %r9d -; SSE2-NEXT: jb .LBB15_20 -; SSE2-NEXT: # %bb.19: -; SSE2-NEXT: movl %ebx, %r9d -; SSE2-NEXT: .LBB15_20: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %r11d -; SSE2-NEXT: jb .LBB15_22 -; SSE2-NEXT: # %bb.21: -; SSE2-NEXT: movl %ebx, %r11d -; SSE2-NEXT: .LBB15_22: -; SSE2-NEXT: movzbl %al, %r14d -; SSE2-NEXT: movzbl %cl, %r15d -; SSE2-NEXT: movzbl %dl, %edx -; SSE2-NEXT: movzbl %sil, %esi -; SSE2-NEXT: movzbl %dil, %ebx -; SSE2-NEXT: movzbl %r12b, %ebp -; SSE2-NEXT: movzbl %r8b, %edi -; SSE2-NEXT: movzbl %r10b, %r8d -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl $0, %eax -; SSE2-NEXT: jb .LBB15_24 -; SSE2-NEXT: # %bb.23: -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: .LBB15_24: -; SSE2-NEXT: movd %r14d, %xmm2 -; SSE2-NEXT: movd %r15d, %xmm3 -; SSE2-NEXT: movd %edx, %xmm5 -; SSE2-NEXT: movd %esi, %xmm0 -; SSE2-NEXT: movd %ebx, %xmm6 -; SSE2-NEXT: movd %ebp, %xmm4 -; SSE2-NEXT: movd %edi, %xmm7 -; SSE2-NEXT: movd %r8d, %xmm1 -; SSE2-NEXT: movzbl %r13b, %ebp -; SSE2-NEXT: movzbl %r9b, %ecx -; SSE2-NEXT: movzbl %r11b, %edx -; SSE2-NEXT: movzbl %al, %esi -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, %edi -; SSE2-NEXT: jb .LBB15_26 -; SSE2-NEXT: # %bb.25: -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: .LBB15_26: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE2-NEXT: movd %ebp, %xmm6 -; SSE2-NEXT: movd %ecx, %xmm5 -; SSE2-NEXT: movd %edx, %xmm7 -; SSE2-NEXT: movd %esi, %xmm2 -; SSE2-NEXT: movzbl %dil, %eax -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: jb .LBB15_28 -; SSE2-NEXT: # %bb.27: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB15_28: -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpcklbw 
{{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: jb .LBB15_30 -; SSE2-NEXT: # %bb.29: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB15_30: -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: movzbl %cl, %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: jb .LBB15_32 -; SSE2-NEXT: # %bb.31: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB15_32: -; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v16i4: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: psllw $4, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: psllw $4, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl $0, %eax -; SSSE3-NEXT: jb .LBB15_2 -; SSSE3-NEXT: # %bb.1: -; SSSE3-NEXT: movl %ecx, %eax -; SSSE3-NEXT: .LBB15_2: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB15_4 -; SSSE3-NEXT: # %bb.3: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB15_4: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %edx -; SSSE3-NEXT: jb .LBB15_6 -; SSSE3-NEXT: # %bb.5: -; SSSE3-NEXT: movl %ebx, %edx -; SSSE3-NEXT: .LBB15_6: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %esi -; SSSE3-NEXT: jb .LBB15_8 -; SSSE3-NEXT: # %bb.7: -; SSSE3-NEXT: movl %ebx, %esi -; SSSE3-NEXT: .LBB15_8: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %edi -; SSSE3-NEXT: jb .LBB15_10 -; SSSE3-NEXT: # %bb.9: -; SSSE3-NEXT: movl %ebx, %edi 
-; SSSE3-NEXT: .LBB15_10: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %r12d -; SSSE3-NEXT: jb .LBB15_12 -; SSSE3-NEXT: # %bb.11: -; SSSE3-NEXT: movl %ebx, %r12d -; SSSE3-NEXT: .LBB15_12: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %r8d -; SSSE3-NEXT: jb .LBB15_14 -; SSSE3-NEXT: # %bb.13: -; SSSE3-NEXT: movl %ebx, %r8d -; SSSE3-NEXT: .LBB15_14: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %r10d -; SSSE3-NEXT: jb .LBB15_16 -; SSSE3-NEXT: # %bb.15: -; SSSE3-NEXT: movl %ebx, %r10d -; SSSE3-NEXT: .LBB15_16: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %r13d -; SSSE3-NEXT: jb .LBB15_18 -; SSSE3-NEXT: # %bb.17: -; SSSE3-NEXT: movl %ebx, %r13d -; SSSE3-NEXT: .LBB15_18: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %r9d -; SSSE3-NEXT: jb .LBB15_20 -; SSSE3-NEXT: # %bb.19: -; SSSE3-NEXT: movl %ebx, %r9d -; SSSE3-NEXT: .LBB15_20: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %r11d -; SSSE3-NEXT: jb .LBB15_22 -; SSSE3-NEXT: # %bb.21: -; SSSE3-NEXT: movl %ebx, %r11d -; SSSE3-NEXT: .LBB15_22: -; SSSE3-NEXT: movzbl %al, %r14d -; SSSE3-NEXT: movzbl %cl, %r15d -; SSSE3-NEXT: movzbl %dl, %edx -; SSSE3-NEXT: movzbl %sil, %esi -; SSSE3-NEXT: movzbl %dil, %ebx -; SSSE3-NEXT: movzbl %r12b, %ebp -; SSSE3-NEXT: movzbl %r8b, %edi -; SSSE3-NEXT: movzbl %r10b, %r8d -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl $0, %eax -; SSSE3-NEXT: jb .LBB15_24 -; SSSE3-NEXT: # %bb.23: -; SSSE3-NEXT: movl %ecx, %eax -; SSSE3-NEXT: .LBB15_24: -; SSSE3-NEXT: movd %r14d, %xmm2 -; SSSE3-NEXT: movd %r15d, %xmm3 -; SSSE3-NEXT: movd %edx, %xmm5 -; SSSE3-NEXT: movd %esi, %xmm0 -; SSSE3-NEXT: movd %ebx, %xmm6 -; SSSE3-NEXT: movd %ebp, %xmm4 -; SSSE3-NEXT: movd %edi, %xmm7 -; SSSE3-NEXT: movd %r8d, %xmm1 -; SSSE3-NEXT: movzbl %r13b, %ebp -; SSSE3-NEXT: movzbl %r9b, %ecx -; SSSE3-NEXT: movzbl %r11b, %edx -; SSSE3-NEXT: movzbl %al, %esi -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %edi -; SSSE3-NEXT: jb .LBB15_26 -; SSSE3-NEXT: # %bb.25: -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB15_26: -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSSE3-NEXT: movd %ebp, %xmm6 -; SSSE3-NEXT: movd %ecx, %xmm5 -; SSSE3-NEXT: movd %edx, %xmm7 -; SSSE3-NEXT: movd %esi, %xmm2 -; SSSE3-NEXT: movzbl %dil, %eax -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB15_28 -; SSSE3-NEXT: # %bb.27: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB15_28: -; 
SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: movzbl %cl, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB15_30 -; SSSE3-NEXT: # %bb.29: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB15_30: -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSSE3-NEXT: movzbl %cl, %ecx -; SSSE3-NEXT: movd %ecx, %xmm4 -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB15_32 -; SSSE3-NEXT: # %bb.31: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB15_32: -; SSSE3-NEXT: movzbl %cl, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v16i4: -; SSE41: # %bb.0: -; SSE41-NEXT: psllw $4, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pextrb $1, %xmm1, %edx -; SSE41-NEXT: psllw $4, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pextrb $1, %xmm0, %ecx -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB15_2 -; SSE41-NEXT: # %bb.1: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB15_2: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pextrb $0, %xmm1, %esi -; SSE41-NEXT: pextrb $0, %xmm0, %edx -; SSE41-NEXT: subb %sil, %dl -; SSE41-NEXT: movl $0, %esi -; SSE41-NEXT: jb .LBB15_4 -; SSE41-NEXT: # %bb.3: -; SSE41-NEXT: movl %edx, %esi -; SSE41-NEXT: .LBB15_4: -; SSE41-NEXT: movzbl %sil, %edx -; SSE41-NEXT: movd %edx, %xmm2 -; SSE41-NEXT: pinsrb $1, %ecx, %xmm2 -; SSE41-NEXT: pextrb $2, %xmm1, %edx -; SSE41-NEXT: pextrb $2, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB15_6 -; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB15_6: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $2, %ecx, %xmm2 -; SSE41-NEXT: pextrb $3, %xmm1, %edx -; SSE41-NEXT: pextrb $3, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl 
-; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB15_8 -; SSE41-NEXT: # %bb.7: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB15_8: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $3, %ecx, %xmm2 -; SSE41-NEXT: pextrb $4, %xmm1, %edx -; SSE41-NEXT: pextrb $4, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB15_10 -; SSE41-NEXT: # %bb.9: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB15_10: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $4, %ecx, %xmm2 -; SSE41-NEXT: pextrb $5, %xmm1, %edx -; SSE41-NEXT: pextrb $5, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB15_12 -; SSE41-NEXT: # %bb.11: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB15_12: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $5, %ecx, %xmm2 -; SSE41-NEXT: pextrb $6, %xmm1, %edx -; SSE41-NEXT: pextrb $6, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB15_14 -; SSE41-NEXT: # %bb.13: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB15_14: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $6, %ecx, %xmm2 -; SSE41-NEXT: pextrb $7, %xmm1, %edx -; SSE41-NEXT: pextrb $7, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB15_16 -; SSE41-NEXT: # %bb.15: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB15_16: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $7, %ecx, %xmm2 -; SSE41-NEXT: pextrb $8, %xmm1, %edx -; SSE41-NEXT: pextrb $8, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB15_18 -; SSE41-NEXT: # %bb.17: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB15_18: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $8, %ecx, %xmm2 -; SSE41-NEXT: pextrb $9, %xmm1, %edx -; SSE41-NEXT: pextrb $9, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB15_20 -; SSE41-NEXT: # %bb.19: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB15_20: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $9, %ecx, %xmm2 -; SSE41-NEXT: pextrb $10, %xmm1, %edx -; SSE41-NEXT: pextrb $10, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB15_22 -; SSE41-NEXT: # %bb.21: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB15_22: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $10, %ecx, %xmm2 -; SSE41-NEXT: pextrb $11, %xmm1, %edx -; SSE41-NEXT: pextrb $11, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB15_24 -; SSE41-NEXT: # %bb.23: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB15_24: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $11, %ecx, %xmm2 -; SSE41-NEXT: pextrb $12, %xmm1, %edx -; SSE41-NEXT: pextrb $12, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB15_26 -; SSE41-NEXT: # %bb.25: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB15_26: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $12, %ecx, %xmm2 -; SSE41-NEXT: pextrb $13, %xmm1, %edx -; SSE41-NEXT: pextrb $13, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB15_28 -; SSE41-NEXT: # %bb.27: -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB15_28: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $13, %ecx, %xmm2 -; SSE41-NEXT: pextrb $14, %xmm1, %edx -; SSE41-NEXT: pextrb $14, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: jb .LBB15_30 -; SSE41-NEXT: # %bb.29: -; 
SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: .LBB15_30: -; SSE41-NEXT: movzbl %dl, %ecx -; SSE41-NEXT: pinsrb $14, %ecx, %xmm2 -; SSE41-NEXT: pextrb $15, %xmm1, %edx -; SSE41-NEXT: pextrb $15, %xmm0, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: jb .LBB15_32 -; SSE41-NEXT: # %bb.31: -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB15_32: -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $15, %eax, %xmm2 -; SSE41-NEXT: psrlw $4, %xmm2 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: v16i4: +; SSE: # %bb.0: +; SSE-NEXT: psllw $4, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: psllw $4, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: psubusb %xmm1, %xmm0 +; SSE-NEXT: psrlw $4, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: v16i4: ; AVX: # %bb.0: ; AVX-NEXT: vpsllw $4, %xmm1, %xmm1 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $1, %xmm1, %edx ; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $1, %xmm0, %ecx -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB15_2 -; AVX-NEXT: # %bb.1: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB15_2: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpextrb $0, %xmm1, %esi -; AVX-NEXT: vpextrb $0, %xmm0, %edx -; AVX-NEXT: subb %sil, %dl -; AVX-NEXT: movl $0, %esi -; AVX-NEXT: jb .LBB15_4 -; AVX-NEXT: # %bb.3: -; AVX-NEXT: movl %edx, %esi -; AVX-NEXT: .LBB15_4: -; AVX-NEXT: movzbl %sil, %edx -; AVX-NEXT: vmovd %edx, %xmm2 -; AVX-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $2, %xmm1, %edx -; AVX-NEXT: vpextrb $2, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB15_6 -; AVX-NEXT: # %bb.5: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB15_6: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $3, %xmm1, %edx -; AVX-NEXT: vpextrb $3, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB15_8 -; AVX-NEXT: # %bb.7: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB15_8: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $4, %xmm1, %edx -; AVX-NEXT: vpextrb $4, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB15_10 -; AVX-NEXT: # %bb.9: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB15_10: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $5, %xmm1, %edx -; AVX-NEXT: vpextrb $5, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB15_12 -; AVX-NEXT: # %bb.11: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB15_12: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $6, %xmm1, %edx -; AVX-NEXT: vpextrb $6, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB15_14 -; AVX-NEXT: # %bb.13: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB15_14: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $7, %xmm1, %edx -; AVX-NEXT: vpextrb $7, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB15_16 -; AVX-NEXT: # %bb.15: -; AVX-NEXT: movl %ecx, %edx -; 
AVX-NEXT: .LBB15_16: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $8, %xmm1, %edx -; AVX-NEXT: vpextrb $8, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB15_18 -; AVX-NEXT: # %bb.17: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB15_18: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $9, %xmm1, %edx -; AVX-NEXT: vpextrb $9, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB15_20 -; AVX-NEXT: # %bb.19: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB15_20: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $10, %xmm1, %edx -; AVX-NEXT: vpextrb $10, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB15_22 -; AVX-NEXT: # %bb.21: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB15_22: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $11, %xmm1, %edx -; AVX-NEXT: vpextrb $11, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB15_24 -; AVX-NEXT: # %bb.23: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB15_24: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $12, %xmm1, %edx -; AVX-NEXT: vpextrb $12, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB15_26 -; AVX-NEXT: # %bb.25: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB15_26: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $13, %xmm1, %edx -; AVX-NEXT: vpextrb $13, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB15_28 -; AVX-NEXT: # %bb.27: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB15_28: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $14, %xmm1, %edx -; AVX-NEXT: vpextrb $14, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: jb .LBB15_30 -; AVX-NEXT: # %bb.29: -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: .LBB15_30: -; AVX-NEXT: movzbl %dl, %ecx -; AVX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $15, %xmm1, %edx -; AVX-NEXT: vpextrb $15, %xmm0, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: jb .LBB15_32 -; AVX-NEXT: # %bb.31: -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB15_32: -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq @@ -12106,713 +573,26 @@ } define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind { -; SSE2-LABEL: v16i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %r15 -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r13 -; SSE2-NEXT: pushq %r12 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: psllw $7, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl $0, %eax -; SSE2-NEXT: jb .LBB16_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: .LBB16_2: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), 
%dl -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: jb .LBB16_4 -; SSE2-NEXT: # %bb.3: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB16_4: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %edx -; SSE2-NEXT: jb .LBB16_6 -; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: movl %ebx, %edx -; SSE2-NEXT: .LBB16_6: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %esi -; SSE2-NEXT: jb .LBB16_8 -; SSE2-NEXT: # %bb.7: -; SSE2-NEXT: movl %ebx, %esi -; SSE2-NEXT: .LBB16_8: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %edi -; SSE2-NEXT: jb .LBB16_10 -; SSE2-NEXT: # %bb.9: -; SSE2-NEXT: movl %ebx, %edi -; SSE2-NEXT: .LBB16_10: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %r12d -; SSE2-NEXT: jb .LBB16_12 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: movl %ebx, %r12d -; SSE2-NEXT: .LBB16_12: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %r8d -; SSE2-NEXT: jb .LBB16_14 -; SSE2-NEXT: # %bb.13: -; SSE2-NEXT: movl %ebx, %r8d -; SSE2-NEXT: .LBB16_14: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %r10d -; SSE2-NEXT: jb .LBB16_16 -; SSE2-NEXT: # %bb.15: -; SSE2-NEXT: movl %ebx, %r10d -; SSE2-NEXT: .LBB16_16: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %r13d -; SSE2-NEXT: jb .LBB16_18 -; SSE2-NEXT: # %bb.17: -; SSE2-NEXT: movl %ebx, %r13d -; SSE2-NEXT: .LBB16_18: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %r9d -; SSE2-NEXT: jb .LBB16_20 -; SSE2-NEXT: # %bb.19: -; SSE2-NEXT: movl %ebx, %r9d -; SSE2-NEXT: .LBB16_20: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl $0, %r11d -; SSE2-NEXT: jb .LBB16_22 -; SSE2-NEXT: # %bb.21: -; SSE2-NEXT: movl %ebx, %r11d -; SSE2-NEXT: .LBB16_22: -; SSE2-NEXT: movzbl %al, %r14d -; SSE2-NEXT: movzbl %cl, %r15d -; SSE2-NEXT: movzbl %dl, %edx -; SSE2-NEXT: movzbl %sil, %esi -; SSE2-NEXT: movzbl %dil, %ebx -; SSE2-NEXT: movzbl %r12b, %ebp -; SSE2-NEXT: movzbl %r8b, %edi -; SSE2-NEXT: movzbl %r10b, %r8d -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl $0, %eax -; SSE2-NEXT: jb .LBB16_24 -; SSE2-NEXT: # %bb.23: -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: .LBB16_24: -; SSE2-NEXT: movd %r14d, %xmm2 -; SSE2-NEXT: movd %r15d, %xmm3 -; SSE2-NEXT: movd %edx, %xmm5 -; SSE2-NEXT: movd %esi, %xmm0 -; SSE2-NEXT: movd %ebx, %xmm6 -; SSE2-NEXT: movd %ebp, %xmm4 -; SSE2-NEXT: movd %edi, %xmm7 -; SSE2-NEXT: movd %r8d, %xmm1 -; SSE2-NEXT: movzbl %r13b, %ebp -; SSE2-NEXT: movzbl %r9b, %ecx -; SSE2-NEXT: movzbl %r11b, %edx -; SSE2-NEXT: movzbl %al, %esi -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, %edi -; SSE2-NEXT: jb .LBB16_26 -; SSE2-NEXT: # %bb.25: -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: .LBB16_26: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = 
xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE2-NEXT: movd %ebp, %xmm6 -; SSE2-NEXT: movd %ecx, %xmm5 -; SSE2-NEXT: movd %edx, %xmm7 -; SSE2-NEXT: movd %esi, %xmm2 -; SSE2-NEXT: movzbl %dil, %eax -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: jb .LBB16_28 -; SSE2-NEXT: # %bb.27: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB16_28: -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: jb .LBB16_30 -; SSE2-NEXT: # %bb.29: -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB16_30: -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: movzbl %cl, %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: jb .LBB16_32 -; SSE2-NEXT: # %bb.31: -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB16_32: -; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: psrlw $7, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v16i1: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: psllw $7, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: psllw $7, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl $0, %eax -; SSSE3-NEXT: jb .LBB16_2 -; SSSE3-NEXT: # %bb.1: -; SSSE3-NEXT: 
movl %ecx, %eax -; SSSE3-NEXT: .LBB16_2: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB16_4 -; SSSE3-NEXT: # %bb.3: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB16_4: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %edx -; SSSE3-NEXT: jb .LBB16_6 -; SSSE3-NEXT: # %bb.5: -; SSSE3-NEXT: movl %ebx, %edx -; SSSE3-NEXT: .LBB16_6: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %esi -; SSSE3-NEXT: jb .LBB16_8 -; SSSE3-NEXT: # %bb.7: -; SSSE3-NEXT: movl %ebx, %esi -; SSSE3-NEXT: .LBB16_8: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %edi -; SSSE3-NEXT: jb .LBB16_10 -; SSSE3-NEXT: # %bb.9: -; SSSE3-NEXT: movl %ebx, %edi -; SSSE3-NEXT: .LBB16_10: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %r12d -; SSSE3-NEXT: jb .LBB16_12 -; SSSE3-NEXT: # %bb.11: -; SSSE3-NEXT: movl %ebx, %r12d -; SSSE3-NEXT: .LBB16_12: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %r8d -; SSSE3-NEXT: jb .LBB16_14 -; SSSE3-NEXT: # %bb.13: -; SSSE3-NEXT: movl %ebx, %r8d -; SSSE3-NEXT: .LBB16_14: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %r10d -; SSSE3-NEXT: jb .LBB16_16 -; SSSE3-NEXT: # %bb.15: -; SSSE3-NEXT: movl %ebx, %r10d -; SSSE3-NEXT: .LBB16_16: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %r13d -; SSSE3-NEXT: jb .LBB16_18 -; SSSE3-NEXT: # %bb.17: -; SSSE3-NEXT: movl %ebx, %r13d -; SSSE3-NEXT: .LBB16_18: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %r9d -; SSSE3-NEXT: jb .LBB16_20 -; SSSE3-NEXT: # %bb.19: -; SSSE3-NEXT: movl %ebx, %r9d -; SSSE3-NEXT: .LBB16_20: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl $0, %r11d -; SSSE3-NEXT: jb .LBB16_22 -; SSSE3-NEXT: # %bb.21: -; SSSE3-NEXT: movl %ebx, %r11d -; SSSE3-NEXT: .LBB16_22: -; SSSE3-NEXT: movzbl %al, %r14d -; SSSE3-NEXT: movzbl %cl, %r15d -; SSSE3-NEXT: movzbl %dl, %edx -; SSSE3-NEXT: movzbl %sil, %esi -; SSSE3-NEXT: movzbl %dil, %ebx -; SSSE3-NEXT: movzbl %r12b, %ebp -; SSSE3-NEXT: movzbl %r8b, %edi -; SSSE3-NEXT: movzbl %r10b, %r8d -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl $0, %eax -; SSSE3-NEXT: jb .LBB16_24 -; SSSE3-NEXT: # %bb.23: -; SSSE3-NEXT: movl %ecx, %eax -; SSSE3-NEXT: .LBB16_24: -; SSSE3-NEXT: movd %r14d, %xmm2 -; SSSE3-NEXT: movd %r15d, %xmm3 -; SSSE3-NEXT: movd %edx, %xmm5 -; SSSE3-NEXT: movd %esi, %xmm0 -; SSSE3-NEXT: movd %ebx, %xmm6 -; SSSE3-NEXT: movd %ebp, %xmm4 -; SSSE3-NEXT: movd %edi, %xmm7 -; SSSE3-NEXT: movd %r8d, %xmm1 -; SSSE3-NEXT: movzbl %r13b, %ebp -; SSSE3-NEXT: movzbl %r9b, %ecx -; SSSE3-NEXT: movzbl %r11b, %edx -; SSSE3-NEXT: movzbl %al, %esi -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %edi -; SSSE3-NEXT: jb .LBB16_26 -; SSSE3-NEXT: # %bb.25: -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB16_26: -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSSE3-NEXT: movd %ebp, %xmm6 -; SSSE3-NEXT: movd %ecx, %xmm5 -; SSSE3-NEXT: movd %edx, %xmm7 -; SSSE3-NEXT: movd %esi, %xmm2 -; SSSE3-NEXT: movzbl %dil, %eax -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB16_28 -; SSSE3-NEXT: # %bb.27: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB16_28: -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: movzbl %cl, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB16_30 -; SSSE3-NEXT: # %bb.29: -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB16_30: -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSSE3-NEXT: movzbl %cl, %ecx -; SSSE3-NEXT: movd %ecx, %xmm4 -; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: jb .LBB16_32 -; SSSE3-NEXT: # %bb.31: -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB16_32: -; SSSE3-NEXT: movzbl %cl, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: psrlw $7, %xmm0 -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v16i1: -; SSE41: # %bb.0: -; SSE41-NEXT: psllw $7, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pextrb $1, %xmm1, %edx -; SSE41-NEXT: psllw $7, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; 
SSE41-NEXT: pextrb $1, %xmm0, %ecx
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: subb %dl, %cl
-; SSE41-NEXT: movl $0, %edx
-; SSE41-NEXT: jb .LBB16_2
-; SSE41-NEXT: # %bb.1:
-; SSE41-NEXT: movl %ecx, %edx
-; SSE41-NEXT: .LBB16_2:
-; SSE41-NEXT: movzbl %dl, %ecx
-; SSE41-NEXT: pextrb $0, %xmm1, %esi
-; SSE41-NEXT: pextrb $0, %xmm0, %edx
-; SSE41-NEXT: subb %sil, %dl
-; SSE41-NEXT: movl $0, %esi
-; SSE41-NEXT: jb .LBB16_4
-; SSE41-NEXT: # %bb.3:
-; SSE41-NEXT: movl %edx, %esi
-; SSE41-NEXT: .LBB16_4:
-; SSE41-NEXT: movzbl %sil, %edx
-; SSE41-NEXT: movd %edx, %xmm2
-; SSE41-NEXT: pinsrb $1, %ecx, %xmm2
-; SSE41-NEXT: pextrb $2, %xmm1, %edx
-; SSE41-NEXT: pextrb $2, %xmm0, %ecx
-; SSE41-NEXT: subb %dl, %cl
-; SSE41-NEXT: movl $0, %edx
-; SSE41-NEXT: jb .LBB16_6
-; SSE41-NEXT: # %bb.5:
-; SSE41-NEXT: movl %ecx, %edx
-; SSE41-NEXT: .LBB16_6:
-; SSE41-NEXT: movzbl %dl, %ecx
-; SSE41-NEXT: pinsrb $2, %ecx, %xmm2
-; SSE41-NEXT: pextrb $3, %xmm1, %edx
-; SSE41-NEXT: pextrb $3, %xmm0, %ecx
-; SSE41-NEXT: subb %dl, %cl
-; SSE41-NEXT: movl $0, %edx
-; SSE41-NEXT: jb .LBB16_8
-; SSE41-NEXT: # %bb.7:
-; SSE41-NEXT: movl %ecx, %edx
-; SSE41-NEXT: .LBB16_8:
-; SSE41-NEXT: movzbl %dl, %ecx
-; SSE41-NEXT: pinsrb $3, %ecx, %xmm2
-; SSE41-NEXT: pextrb $4, %xmm1, %edx
-; SSE41-NEXT: pextrb $4, %xmm0, %ecx
-; SSE41-NEXT: subb %dl, %cl
-; SSE41-NEXT: movl $0, %edx
-; SSE41-NEXT: jb .LBB16_10
-; SSE41-NEXT: # %bb.9:
-; SSE41-NEXT: movl %ecx, %edx
-; SSE41-NEXT: .LBB16_10:
-; SSE41-NEXT: movzbl %dl, %ecx
-; SSE41-NEXT: pinsrb $4, %ecx, %xmm2
-; SSE41-NEXT: pextrb $5, %xmm1, %edx
-; SSE41-NEXT: pextrb $5, %xmm0, %ecx
-; SSE41-NEXT: subb %dl, %cl
-; SSE41-NEXT: movl $0, %edx
-; SSE41-NEXT: jb .LBB16_12
-; SSE41-NEXT: # %bb.11:
-; SSE41-NEXT: movl %ecx, %edx
-; SSE41-NEXT: .LBB16_12:
-; SSE41-NEXT: movzbl %dl, %ecx
-; SSE41-NEXT: pinsrb $5, %ecx, %xmm2
-; SSE41-NEXT: pextrb $6, %xmm1, %edx
-; SSE41-NEXT: pextrb $6, %xmm0, %ecx
-; SSE41-NEXT: subb %dl, %cl
-; SSE41-NEXT: movl $0, %edx
-; SSE41-NEXT: jb .LBB16_14
-; SSE41-NEXT: # %bb.13:
-; SSE41-NEXT: movl %ecx, %edx
-; SSE41-NEXT: .LBB16_14:
-; SSE41-NEXT: movzbl %dl, %ecx
-; SSE41-NEXT: pinsrb $6, %ecx, %xmm2
-; SSE41-NEXT: pextrb $7, %xmm1, %edx
-; SSE41-NEXT: pextrb $7, %xmm0, %ecx
-; SSE41-NEXT: subb %dl, %cl
-; SSE41-NEXT: movl $0, %edx
-; SSE41-NEXT: jb .LBB16_16
-; SSE41-NEXT: # %bb.15:
-; SSE41-NEXT: movl %ecx, %edx
-; SSE41-NEXT: .LBB16_16:
-; SSE41-NEXT: movzbl %dl, %ecx
-; SSE41-NEXT: pinsrb $7, %ecx, %xmm2
-; SSE41-NEXT: pextrb $8, %xmm1, %edx
-; SSE41-NEXT: pextrb $8, %xmm0, %ecx
-; SSE41-NEXT: subb %dl, %cl
-; SSE41-NEXT: movl $0, %edx
-; SSE41-NEXT: jb .LBB16_18
-; SSE41-NEXT: # %bb.17:
-; SSE41-NEXT: movl %ecx, %edx
-; SSE41-NEXT: .LBB16_18:
-; SSE41-NEXT: movzbl %dl, %ecx
-; SSE41-NEXT: pinsrb $8, %ecx, %xmm2
-; SSE41-NEXT: pextrb $9, %xmm1, %edx
-; SSE41-NEXT: pextrb $9, %xmm0, %ecx
-; SSE41-NEXT: subb %dl, %cl
-; SSE41-NEXT: movl $0, %edx
-; SSE41-NEXT: jb .LBB16_20
-; SSE41-NEXT: # %bb.19:
-; SSE41-NEXT: movl %ecx, %edx
-; SSE41-NEXT: .LBB16_20:
-; SSE41-NEXT: movzbl %dl, %ecx
-; SSE41-NEXT: pinsrb $9, %ecx, %xmm2
-; SSE41-NEXT: pextrb $10, %xmm1, %edx
-; SSE41-NEXT: pextrb $10, %xmm0, %ecx
-; SSE41-NEXT: subb %dl, %cl
-; SSE41-NEXT: movl $0, %edx
-; SSE41-NEXT: jb .LBB16_22
-; SSE41-NEXT: # %bb.21:
-; SSE41-NEXT: movl %ecx, %edx
-; SSE41-NEXT: .LBB16_22:
-; SSE41-NEXT: movzbl %dl, %ecx
-; SSE41-NEXT: pinsrb $10, %ecx, %xmm2
-; SSE41-NEXT: pextrb $11, %xmm1, %edx
-; SSE41-NEXT: pextrb $11, %xmm0, %ecx
-; SSE41-NEXT: subb %dl, %cl
-; SSE41-NEXT: movl $0, %edx
-; SSE41-NEXT: jb .LBB16_24
-; SSE41-NEXT: # %bb.23:
-; SSE41-NEXT: movl %ecx, %edx
-; SSE41-NEXT: .LBB16_24:
-; SSE41-NEXT: movzbl %dl, %ecx
-; SSE41-NEXT: pinsrb $11, %ecx, %xmm2
-; SSE41-NEXT: pextrb $12, %xmm1, %edx
-; SSE41-NEXT: pextrb $12, %xmm0, %ecx
-; SSE41-NEXT: subb %dl, %cl
-; SSE41-NEXT: movl $0, %edx
-; SSE41-NEXT: jb .LBB16_26
-; SSE41-NEXT: # %bb.25:
-; SSE41-NEXT: movl %ecx, %edx
-; SSE41-NEXT: .LBB16_26:
-; SSE41-NEXT: movzbl %dl, %ecx
-; SSE41-NEXT: pinsrb $12, %ecx, %xmm2
-; SSE41-NEXT: pextrb $13, %xmm1, %edx
-; SSE41-NEXT: pextrb $13, %xmm0, %ecx
-; SSE41-NEXT: subb %dl, %cl
-; SSE41-NEXT: movl $0, %edx
-; SSE41-NEXT: jb .LBB16_28
-; SSE41-NEXT: # %bb.27:
-; SSE41-NEXT: movl %ecx, %edx
-; SSE41-NEXT: .LBB16_28:
-; SSE41-NEXT: movzbl %dl, %ecx
-; SSE41-NEXT: pinsrb $13, %ecx, %xmm2
-; SSE41-NEXT: pextrb $14, %xmm1, %edx
-; SSE41-NEXT: pextrb $14, %xmm0, %ecx
-; SSE41-NEXT: subb %dl, %cl
-; SSE41-NEXT: movl $0, %edx
-; SSE41-NEXT: jb .LBB16_30
-; SSE41-NEXT: # %bb.29:
-; SSE41-NEXT: movl %ecx, %edx
-; SSE41-NEXT: .LBB16_30:
-; SSE41-NEXT: movzbl %dl, %ecx
-; SSE41-NEXT: pinsrb $14, %ecx, %xmm2
-; SSE41-NEXT: pextrb $15, %xmm1, %edx
-; SSE41-NEXT: pextrb $15, %xmm0, %ecx
-; SSE41-NEXT: subb %dl, %cl
-; SSE41-NEXT: jb .LBB16_32
-; SSE41-NEXT: # %bb.31:
-; SSE41-NEXT: movl %ecx, %eax
-; SSE41-NEXT: .LBB16_32:
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $15, %eax, %xmm2
-; SSE41-NEXT: psrlw $7, %xmm2
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: v16i1:
+; SSE: # %bb.0:
+; SSE-NEXT: psllw $7, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: psllw $7, %xmm0
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: psubusb %xmm1, %xmm0
+; SSE-NEXT: psrlw $7, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
;
; AVX1-LABEL: v16i1:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpextrb $1, %xmm1, %edx
; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $1, %xmm0, %ecx
-; AVX1-NEXT: xorl %eax, %eax
-; AVX1-NEXT: subb %dl, %cl
-; AVX1-NEXT: movl $0, %edx
-; AVX1-NEXT: jb .LBB16_2
-; AVX1-NEXT: # %bb.1:
-; AVX1-NEXT: movl %ecx, %edx
-; AVX1-NEXT: .LBB16_2:
-; AVX1-NEXT: movzbl %dl, %ecx
-; AVX1-NEXT: vpextrb $0, %xmm1, %esi
-; AVX1-NEXT: vpextrb $0, %xmm0, %edx
-; AVX1-NEXT: subb %sil, %dl
-; AVX1-NEXT: movl $0, %esi
-; AVX1-NEXT: jb .LBB16_4
-; AVX1-NEXT: # %bb.3:
-; AVX1-NEXT: movl %edx, %esi
-; AVX1-NEXT: .LBB16_4:
-; AVX1-NEXT: movzbl %sil, %edx
-; AVX1-NEXT: vmovd %edx, %xmm2
-; AVX1-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $2, %xmm1, %edx
-; AVX1-NEXT: vpextrb $2, %xmm0, %ecx
-; AVX1-NEXT: subb %dl, %cl
-; AVX1-NEXT: movl $0, %edx
-; AVX1-NEXT: jb .LBB16_6
-; AVX1-NEXT: # %bb.5:
-; AVX1-NEXT: movl %ecx, %edx
-; AVX1-NEXT: .LBB16_6:
-; AVX1-NEXT: movzbl %dl, %ecx
-; AVX1-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $3, %xmm1, %edx
-; AVX1-NEXT: vpextrb $3, %xmm0, %ecx
-; AVX1-NEXT: subb %dl, %cl
-; AVX1-NEXT: movl $0, %edx
-; AVX1-NEXT: jb .LBB16_8
-; AVX1-NEXT: # %bb.7:
-; AVX1-NEXT: movl %ecx, %edx
-; AVX1-NEXT: .LBB16_8:
-; AVX1-NEXT: movzbl %dl, %ecx
-; AVX1-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $4, %xmm1, %edx
-; AVX1-NEXT: vpextrb $4, %xmm0, %ecx
-; AVX1-NEXT: subb %dl, %cl
-; AVX1-NEXT: movl $0, %edx
-; AVX1-NEXT: jb .LBB16_10
-; AVX1-NEXT: # %bb.9:
-; AVX1-NEXT: movl %ecx, %edx
-; AVX1-NEXT: .LBB16_10:
-; AVX1-NEXT: movzbl %dl, %ecx
-; AVX1-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $5, %xmm1, %edx
-; AVX1-NEXT: vpextrb $5, %xmm0, %ecx
-; AVX1-NEXT: subb %dl, %cl
-; AVX1-NEXT: movl $0, %edx
-; AVX1-NEXT: jb .LBB16_12
-; AVX1-NEXT: # %bb.11:
-; AVX1-NEXT: movl %ecx, %edx
-; AVX1-NEXT: .LBB16_12:
-; AVX1-NEXT: movzbl %dl, %ecx
-; AVX1-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $6, %xmm1, %edx
-; AVX1-NEXT: vpextrb $6, %xmm0, %ecx
-; AVX1-NEXT: subb %dl, %cl
-; AVX1-NEXT: movl $0, %edx
-; AVX1-NEXT: jb .LBB16_14
-; AVX1-NEXT: # %bb.13:
-; AVX1-NEXT: movl %ecx, %edx
-; AVX1-NEXT: .LBB16_14:
-; AVX1-NEXT: movzbl %dl, %ecx
-; AVX1-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $7, %xmm1, %edx
-; AVX1-NEXT: vpextrb $7, %xmm0, %ecx
-; AVX1-NEXT: subb %dl, %cl
-; AVX1-NEXT: movl $0, %edx
-; AVX1-NEXT: jb .LBB16_16
-; AVX1-NEXT: # %bb.15:
-; AVX1-NEXT: movl %ecx, %edx
-; AVX1-NEXT: .LBB16_16:
-; AVX1-NEXT: movzbl %dl, %ecx
-; AVX1-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $8, %xmm1, %edx
-; AVX1-NEXT: vpextrb $8, %xmm0, %ecx
-; AVX1-NEXT: subb %dl, %cl
-; AVX1-NEXT: movl $0, %edx
-; AVX1-NEXT: jb .LBB16_18
-; AVX1-NEXT: # %bb.17:
-; AVX1-NEXT: movl %ecx, %edx
-; AVX1-NEXT: .LBB16_18:
-; AVX1-NEXT: movzbl %dl, %ecx
-; AVX1-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $9, %xmm1, %edx
-; AVX1-NEXT: vpextrb $9, %xmm0, %ecx
-; AVX1-NEXT: subb %dl, %cl
-; AVX1-NEXT: movl $0, %edx
-; AVX1-NEXT: jb .LBB16_20
-; AVX1-NEXT: # %bb.19:
-; AVX1-NEXT: movl %ecx, %edx
-; AVX1-NEXT: .LBB16_20:
-; AVX1-NEXT: movzbl %dl, %ecx
-; AVX1-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $10, %xmm1, %edx
-; AVX1-NEXT: vpextrb $10, %xmm0, %ecx
-; AVX1-NEXT: subb %dl, %cl
-; AVX1-NEXT: movl $0, %edx
-; AVX1-NEXT: jb .LBB16_22
-; AVX1-NEXT: # %bb.21:
-; AVX1-NEXT: movl %ecx, %edx
-; AVX1-NEXT: .LBB16_22:
-; AVX1-NEXT: movzbl %dl, %ecx
-; AVX1-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $11, %xmm1, %edx
-; AVX1-NEXT: vpextrb $11, %xmm0, %ecx
-; AVX1-NEXT: subb %dl, %cl
-; AVX1-NEXT: movl $0, %edx
-; AVX1-NEXT: jb .LBB16_24
-; AVX1-NEXT: # %bb.23:
-; AVX1-NEXT: movl %ecx, %edx
-; AVX1-NEXT: .LBB16_24:
-; AVX1-NEXT: movzbl %dl, %ecx
-; AVX1-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $12, %xmm1, %edx
-; AVX1-NEXT: vpextrb $12, %xmm0, %ecx
-; AVX1-NEXT: subb %dl, %cl
-; AVX1-NEXT: movl $0, %edx
-; AVX1-NEXT: jb .LBB16_26
-; AVX1-NEXT: # %bb.25:
-; AVX1-NEXT: movl %ecx, %edx
-; AVX1-NEXT: .LBB16_26:
-; AVX1-NEXT: movzbl %dl, %ecx
-; AVX1-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $13, %xmm1, %edx
-; AVX1-NEXT: vpextrb $13, %xmm0, %ecx
-; AVX1-NEXT: subb %dl, %cl
-; AVX1-NEXT: movl $0, %edx
-; AVX1-NEXT: jb .LBB16_28
-; AVX1-NEXT: # %bb.27:
-; AVX1-NEXT: movl %ecx, %edx
-; AVX1-NEXT: .LBB16_28:
-; AVX1-NEXT: movzbl %dl, %ecx
-; AVX1-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $14, %xmm1, %edx
-; AVX1-NEXT: vpextrb $14, %xmm0, %ecx
-; AVX1-NEXT: subb %dl, %cl
-; AVX1-NEXT: movl $0, %edx
-; AVX1-NEXT: jb .LBB16_30
-; AVX1-NEXT: # %bb.29:
-; AVX1-NEXT: movl %ecx, %edx
-; AVX1-NEXT: .LBB16_30:
-; AVX1-NEXT: movzbl %dl, %ecx
-; AVX1-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $15, %xmm1, %edx
-; AVX1-NEXT: vpextrb $15, %xmm0, %ecx
-; AVX1-NEXT: subb %dl, %cl
-; AVX1-NEXT: jb .LBB16_32
-; AVX1-NEXT: # %bb.31:
-; AVX1-NEXT: movl %ecx, %eax
-; AVX1-NEXT: .LBB16_32:
-; AVX1-NEXT: movzbl %al, %eax
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0
+; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: retq
@@ -12822,483 +602,21 @@
; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpextrb $1, %xmm1, %edx
; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $1, %xmm0, %ecx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: subb %dl, %cl
-; AVX2-NEXT: movl $0, %edx
-; AVX2-NEXT: jb .LBB16_2
-; AVX2-NEXT: # %bb.1:
-; AVX2-NEXT: movl %ecx, %edx
-; AVX2-NEXT: .LBB16_2:
-; AVX2-NEXT: movzbl %dl, %ecx
-; AVX2-NEXT: vpextrb $0, %xmm1, %esi
-; AVX2-NEXT: vpextrb $0, %xmm0, %edx
-; AVX2-NEXT: subb %sil, %dl
-; AVX2-NEXT: movl $0, %esi
-; AVX2-NEXT: jb .LBB16_4
-; AVX2-NEXT: # %bb.3:
-; AVX2-NEXT: movl %edx, %esi
-; AVX2-NEXT: .LBB16_4:
-; AVX2-NEXT: movzbl %sil, %edx
-; AVX2-NEXT: vmovd %edx, %xmm2
-; AVX2-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $2, %xmm1, %edx
-; AVX2-NEXT: vpextrb $2, %xmm0, %ecx
-; AVX2-NEXT: subb %dl, %cl
-; AVX2-NEXT: movl $0, %edx
-; AVX2-NEXT: jb .LBB16_6
-; AVX2-NEXT: # %bb.5:
-; AVX2-NEXT: movl %ecx, %edx
-; AVX2-NEXT: .LBB16_6:
-; AVX2-NEXT: movzbl %dl, %ecx
-; AVX2-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $3, %xmm1, %edx
-; AVX2-NEXT: vpextrb $3, %xmm0, %ecx
-; AVX2-NEXT: subb %dl, %cl
-; AVX2-NEXT: movl $0, %edx
-; AVX2-NEXT: jb .LBB16_8
-; AVX2-NEXT: # %bb.7:
-; AVX2-NEXT: movl %ecx, %edx
-; AVX2-NEXT: .LBB16_8:
-; AVX2-NEXT: movzbl %dl, %ecx
-; AVX2-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $4, %xmm1, %edx
-; AVX2-NEXT: vpextrb $4, %xmm0, %ecx
-; AVX2-NEXT: subb %dl, %cl
-; AVX2-NEXT: movl $0, %edx
-; AVX2-NEXT: jb .LBB16_10
-; AVX2-NEXT: # %bb.9:
-; AVX2-NEXT: movl %ecx, %edx
-; AVX2-NEXT: .LBB16_10:
-; AVX2-NEXT: movzbl %dl, %ecx
-; AVX2-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $5, %xmm1, %edx
-; AVX2-NEXT: vpextrb $5, %xmm0, %ecx
-; AVX2-NEXT: subb %dl, %cl
-; AVX2-NEXT: movl $0, %edx
-; AVX2-NEXT: jb .LBB16_12
-; AVX2-NEXT: # %bb.11:
-; AVX2-NEXT: movl %ecx, %edx
-; AVX2-NEXT: .LBB16_12:
-; AVX2-NEXT: movzbl %dl, %ecx
-; AVX2-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $6, %xmm1, %edx
-; AVX2-NEXT: vpextrb $6, %xmm0, %ecx
-; AVX2-NEXT: subb %dl, %cl
-; AVX2-NEXT: movl $0, %edx
-; AVX2-NEXT: jb .LBB16_14
-; AVX2-NEXT: # %bb.13:
-; AVX2-NEXT: movl %ecx, %edx
-; AVX2-NEXT: .LBB16_14:
-; AVX2-NEXT: movzbl %dl, %ecx
-; AVX2-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $7, %xmm1, %edx
-; AVX2-NEXT: vpextrb $7, %xmm0, %ecx
-; AVX2-NEXT: subb %dl, %cl
-; AVX2-NEXT: movl $0, %edx
-; AVX2-NEXT: jb .LBB16_16
-; AVX2-NEXT: # %bb.15:
-; AVX2-NEXT: movl %ecx, %edx
-; AVX2-NEXT: .LBB16_16:
-; AVX2-NEXT: movzbl %dl, %ecx
-; AVX2-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $8, %xmm1, %edx
-; AVX2-NEXT: vpextrb $8, %xmm0, %ecx
-; AVX2-NEXT: subb %dl, %cl
-; AVX2-NEXT: movl $0, %edx
-; AVX2-NEXT: jb .LBB16_18
-; AVX2-NEXT: # %bb.17:
-; AVX2-NEXT: movl %ecx, %edx
-; AVX2-NEXT: .LBB16_18:
-; AVX2-NEXT: movzbl %dl, %ecx
-; AVX2-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $9, %xmm1, %edx
-; AVX2-NEXT: vpextrb $9, %xmm0, %ecx
-; AVX2-NEXT: subb %dl, %cl
-; AVX2-NEXT: movl $0, %edx
-; AVX2-NEXT: jb .LBB16_20
-; AVX2-NEXT: # %bb.19:
-; AVX2-NEXT: movl %ecx, %edx
-; AVX2-NEXT: .LBB16_20:
-; AVX2-NEXT: movzbl %dl, %ecx
-; AVX2-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $10, %xmm1, %edx
-; AVX2-NEXT: vpextrb $10, %xmm0, %ecx
-; AVX2-NEXT: subb %dl, %cl
-; AVX2-NEXT: movl $0, %edx
-; AVX2-NEXT: jb .LBB16_22
-; AVX2-NEXT: # %bb.21:
-; AVX2-NEXT: movl %ecx, %edx
-; AVX2-NEXT: .LBB16_22:
-; AVX2-NEXT: movzbl %dl, %ecx
-; AVX2-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $11, %xmm1, %edx
-; AVX2-NEXT: vpextrb $11, %xmm0, %ecx
-; AVX2-NEXT: subb %dl, %cl
-; AVX2-NEXT: movl $0, %edx
-; AVX2-NEXT: jb .LBB16_24
-; AVX2-NEXT: # %bb.23:
-; AVX2-NEXT: movl %ecx, %edx
-; AVX2-NEXT: .LBB16_24:
-; AVX2-NEXT: movzbl %dl, %ecx
-; AVX2-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $12, %xmm1, %edx
-; AVX2-NEXT: vpextrb $12, %xmm0, %ecx
-; AVX2-NEXT: subb %dl, %cl
-; AVX2-NEXT: movl $0, %edx
-; AVX2-NEXT: jb .LBB16_26
-; AVX2-NEXT: # %bb.25:
-; AVX2-NEXT: movl %ecx, %edx
-; AVX2-NEXT: .LBB16_26:
-; AVX2-NEXT: movzbl %dl, %ecx
-; AVX2-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $13, %xmm1, %edx
-; AVX2-NEXT: vpextrb $13, %xmm0, %ecx
-; AVX2-NEXT: subb %dl, %cl
-; AVX2-NEXT: movl $0, %edx
-; AVX2-NEXT: jb .LBB16_28
-; AVX2-NEXT: # %bb.27:
-; AVX2-NEXT: movl %ecx, %edx
-; AVX2-NEXT: .LBB16_28:
-; AVX2-NEXT: movzbl %dl, %ecx
-; AVX2-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $14, %xmm1, %edx
-; AVX2-NEXT: vpextrb $14, %xmm0, %ecx
-; AVX2-NEXT: subb %dl, %cl
-; AVX2-NEXT: movl $0, %edx
-; AVX2-NEXT: jb .LBB16_30
-; AVX2-NEXT: # %bb.29:
-; AVX2-NEXT: movl %ecx, %edx
-; AVX2-NEXT: .LBB16_30:
-; AVX2-NEXT: movzbl %dl, %ecx
-; AVX2-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $15, %xmm1, %edx
-; AVX2-NEXT: vpextrb $15, %xmm0, %ecx
-; AVX2-NEXT: subb %dl, %cl
-; AVX2-NEXT: jb .LBB16_32
-; AVX2-NEXT: # %bb.31:
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: .LBB16_32:
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0
+; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: v16i1:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX512-NEXT: vpmovb2m %xmm0, %k0
; AVX512-NEXT: vpsllw $7, %xmm1, %xmm0
; AVX512-NEXT: vpmovb2m %xmm0, %k1
-; AVX512-NEXT: kshiftrw $4, %k0, %k2
-; AVX512-NEXT: kshiftrw $4, %k1, %k3
-; AVX512-NEXT: kshiftrw $3, %k0, %k4
-; AVX512-NEXT: kmovd %k4, %r15d
-; AVX512-NEXT: kshiftrw $3, %k1, %k4
-; AVX512-NEXT: kmovd %k4, %r9d
-; AVX512-NEXT: kshiftrw $2, %k0, %k4
-; AVX512-NEXT: kmovd %k4, %eax
-; AVX512-NEXT: kshiftrw $2, %k1, %k4
-; AVX512-NEXT: kmovd %k4, %ebp
-; AVX512-NEXT: kmovd %k0, %ecx
-; AVX512-NEXT: kmovd %k1, %esi
-; AVX512-NEXT: kshiftrw $1, %k0, %k4
-; AVX512-NEXT: kmovd %k4, %edi
-; AVX512-NEXT: kshiftrw $1, %k1, %k4
-; AVX512-NEXT: kmovd %k4, %edx
-; AVX512-NEXT: shlb $7, %dl
-; AVX512-NEXT: shlb $7, %dil
-; AVX512-NEXT: xorl %r8d, %r8d
-; AVX512-NEXT: subb %dl, %dil
-; AVX512-NEXT: movl $0, %ebx
-; AVX512-NEXT: jb .LBB16_2
-; AVX512-NEXT: # %bb.1:
-; AVX512-NEXT: movl %edi, %ebx
-; AVX512-NEXT: .LBB16_2:
-; AVX512-NEXT: kshiftrw $5, %k0, %k4
-; AVX512-NEXT: kshiftrw $5, %k1, %k5
-; AVX512-NEXT: kmovd %k2, %edi
-; AVX512-NEXT: kmovd %k3, %r11d
-; AVX512-NEXT: shrb $7, %bl
-; AVX512-NEXT: kmovd %ebx, %k6
-; AVX512-NEXT: shlb $7, %sil
-; AVX512-NEXT: shlb $7, %cl
-; AVX512-NEXT: subb %sil, %cl
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: jb .LBB16_4
-; AVX512-NEXT: # %bb.3:
-; AVX512-NEXT: movl %ecx, %edx
-; AVX512-NEXT: .LBB16_4:
-; AVX512-NEXT: kshiftrw $6, %k0, %k2
-; AVX512-NEXT: kshiftrw $6, %k1, %k3
-; AVX512-NEXT: kmovd %k4, %esi
-; AVX512-NEXT: kmovd %k5, %r14d
-; AVX512-NEXT: shrb $7, %dl
-; AVX512-NEXT: kmovd %edx, %k4
-; AVX512-NEXT: kshiftrw $1, %k4, %k5
-; AVX512-NEXT: kxorw %k6, %k5, %k5
-; AVX512-NEXT: kshiftlw $15, %k5, %k5
-; AVX512-NEXT: kshiftrw $14, %k5, %k5
-; AVX512-NEXT: kxorw %k5, %k4, %k6
-; AVX512-NEXT: kshiftrw $2, %k6, %k7
-; AVX512-NEXT: shlb $7, %bpl
-; AVX512-NEXT: shlb $7, %al
-; AVX512-NEXT: subb %bpl, %al
-; AVX512-NEXT: movl $0, %ecx
-; AVX512-NEXT: jb .LBB16_6
-; AVX512-NEXT: # %bb.5:
-; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: .LBB16_6:
-; AVX512-NEXT: kshiftrw $7, %k0, %k4
-; AVX512-NEXT: kshiftrw $7, %k1, %k5
-; AVX512-NEXT: kmovd %k2, %eax
-; AVX512-NEXT: kmovd %k3, %r10d
-; AVX512-NEXT: shrb $7, %cl
-; AVX512-NEXT: kmovd %ecx, %k2
-; AVX512-NEXT: kxorw %k2, %k7, %k2
-; AVX512-NEXT: kshiftlw $15, %k2, %k2
-; AVX512-NEXT: kshiftrw $13, %k2, %k2
-; AVX512-NEXT: kxorw %k2, %k6, %k6
-; AVX512-NEXT: kshiftrw $3, %k6, %k7
-; AVX512-NEXT: shlb $7, %r9b
-; AVX512-NEXT: shlb $7, %r15b
-; AVX512-NEXT: subb %r9b, %r15b
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: jb .LBB16_8
-; AVX512-NEXT: # %bb.7:
-; AVX512-NEXT: movl %r15d, %edx
-; AVX512-NEXT: .LBB16_8:
-; AVX512-NEXT: kshiftrw $8, %k0, %k2
-; AVX512-NEXT: kshiftrw $8, %k1, %k3
-; AVX512-NEXT: kmovd %k4, %ecx
-; AVX512-NEXT: kmovd %k5, %r9d
-; AVX512-NEXT: shrb $7, %dl
-; AVX512-NEXT: kmovd %edx, %k4
-; AVX512-NEXT: kxorw %k4, %k7, %k4
-; AVX512-NEXT: kshiftlw $15, %k4, %k4
-; AVX512-NEXT: kshiftrw $12, %k4, %k4
-; AVX512-NEXT: kxorw %k4, %k6, %k6
-; AVX512-NEXT: kshiftrw $4, %k6, %k7
-; AVX512-NEXT: shlb $7, %r11b
-; AVX512-NEXT: shlb $7, %dil
-; AVX512-NEXT: subb %r11b, %dil
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: jb .LBB16_10
-; AVX512-NEXT: # %bb.9:
-; AVX512-NEXT: movl %edi, %edx
-; AVX512-NEXT: .LBB16_10:
-; AVX512-NEXT: kshiftrw $9, %k0, %k4
-; AVX512-NEXT: kshiftrw $9, %k1, %k5
-; AVX512-NEXT: kmovd %k2, %edi
-; AVX512-NEXT: kmovd %k3, %ebx
-; AVX512-NEXT: shrb $7, %dl
-; AVX512-NEXT: kmovd %edx, %k2
-; AVX512-NEXT: kxorw %k2, %k7, %k2
-; AVX512-NEXT: kshiftlw $15, %k2, %k2
-; AVX512-NEXT: kshiftrw $11, %k2, %k2
-; AVX512-NEXT: kxorw %k2, %k6, %k6
-; AVX512-NEXT: kshiftrw $5, %k6, %k7
-; AVX512-NEXT: shlb $7, %r14b
-; AVX512-NEXT: shlb $7, %sil
-; AVX512-NEXT: subb %r14b, %sil
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: jb .LBB16_12
-; AVX512-NEXT: # %bb.11:
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: .LBB16_12:
-; AVX512-NEXT: kshiftrw $10, %k0, %k2
-; AVX512-NEXT: kshiftrw $10, %k1, %k3
-; AVX512-NEXT: kmovd %k4, %esi
-; AVX512-NEXT: kmovd %k5, %r11d
-; AVX512-NEXT: shrb $7, %dl
-; AVX512-NEXT: kmovd %edx, %k4
-; AVX512-NEXT: kxorw %k4, %k7, %k4
-; AVX512-NEXT: kshiftlw $15, %k4, %k4
-; AVX512-NEXT: kshiftrw $10, %k4, %k4
-; AVX512-NEXT: kxorw %k4, %k6, %k6
-; AVX512-NEXT: kshiftrw $6, %k6, %k7
-; AVX512-NEXT: shlb $7, %r10b
-; AVX512-NEXT: shlb $7, %al
-; AVX512-NEXT: subb %r10b, %al
-; AVX512-NEXT: movl $0, %ebp
-; AVX512-NEXT: jb .LBB16_14
-; AVX512-NEXT: # %bb.13:
-; AVX512-NEXT: movl %eax, %ebp
-; AVX512-NEXT: .LBB16_14:
-; AVX512-NEXT: kshiftrw $11, %k0, %k4
-; AVX512-NEXT: kshiftrw $11, %k1, %k5
-; AVX512-NEXT: kmovd %k2, %r15d
-; AVX512-NEXT: kmovd %k3, %r10d
-; AVX512-NEXT: shrb $7, %bpl
-; AVX512-NEXT: kmovd %ebp, %k2
-; AVX512-NEXT: kxorw %k2, %k7, %k2
-; AVX512-NEXT: kshiftlw $15, %k2, %k2
-; AVX512-NEXT: kshiftrw $9, %k2, %k2
-; AVX512-NEXT: kxorw %k2, %k6, %k6
-; AVX512-NEXT: kshiftrw $7, %k6, %k7
-; AVX512-NEXT: shlb $7, %r9b
-; AVX512-NEXT: shlb $7, %cl
-; AVX512-NEXT: subb %r9b, %cl
-; AVX512-NEXT: movl $0, %eax
-; AVX512-NEXT: jb .LBB16_16
-; AVX512-NEXT: # %bb.15:
-; AVX512-NEXT: movl %ecx, %eax
-; AVX512-NEXT: .LBB16_16:
-; AVX512-NEXT: kshiftrw $12, %k0, %k2
-; AVX512-NEXT: kshiftrw $12, %k1, %k3
-; AVX512-NEXT: kmovd %k4, %ecx
-; AVX512-NEXT: kmovd %k5, %r9d
-; AVX512-NEXT: shrb $7, %al
-; AVX512-NEXT: kmovd %eax, %k4
-; AVX512-NEXT: kxorw %k4, %k7, %k4
-; AVX512-NEXT: kshiftlw $15, %k4, %k4
-; AVX512-NEXT: kshiftrw $8, %k4, %k4
-; AVX512-NEXT: kxorw %k4, %k6, %k6
-; AVX512-NEXT: kshiftrw $8, %k6, %k7
-; AVX512-NEXT: shlb $7, %bl
-; AVX512-NEXT: shlb $7, %dil
-; AVX512-NEXT: subb %bl, %dil
-; AVX512-NEXT: movl $0, %ebx
-; AVX512-NEXT: jb .LBB16_18
-; AVX512-NEXT: # %bb.17:
-; AVX512-NEXT: movl %edi, %ebx
-; AVX512-NEXT: .LBB16_18:
-; AVX512-NEXT: kshiftrw $13, %k0, %k4
-; AVX512-NEXT: kshiftrw $13, %k1, %k5
-; AVX512-NEXT: kmovd %k2, %eax
-; AVX512-NEXT: kmovd %k3, %r14d
-; AVX512-NEXT: shrb $7, %bl
-; AVX512-NEXT: kmovd %ebx, %k2
-; AVX512-NEXT: kxorw %k2, %k7, %k2
-; AVX512-NEXT: kshiftlw $15, %k2, %k2
-; AVX512-NEXT: kshiftrw $7, %k2, %k2
-; AVX512-NEXT: kxorw %k2, %k6, %k6
-; AVX512-NEXT: kshiftrw $9, %k6, %k7
-; AVX512-NEXT: shlb $7, %r11b
-; AVX512-NEXT: shlb $7, %sil
-; AVX512-NEXT: subb %r11b, %sil
-; AVX512-NEXT: movl $0, %edi
-; AVX512-NEXT: jb .LBB16_20
-; AVX512-NEXT: # %bb.19:
-; AVX512-NEXT: movl %esi, %edi
-; AVX512-NEXT: .LBB16_20:
-; AVX512-NEXT: kshiftrw $14, %k0, %k2
-; AVX512-NEXT: kshiftrw $14, %k1, %k3
-; AVX512-NEXT: kmovd %k4, %esi
-; AVX512-NEXT: kmovd %k5, %r11d
-; AVX512-NEXT: shrb $7, %dil
-; AVX512-NEXT: kmovd %edi, %k4
-; AVX512-NEXT: kxorw %k4, %k7, %k4
-; AVX512-NEXT: kshiftlw $15, %k4, %k4
-; AVX512-NEXT: kshiftrw $6, %k4, %k4
-; AVX512-NEXT: kxorw %k4, %k6, %k4
-; AVX512-NEXT: kshiftrw $10, %k4, %k5
-; AVX512-NEXT: shlb $7, %r10b
-; AVX512-NEXT: shlb $7, %r15b
-; AVX512-NEXT: subb %r10b, %r15b
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: jb .LBB16_22
-; AVX512-NEXT: # %bb.21:
-; AVX512-NEXT: movl %r15d, %edx
-; AVX512-NEXT: .LBB16_22:
-; AVX512-NEXT: kshiftrw $15, %k0, %k0
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovd %k2, %ebx
-; AVX512-NEXT: kmovd %k3, %edi
-; AVX512-NEXT: shrb $7, %dl
-; AVX512-NEXT: kmovd %edx, %k2
-; AVX512-NEXT: kxorw %k2, %k5, %k2
-; AVX512-NEXT: kshiftlw $15, %k2, %k2
-; AVX512-NEXT: kshiftrw $5, %k2, %k2
-; AVX512-NEXT: kxorw %k2, %k4, %k2
-; AVX512-NEXT: kshiftrw $11, %k2, %k3
-; AVX512-NEXT: shlb $7, %r9b
-; AVX512-NEXT: shlb $7, %cl
-; AVX512-NEXT: subb %r9b, %cl
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: jb .LBB16_24
-; AVX512-NEXT: # %bb.23:
-; AVX512-NEXT: movl %ecx, %edx
-; AVX512-NEXT: .LBB16_24:
-; AVX512-NEXT: kmovd %k0, %ecx
-; AVX512-NEXT: kmovd %k1, %ebp
-; AVX512-NEXT: shrb $7, %dl
-; AVX512-NEXT: kmovd %edx, %k0
-; AVX512-NEXT: kxorw %k0, %k3, %k0
-; AVX512-NEXT: kshiftlw $15, %k0, %k0
-; AVX512-NEXT: kshiftrw $4, %k0, %k0
-; AVX512-NEXT: kxorw %k0, %k2, %k0
-; AVX512-NEXT: kshiftrw $12, %k0, %k1
-; AVX512-NEXT: shlb $7, %r14b
-; AVX512-NEXT: shlb $7, %al
-; AVX512-NEXT: subb %r14b, %al
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: jb .LBB16_26
-; AVX512-NEXT: # %bb.25:
-; AVX512-NEXT: movl %eax, %edx
-; AVX512-NEXT: .LBB16_26:
-; AVX512-NEXT: shrb $7, %dl
-; AVX512-NEXT: kmovd %edx, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kshiftlw $15, %k1, %k1
-; AVX512-NEXT: kshiftrw $3, %k1, %k1
-; AVX512-NEXT: kxorw %k1, %k0, %k0
-; AVX512-NEXT: kshiftrw $13, %k0, %k1
-; AVX512-NEXT: shlb $7, %r11b
-; AVX512-NEXT: shlb $7, %sil
-; AVX512-NEXT: subb %r11b, %sil
-; AVX512-NEXT: movl $0, %eax
-; AVX512-NEXT: jb .LBB16_28
-; AVX512-NEXT: # %bb.27:
-; AVX512-NEXT: movl %esi, %eax
-; AVX512-NEXT: .LBB16_28:
-; AVX512-NEXT: shrb $7, %al
-; AVX512-NEXT: kmovd %eax, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kshiftlw $15, %k1, %k1
-; AVX512-NEXT: kshiftrw $2, %k1, %k1
-; AVX512-NEXT: kxorw %k1, %k0, %k0
-; AVX512-NEXT: kshiftrw $14, %k0, %k1
-; AVX512-NEXT: shlb $7, %dil
-; AVX512-NEXT: shlb $7, %bl
-; AVX512-NEXT: subb %dil, %bl
-; AVX512-NEXT: movl $0, %eax
-; AVX512-NEXT: jb .LBB16_30
-; AVX512-NEXT: # %bb.29:
-; AVX512-NEXT: movl %ebx, %eax
-; AVX512-NEXT: .LBB16_30:
-; AVX512-NEXT: shrb $7, %al
-; AVX512-NEXT: kmovd %eax, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kshiftlw $15, %k1, %k1
-; AVX512-NEXT: kshiftrw $1, %k1, %k1
-; AVX512-NEXT: kxorw %k1, %k0, %k0
-; AVX512-NEXT: kshiftlw $1, %k0, %k0
-; AVX512-NEXT: kshiftrw $1, %k0, %k0
-; AVX512-NEXT: shlb $7, %bpl
-; AVX512-NEXT: shlb $7, %cl
-; AVX512-NEXT: subb %bpl, %cl
-; AVX512-NEXT: jb .LBB16_32
-; AVX512-NEXT: # %bb.31:
-; AVX512-NEXT: movl %ecx, %r8d
-; AVX512-NEXT: .LBB16_32:
-; AVX512-NEXT: shrb $7, %r8b
-; AVX512-NEXT: kmovd %r8d, %k1
-; AVX512-NEXT: kshiftlw $15, %k1, %k1
-; AVX512-NEXT: korw %k1, %k0, %k0
+; AVX512-NEXT: kandnw %k0, %k1, %k0
; AVX512-NEXT: vpmovm2b %k0, %xmm0
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
  %z = call <16 x i1> @llvm.usub.sat.v16i1(<16 x i1> %x, <16 x i1> %y)
  ret <16 x i1> %z
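
Note on the single kandnw in the AVX512 output above: with i1 elements there is only one bit of range, so unsigned saturating subtraction can never borrow, and x usub.sat y collapses to the boolean identity x AND (NOT y), which maps directly onto a mask-register AND-NOT (kandnw computes ~src1 & src2, here ~y & x). A minimal LLVM IR sketch of that equivalence, with a hypothetical function name that is not part of this patch:

define <16 x i1> @usubsat_v16i1_as_andnot(<16 x i1> %x, <16 x i1> %y) {
  ; NOT %y via xor with all-ones, then AND with %x:
  ; for i1 elements, x usub.sat y == x & ~y
  %noty = xor <16 x i1> %y, <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
  %z = and <16 x i1> %x, %noty
  ret <16 x i1> %z
}

(The dual holds for uadd.sat on i1: 1 + 1 saturates to 1, so it degenerates to a plain OR.)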